12025cf9eSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only 2f6bed0efSShaohua Li /* 3f6bed0efSShaohua Li * Copyright (C) 2015 Shaohua Li <shli@fb.com> 4b4c625c6SSong Liu * Copyright (C) 2016 Song Liu <songliubraving@fb.com> 5f6bed0efSShaohua Li */ 6f6bed0efSShaohua Li #include <linux/kernel.h> 7f6bed0efSShaohua Li #include <linux/wait.h> 8f6bed0efSShaohua Li #include <linux/blkdev.h> 9f6bed0efSShaohua Li #include <linux/slab.h> 10f6bed0efSShaohua Li #include <linux/raid/md_p.h> 115cb2fbd6SShaohua Li #include <linux/crc32c.h> 12f6bed0efSShaohua Li #include <linux/random.h> 13ce1ccd07SShaohua Li #include <linux/kthread.h> 1403b047f4SSong Liu #include <linux/types.h> 15f6bed0efSShaohua Li #include "md.h" 16f6bed0efSShaohua Li #include "raid5.h" 17935fe098SMike Snitzer #include "md-bitmap.h" 1870d466f7SSong Liu #include "raid5-log.h" 19f6bed0efSShaohua Li 20f6bed0efSShaohua Li /* 21f6bed0efSShaohua Li * metadata/data stored in disk with 4k size unit (a block) regardless 22f6bed0efSShaohua Li * underneath hardware sector size. only works with PAGE_SIZE == 4096 23f6bed0efSShaohua Li */ 24f6bed0efSShaohua Li #define BLOCK_SECTORS (8) 25effe6ee7SSong Liu #define BLOCK_SECTOR_SHIFT (3) 26f6bed0efSShaohua Li 270576b1c6SShaohua Li /* 28a39f7afdSSong Liu * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 29a39f7afdSSong Liu * 30a39f7afdSSong Liu * In write through mode, the reclaim runs every log->max_free_space. 
31a39f7afdSSong Liu * This can prevent the recovery scans for too long 320576b1c6SShaohua Li */ 330576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 340576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 350576b1c6SShaohua Li 36a39f7afdSSong Liu /* wake up reclaim thread periodically */ 37a39f7afdSSong Liu #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) 38a39f7afdSSong Liu /* start flush with these full stripes */ 3984890c03SShaohua Li #define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4) 40a39f7afdSSong Liu /* reclaim stripes in groups */ 41a39f7afdSSong Liu #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) 42a39f7afdSSong Liu 43c38d29b3SChristoph Hellwig /* 44c38d29b3SChristoph Hellwig * We only need 2 bios per I/O unit to make progress, but ensure we 45c38d29b3SChristoph Hellwig * have a few more available to not get too tight. 46c38d29b3SChristoph Hellwig */ 47c38d29b3SChristoph Hellwig #define R5L_POOL_SIZE 4 48c38d29b3SChristoph Hellwig 492c7da14bSSong Liu static char *r5c_journal_mode_str[] = {"write-through", 502c7da14bSSong Liu "write-back"}; 512ded3703SSong Liu /* 522ded3703SSong Liu * raid5 cache state machine 532ded3703SSong Liu * 549b69173eSJackieLiu * With the RAID cache, each stripe works in two phases: 552ded3703SSong Liu * - caching phase 562ded3703SSong Liu * - writing-out phase 572ded3703SSong Liu * 582ded3703SSong Liu * These two phases are controlled by bit STRIPE_R5C_CACHING: 592ded3703SSong Liu * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase 602ded3703SSong Liu * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase 612ded3703SSong Liu * 622ded3703SSong Liu * When there is no journal, or the journal is in write-through mode, 632ded3703SSong Liu * the stripe is always in writing-out phase. 642ded3703SSong Liu * 652ded3703SSong Liu * For write-back journal, the stripe is sent to caching phase on write 662ded3703SSong Liu * (r5c_try_caching_write). 
r5c_make_stripe_write_out() kicks off 672ded3703SSong Liu * the write-out phase by clearing STRIPE_R5C_CACHING. 682ded3703SSong Liu * 692ded3703SSong Liu * Stripes in caching phase do not write the raid disks. Instead, all 702ded3703SSong Liu * writes are committed from the log device. Therefore, a stripe in 712ded3703SSong Liu * caching phase handles writes as: 722ded3703SSong Liu * - write to log device 732ded3703SSong Liu * - return IO 742ded3703SSong Liu * 752ded3703SSong Liu * Stripes in writing-out phase handle writes as: 762ded3703SSong Liu * - calculate parity 772ded3703SSong Liu * - write pending data and parity to journal 782ded3703SSong Liu * - write data and parity to raid disks 792ded3703SSong Liu * - return IO for pending writes 802ded3703SSong Liu */ 812ded3703SSong Liu 82f6bed0efSShaohua Li struct r5l_log { 83f6bed0efSShaohua Li struct md_rdev *rdev; 84f6bed0efSShaohua Li 85f6bed0efSShaohua Li u32 uuid_checksum; 86f6bed0efSShaohua Li 87f6bed0efSShaohua Li sector_t device_size; /* log device size, round to 88f6bed0efSShaohua Li * BLOCK_SECTORS */ 890576b1c6SShaohua Li sector_t max_free_space; /* reclaim run if free space is at 900576b1c6SShaohua Li * this size */ 91f6bed0efSShaohua Li 92f6bed0efSShaohua Li sector_t last_checkpoint; /* log tail. where recovery scan 93f6bed0efSShaohua Li * starts from */ 94f6bed0efSShaohua Li u64 last_cp_seq; /* log tail sequence */ 95f6bed0efSShaohua Li 96f6bed0efSShaohua Li sector_t log_start; /* log head. 
where new data appends */ 97f6bed0efSShaohua Li u64 seq; /* log head sequence */ 98f6bed0efSShaohua Li 9917036461SChristoph Hellwig sector_t next_checkpoint; 10017036461SChristoph Hellwig 101f6bed0efSShaohua Li struct mutex io_mutex; 102f6bed0efSShaohua Li struct r5l_io_unit *current_io; /* current io_unit accepting new data */ 103f6bed0efSShaohua Li 104f6bed0efSShaohua Li spinlock_t io_list_lock; 105f6bed0efSShaohua Li struct list_head running_ios; /* io_units which are still running, 106f6bed0efSShaohua Li * and have not yet been completely 107f6bed0efSShaohua Li * written to the log */ 108f6bed0efSShaohua Li struct list_head io_end_ios; /* io_units which have been completely 109f6bed0efSShaohua Li * written to the log but not yet written 110f6bed0efSShaohua Li * to the RAID */ 111a8c34f91SShaohua Li struct list_head flushing_ios; /* io_units which are waiting for log 112a8c34f91SShaohua Li * cache flush */ 11304732f74SChristoph Hellwig struct list_head finished_ios; /* io_units which settle down in log disk */ 114a8c34f91SShaohua Li struct bio flush_bio; 115f6bed0efSShaohua Li 1165036c390SChristoph Hellwig struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ 1175036c390SChristoph Hellwig 118f6bed0efSShaohua Li struct kmem_cache *io_kc; 119afeee514SKent Overstreet mempool_t io_pool; 120afeee514SKent Overstreet struct bio_set bs; 121afeee514SKent Overstreet mempool_t meta_pool; 122f6bed0efSShaohua Li 1230576b1c6SShaohua Li struct md_thread *reclaim_thread; 1240576b1c6SShaohua Li unsigned long reclaim_target; /* number of space that need to be 1250576b1c6SShaohua Li * reclaimed. 
if it's 0, reclaim spaces 1260576b1c6SShaohua Li * used by io_units which are in 1270576b1c6SShaohua Li * IO_UNIT_STRIPE_END state (eg, reclaim 1280576b1c6SShaohua Li * dones't wait for specific io_unit 1290576b1c6SShaohua Li * switching to IO_UNIT_STRIPE_END 1300576b1c6SShaohua Li * state) */ 1310fd22b45SShaohua Li wait_queue_head_t iounit_wait; 1320576b1c6SShaohua Li 133f6bed0efSShaohua Li struct list_head no_space_stripes; /* pending stripes, log has no space */ 134f6bed0efSShaohua Li spinlock_t no_space_stripes_lock; 13556fef7c6SChristoph Hellwig 13656fef7c6SChristoph Hellwig bool need_cache_flush; 1372ded3703SSong Liu 1382ded3703SSong Liu /* for r5c_cache */ 1392ded3703SSong Liu enum r5c_journal_mode r5c_journal_mode; 140a39f7afdSSong Liu 141a39f7afdSSong Liu /* all stripes in r5cache, in the order of seq at sh->log_start */ 142a39f7afdSSong Liu struct list_head stripe_in_journal_list; 143a39f7afdSSong Liu 144a39f7afdSSong Liu spinlock_t stripe_in_journal_lock; 145a39f7afdSSong Liu atomic_t stripe_in_journal_count; 1463bddb7f8SSong Liu 1473bddb7f8SSong Liu /* to submit async io_units, to fulfill ordering of flush */ 1483bddb7f8SSong Liu struct work_struct deferred_io_work; 1492e38a37fSSong Liu /* to disable write back during in degraded mode */ 1502e38a37fSSong Liu struct work_struct disable_writeback_work; 15103b047f4SSong Liu 15203b047f4SSong Liu /* to for chunk_aligned_read in writeback mode, details below */ 15303b047f4SSong Liu spinlock_t tree_lock; 15403b047f4SSong Liu struct radix_tree_root big_stripe_tree; 155f6bed0efSShaohua Li }; 156f6bed0efSShaohua Li 157f6bed0efSShaohua Li /* 15803b047f4SSong Liu * Enable chunk_aligned_read() with write back cache. 15903b047f4SSong Liu * 16003b047f4SSong Liu * Each chunk may contain more than one stripe (for example, a 256kB 16103b047f4SSong Liu * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For 16203b047f4SSong Liu * chunk_aligned_read, these stripes are grouped into one "big_stripe". 
16303b047f4SSong Liu * For each big_stripe, we count how many stripes of this big_stripe 16403b047f4SSong Liu * are in the write back cache. These data are tracked in a radix tree 16503b047f4SSong Liu * (big_stripe_tree). We use radix_tree item pointer as the counter. 16603b047f4SSong Liu * r5c_tree_index() is used to calculate keys for the radix tree. 16703b047f4SSong Liu * 16803b047f4SSong Liu * chunk_aligned_read() calls r5c_big_stripe_cached() to look up 16903b047f4SSong Liu * big_stripe of each chunk in the tree. If this big_stripe is in the 17003b047f4SSong Liu * tree, chunk_aligned_read() aborts. This look up is protected by 17103b047f4SSong Liu * rcu_read_lock(). 17203b047f4SSong Liu * 17303b047f4SSong Liu * It is necessary to remember whether a stripe is counted in 17403b047f4SSong Liu * big_stripe_tree. Instead of adding new flag, we reuses existing flags: 17503b047f4SSong Liu * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these 17603b047f4SSong Liu * two flags are set, the stripe is counted in big_stripe_tree. This 17703b047f4SSong Liu * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to 17803b047f4SSong Liu * r5c_try_caching_write(); and moving clear_bit of 17903b047f4SSong Liu * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to 18003b047f4SSong Liu * r5c_finish_stripe_write_out(). 18103b047f4SSong Liu */ 18203b047f4SSong Liu 18303b047f4SSong Liu /* 18403b047f4SSong Liu * radix tree requests lowest 2 bits of data pointer to be 2b'00. 18503b047f4SSong Liu * So it is necessary to left shift the counter by 2 bits before using it 18603b047f4SSong Liu * as data pointer of the tree. 
18703b047f4SSong Liu */ 18803b047f4SSong Liu #define R5C_RADIX_COUNT_SHIFT 2 18903b047f4SSong Liu 19003b047f4SSong Liu /* 19103b047f4SSong Liu * calculate key for big_stripe_tree 19203b047f4SSong Liu * 19303b047f4SSong Liu * sect: align_bi->bi_iter.bi_sector or sh->sector 19403b047f4SSong Liu */ 19503b047f4SSong Liu static inline sector_t r5c_tree_index(struct r5conf *conf, 19603b047f4SSong Liu sector_t sect) 19703b047f4SSong Liu { 198*52923083SDamien Le Moal sector_div(sect, conf->chunk_sectors); 19903b047f4SSong Liu return sect; 20003b047f4SSong Liu } 20103b047f4SSong Liu 20203b047f4SSong Liu /* 203f6bed0efSShaohua Li * an IO range starts from a meta data block and end at the next meta data 204f6bed0efSShaohua Li * block. The io unit's the meta data block tracks data/parity followed it. io 205f6bed0efSShaohua Li * unit is written to log disk with normal write, as we always flush log disk 206f6bed0efSShaohua Li * first and then start move data to raid disks, there is no requirement to 207f6bed0efSShaohua Li * write io unit with FLUSH/FUA 208f6bed0efSShaohua Li */ 209f6bed0efSShaohua Li struct r5l_io_unit { 210f6bed0efSShaohua Li struct r5l_log *log; 211f6bed0efSShaohua Li 212f6bed0efSShaohua Li struct page *meta_page; /* store meta block */ 213f6bed0efSShaohua Li int meta_offset; /* current offset in meta_page */ 214f6bed0efSShaohua Li 215f6bed0efSShaohua Li struct bio *current_bio;/* current_bio accepting new data */ 216f6bed0efSShaohua Li 217f6bed0efSShaohua Li atomic_t pending_stripe;/* how many stripes not flushed to raid */ 218f6bed0efSShaohua Li u64 seq; /* seq number of the metablock */ 219f6bed0efSShaohua Li sector_t log_start; /* where the io_unit starts */ 220f6bed0efSShaohua Li sector_t log_end; /* where the io_unit ends */ 221f6bed0efSShaohua Li struct list_head log_sibling; /* log->running_ios */ 222f6bed0efSShaohua Li struct list_head stripe_list; /* stripes added to the io_unit */ 223f6bed0efSShaohua Li 224f6bed0efSShaohua Li int state; 
2256143e2ceSChristoph Hellwig bool need_split_bio; 2263bddb7f8SSong Liu struct bio *split_bio; 2273bddb7f8SSong Liu 2283bddb7f8SSong Liu unsigned int has_flush:1; /* include flush request */ 2293bddb7f8SSong Liu unsigned int has_fua:1; /* include fua request */ 230a9501d74SSong Liu unsigned int has_null_flush:1; /* include null flush request */ 231a9501d74SSong Liu unsigned int has_flush_payload:1; /* include flush payload */ 2323bddb7f8SSong Liu /* 2333bddb7f8SSong Liu * io isn't sent yet, flush/fua request can only be submitted till it's 2343bddb7f8SSong Liu * the first IO in running_ios list 2353bddb7f8SSong Liu */ 2363bddb7f8SSong Liu unsigned int io_deferred:1; 2373bddb7f8SSong Liu 2383bddb7f8SSong Liu struct bio_list flush_barriers; /* size == 0 flush bios */ 239f6bed0efSShaohua Li }; 240f6bed0efSShaohua Li 241f6bed0efSShaohua Li /* r5l_io_unit state */ 242f6bed0efSShaohua Li enum r5l_io_unit_state { 243f6bed0efSShaohua Li IO_UNIT_RUNNING = 0, /* accepting new IO */ 244f6bed0efSShaohua Li IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 245f6bed0efSShaohua Li * don't accepting new bio */ 246f6bed0efSShaohua Li IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 247a8c34f91SShaohua Li IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 248f6bed0efSShaohua Li }; 249f6bed0efSShaohua Li 2502ded3703SSong Liu bool r5c_is_writeback(struct r5l_log *log) 2512ded3703SSong Liu { 2522ded3703SSong Liu return (log != NULL && 2532ded3703SSong Liu log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 2542ded3703SSong Liu } 2552ded3703SSong Liu 256f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 257f6bed0efSShaohua Li { 258f6bed0efSShaohua Li start += inc; 259f6bed0efSShaohua Li if (start >= log->device_size) 260f6bed0efSShaohua Li start = start - log->device_size; 261f6bed0efSShaohua Li return start; 262f6bed0efSShaohua Li } 263f6bed0efSShaohua Li 264f6bed0efSShaohua Li static sector_t 
r5l_ring_distance(struct r5l_log *log, sector_t start, 265f6bed0efSShaohua Li sector_t end) 266f6bed0efSShaohua Li { 267f6bed0efSShaohua Li if (end >= start) 268f6bed0efSShaohua Li return end - start; 269f6bed0efSShaohua Li else 270f6bed0efSShaohua Li return end + log->device_size - start; 271f6bed0efSShaohua Li } 272f6bed0efSShaohua Li 273f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 274f6bed0efSShaohua Li { 275f6bed0efSShaohua Li sector_t used_size; 276f6bed0efSShaohua Li 277f6bed0efSShaohua Li used_size = r5l_ring_distance(log, log->last_checkpoint, 278f6bed0efSShaohua Li log->log_start); 279f6bed0efSShaohua Li 280f6bed0efSShaohua Li return log->device_size > used_size + size; 281f6bed0efSShaohua Li } 282f6bed0efSShaohua Li 283f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 284f6bed0efSShaohua Li enum r5l_io_unit_state state) 285f6bed0efSShaohua Li { 286f6bed0efSShaohua Li if (WARN_ON(io->state >= state)) 287f6bed0efSShaohua Li return; 288f6bed0efSShaohua Li io->state = state; 289f6bed0efSShaohua Li } 290f6bed0efSShaohua Li 2911e6d690bSSong Liu static void 292bd83d0a2SNeilBrown r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) 2931e6d690bSSong Liu { 2941e6d690bSSong Liu struct bio *wbi, *wbi2; 2951e6d690bSSong Liu 2961e6d690bSSong Liu wbi = dev->written; 2971e6d690bSSong Liu dev->written = NULL; 2981e6d690bSSong Liu while (wbi && wbi->bi_iter.bi_sector < 2991e6d690bSSong Liu dev->sector + STRIPE_SECTORS) { 3001e6d690bSSong Liu wbi2 = r5_next_bio(wbi, dev->sector); 3011e6d690bSSong Liu md_write_end(conf->mddev); 302bd83d0a2SNeilBrown bio_endio(wbi); 3031e6d690bSSong Liu wbi = wbi2; 3041e6d690bSSong Liu } 3051e6d690bSSong Liu } 3061e6d690bSSong Liu 3071e6d690bSSong Liu void r5c_handle_cached_data_endio(struct r5conf *conf, 308bd83d0a2SNeilBrown struct stripe_head *sh, int disks) 3091e6d690bSSong Liu { 3101e6d690bSSong Liu int i; 3111e6d690bSSong Liu 3121e6d690bSSong Liu 
for (i = sh->disks; i--; ) { 3131e6d690bSSong Liu if (sh->dev[i].written) { 3141e6d690bSSong Liu set_bit(R5_UPTODATE, &sh->dev[i].flags); 315bd83d0a2SNeilBrown r5c_return_dev_pending_writes(conf, &sh->dev[i]); 316e64e4018SAndy Shevchenko md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3171e6d690bSSong Liu STRIPE_SECTORS, 3181e6d690bSSong Liu !test_bit(STRIPE_DEGRADED, &sh->state), 3191e6d690bSSong Liu 0); 3201e6d690bSSong Liu } 3211e6d690bSSong Liu } 3221e6d690bSSong Liu } 3231e6d690bSSong Liu 324ff875738SArtur Paszkiewicz void r5l_wake_reclaim(struct r5l_log *log, sector_t space); 325ff875738SArtur Paszkiewicz 326a39f7afdSSong Liu /* Check whether we should flush some stripes to free up stripe cache */ 327a39f7afdSSong Liu void r5c_check_stripe_cache_usage(struct r5conf *conf) 328a39f7afdSSong Liu { 329a39f7afdSSong Liu int total_cached; 330a39f7afdSSong Liu 331a39f7afdSSong Liu if (!r5c_is_writeback(conf->log)) 332a39f7afdSSong Liu return; 333a39f7afdSSong Liu 334a39f7afdSSong Liu total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 335a39f7afdSSong Liu atomic_read(&conf->r5c_cached_full_stripes); 336a39f7afdSSong Liu 337a39f7afdSSong Liu /* 338a39f7afdSSong Liu * The following condition is true for either of the following: 339a39f7afdSSong Liu * - stripe cache pressure high: 340a39f7afdSSong Liu * total_cached > 3/4 min_nr_stripes || 341a39f7afdSSong Liu * empty_inactive_list_nr > 0 342a39f7afdSSong Liu * - stripe cache pressure moderate: 343a39f7afdSSong Liu * total_cached > 1/2 min_nr_stripes 344a39f7afdSSong Liu */ 345a39f7afdSSong Liu if (total_cached > conf->min_nr_stripes * 1 / 2 || 346a39f7afdSSong Liu atomic_read(&conf->empty_inactive_list_nr) > 0) 347a39f7afdSSong Liu r5l_wake_reclaim(conf->log, 0); 348a39f7afdSSong Liu } 349a39f7afdSSong Liu 350a39f7afdSSong Liu /* 351a39f7afdSSong Liu * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 352a39f7afdSSong Liu * stripes in the cache 353a39f7afdSSong Liu */ 354a39f7afdSSong 
Liu void r5c_check_cached_full_stripe(struct r5conf *conf) 355a39f7afdSSong Liu { 356a39f7afdSSong Liu if (!r5c_is_writeback(conf->log)) 357a39f7afdSSong Liu return; 358a39f7afdSSong Liu 359a39f7afdSSong Liu /* 360a39f7afdSSong Liu * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 361a39f7afdSSong Liu * or a full stripe (chunk size / 4k stripes). 362a39f7afdSSong Liu */ 363a39f7afdSSong Liu if (atomic_read(&conf->r5c_cached_full_stripes) >= 36484890c03SShaohua Li min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), 365a39f7afdSSong Liu conf->chunk_sectors >> STRIPE_SHIFT)) 366a39f7afdSSong Liu r5l_wake_reclaim(conf->log, 0); 367a39f7afdSSong Liu } 368a39f7afdSSong Liu 369a39f7afdSSong Liu /* 370a39f7afdSSong Liu * Total log space (in sectors) needed to flush all data in cache 371a39f7afdSSong Liu * 37239b99586SSong Liu * To avoid deadlock due to log space, it is necessary to reserve log 37339b99586SSong Liu * space to flush critical stripes (stripes that occupying log space near 37439b99586SSong Liu * last_checkpoint). This function helps check how much log space is 37539b99586SSong Liu * required to flush all cached stripes. 376a39f7afdSSong Liu * 37739b99586SSong Liu * To reduce log space requirements, two mechanisms are used to give cache 37839b99586SSong Liu * flush higher priorities: 37939b99586SSong Liu * 1. In handle_stripe_dirtying() and schedule_reconstruction(), 38039b99586SSong Liu * stripes ALREADY in journal can be flushed w/o pending writes; 38139b99586SSong Liu * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal 38239b99586SSong Liu * can be delayed (r5l_add_no_space_stripe). 383a39f7afdSSong Liu * 38439b99586SSong Liu * In cache flush, the stripe goes through 1 and then 2. For a stripe that 38539b99586SSong Liu * already passed 1, flushing it requires at most (conf->max_degraded + 1) 38639b99586SSong Liu * pages of journal space. 
For stripes that has not passed 1, flushing it 38739b99586SSong Liu * requires (conf->raid_disks + 1) pages of journal space. There are at 38839b99586SSong Liu * most (conf->group_cnt + 1) stripe that passed 1. So total journal space 38939b99586SSong Liu * required to flush all cached stripes (in pages) is: 39039b99586SSong Liu * 39139b99586SSong Liu * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + 39239b99586SSong Liu * (group_cnt + 1) * (raid_disks + 1) 39339b99586SSong Liu * or 39439b99586SSong Liu * (stripe_in_journal_count) * (max_degraded + 1) + 39539b99586SSong Liu * (group_cnt + 1) * (raid_disks - max_degraded) 396a39f7afdSSong Liu */ 397a39f7afdSSong Liu static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 398a39f7afdSSong Liu { 399a39f7afdSSong Liu struct r5l_log *log = conf->log; 400a39f7afdSSong Liu 401a39f7afdSSong Liu if (!r5c_is_writeback(log)) 402a39f7afdSSong Liu return 0; 403a39f7afdSSong Liu 40439b99586SSong Liu return BLOCK_SECTORS * 40539b99586SSong Liu ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + 40639b99586SSong Liu (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); 407a39f7afdSSong Liu } 408a39f7afdSSong Liu 409a39f7afdSSong Liu /* 410a39f7afdSSong Liu * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 411a39f7afdSSong Liu * 412a39f7afdSSong Liu * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 413a39f7afdSSong Liu * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 414a39f7afdSSong Liu * device is less than 2x of reclaim_required_space. 
415a39f7afdSSong Liu */ 416a39f7afdSSong Liu static inline void r5c_update_log_state(struct r5l_log *log) 417a39f7afdSSong Liu { 418a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 419a39f7afdSSong Liu sector_t free_space; 420a39f7afdSSong Liu sector_t reclaim_space; 421f687a33eSSong Liu bool wake_reclaim = false; 422a39f7afdSSong Liu 423a39f7afdSSong Liu if (!r5c_is_writeback(log)) 424a39f7afdSSong Liu return; 425a39f7afdSSong Liu 426a39f7afdSSong Liu free_space = r5l_ring_distance(log, log->log_start, 427a39f7afdSSong Liu log->last_checkpoint); 428a39f7afdSSong Liu reclaim_space = r5c_log_required_to_flush_cache(conf); 429a39f7afdSSong Liu if (free_space < 2 * reclaim_space) 430a39f7afdSSong Liu set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 431f687a33eSSong Liu else { 432f687a33eSSong Liu if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 433f687a33eSSong Liu wake_reclaim = true; 434a39f7afdSSong Liu clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 435f687a33eSSong Liu } 436a39f7afdSSong Liu if (free_space < 3 * reclaim_space) 437a39f7afdSSong Liu set_bit(R5C_LOG_TIGHT, &conf->cache_state); 438a39f7afdSSong Liu else 439a39f7afdSSong Liu clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 440f687a33eSSong Liu 441f687a33eSSong Liu if (wake_reclaim) 442f687a33eSSong Liu r5l_wake_reclaim(log, 0); 443a39f7afdSSong Liu } 444a39f7afdSSong Liu 4452ded3703SSong Liu /* 4462ded3703SSong Liu * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 4472ded3703SSong Liu * This function should only be called in write-back mode. 
4482ded3703SSong Liu */ 449a39f7afdSSong Liu void r5c_make_stripe_write_out(struct stripe_head *sh) 4502ded3703SSong Liu { 4512ded3703SSong Liu struct r5conf *conf = sh->raid_conf; 4522ded3703SSong Liu struct r5l_log *log = conf->log; 4532ded3703SSong Liu 4542ded3703SSong Liu BUG_ON(!r5c_is_writeback(log)); 4552ded3703SSong Liu 4562ded3703SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 4572ded3703SSong Liu clear_bit(STRIPE_R5C_CACHING, &sh->state); 4581e6d690bSSong Liu 4591e6d690bSSong Liu if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4601e6d690bSSong Liu atomic_inc(&conf->preread_active_stripes); 4611e6d690bSSong Liu } 4621e6d690bSSong Liu 4631e6d690bSSong Liu static void r5c_handle_data_cached(struct stripe_head *sh) 4641e6d690bSSong Liu { 4651e6d690bSSong Liu int i; 4661e6d690bSSong Liu 4671e6d690bSSong Liu for (i = sh->disks; i--; ) 4681e6d690bSSong Liu if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 4691e6d690bSSong Liu set_bit(R5_InJournal, &sh->dev[i].flags); 4701e6d690bSSong Liu clear_bit(R5_LOCKED, &sh->dev[i].flags); 4711e6d690bSSong Liu } 4721e6d690bSSong Liu clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 4731e6d690bSSong Liu } 4741e6d690bSSong Liu 4751e6d690bSSong Liu /* 4761e6d690bSSong Liu * this journal write must contain full parity, 4771e6d690bSSong Liu * it may also contain some data pages 4781e6d690bSSong Liu */ 4791e6d690bSSong Liu static void r5c_handle_parity_cached(struct stripe_head *sh) 4801e6d690bSSong Liu { 4811e6d690bSSong Liu int i; 4821e6d690bSSong Liu 4831e6d690bSSong Liu for (i = sh->disks; i--; ) 4841e6d690bSSong Liu if (test_bit(R5_InJournal, &sh->dev[i].flags)) 4851e6d690bSSong Liu set_bit(R5_Wantwrite, &sh->dev[i].flags); 4862ded3703SSong Liu } 4872ded3703SSong Liu 4882ded3703SSong Liu /* 4892ded3703SSong Liu * Setting proper flags after writing (or flushing) data and/or parity to the 4902ded3703SSong Liu * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 
4912ded3703SSong Liu */ 4922ded3703SSong Liu static void r5c_finish_cache_stripe(struct stripe_head *sh) 4932ded3703SSong Liu { 4942ded3703SSong Liu struct r5l_log *log = sh->raid_conf->log; 4952ded3703SSong Liu 4962ded3703SSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 4972ded3703SSong Liu BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 4982ded3703SSong Liu /* 4992ded3703SSong Liu * Set R5_InJournal for parity dev[pd_idx]. This means 5002ded3703SSong Liu * all data AND parity in the journal. For RAID 6, it is 5012ded3703SSong Liu * NOT necessary to set the flag for dev[qd_idx], as the 5022ded3703SSong Liu * two parities are written out together. 5032ded3703SSong Liu */ 5042ded3703SSong Liu set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 5051e6d690bSSong Liu } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 5061e6d690bSSong Liu r5c_handle_data_cached(sh); 5071e6d690bSSong Liu } else { 5081e6d690bSSong Liu r5c_handle_parity_cached(sh); 5091e6d690bSSong Liu set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 5101e6d690bSSong Liu } 5112ded3703SSong Liu } 5122ded3703SSong Liu 513d8858f43SChristoph Hellwig static void r5l_io_run_stripes(struct r5l_io_unit *io) 514d8858f43SChristoph Hellwig { 515d8858f43SChristoph Hellwig struct stripe_head *sh, *next; 516d8858f43SChristoph Hellwig 517d8858f43SChristoph Hellwig list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 518d8858f43SChristoph Hellwig list_del_init(&sh->log_list); 5192ded3703SSong Liu 5202ded3703SSong Liu r5c_finish_cache_stripe(sh); 5212ded3703SSong Liu 522d8858f43SChristoph Hellwig set_bit(STRIPE_HANDLE, &sh->state); 523d8858f43SChristoph Hellwig raid5_release_stripe(sh); 524d8858f43SChristoph Hellwig } 525d8858f43SChristoph Hellwig } 526d8858f43SChristoph Hellwig 52756fef7c6SChristoph Hellwig static void r5l_log_run_stripes(struct r5l_log *log) 52856fef7c6SChristoph Hellwig { 52956fef7c6SChristoph Hellwig struct r5l_io_unit *io, *next; 53056fef7c6SChristoph Hellwig 
531efa4b77bSShaohua Li lockdep_assert_held(&log->io_list_lock); 53256fef7c6SChristoph Hellwig 53356fef7c6SChristoph Hellwig list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 53456fef7c6SChristoph Hellwig /* don't change list order */ 53556fef7c6SChristoph Hellwig if (io->state < IO_UNIT_IO_END) 53656fef7c6SChristoph Hellwig break; 53756fef7c6SChristoph Hellwig 53856fef7c6SChristoph Hellwig list_move_tail(&io->log_sibling, &log->finished_ios); 53956fef7c6SChristoph Hellwig r5l_io_run_stripes(io); 54056fef7c6SChristoph Hellwig } 54156fef7c6SChristoph Hellwig } 54256fef7c6SChristoph Hellwig 5433848c0bcSChristoph Hellwig static void r5l_move_to_end_ios(struct r5l_log *log) 5443848c0bcSChristoph Hellwig { 5453848c0bcSChristoph Hellwig struct r5l_io_unit *io, *next; 5463848c0bcSChristoph Hellwig 547efa4b77bSShaohua Li lockdep_assert_held(&log->io_list_lock); 5483848c0bcSChristoph Hellwig 5493848c0bcSChristoph Hellwig list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 5503848c0bcSChristoph Hellwig /* don't change list order */ 5513848c0bcSChristoph Hellwig if (io->state < IO_UNIT_IO_END) 5523848c0bcSChristoph Hellwig break; 5533848c0bcSChristoph Hellwig list_move_tail(&io->log_sibling, &log->io_end_ios); 5543848c0bcSChristoph Hellwig } 5553848c0bcSChristoph Hellwig } 5563848c0bcSChristoph Hellwig 5573bddb7f8SSong Liu static void __r5l_stripe_write_finished(struct r5l_io_unit *io); 558f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio) 559f6bed0efSShaohua Li { 560f6bed0efSShaohua Li struct r5l_io_unit *io = bio->bi_private; 5613bddb7f8SSong Liu struct r5l_io_unit *io_deferred; 562f6bed0efSShaohua Li struct r5l_log *log = io->log; 563509ffec7SChristoph Hellwig unsigned long flags; 564a9501d74SSong Liu bool has_null_flush; 565a9501d74SSong Liu bool has_flush_payload; 566f6bed0efSShaohua Li 5674e4cbee9SChristoph Hellwig if (bio->bi_status) 5686e74a9cfSShaohua Li md_error(log->rdev->mddev, log->rdev); 5696e74a9cfSShaohua Li 
570f6bed0efSShaohua Li bio_put(bio); 571afeee514SKent Overstreet mempool_free(io->meta_page, &log->meta_pool); 572f6bed0efSShaohua Li 573509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 574509ffec7SChristoph Hellwig __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 575a9501d74SSong Liu 576a9501d74SSong Liu /* 577a9501d74SSong Liu * if the io doesn't not have null_flush or flush payload, 578a9501d74SSong Liu * it is not safe to access it after releasing io_list_lock. 579a9501d74SSong Liu * Therefore, it is necessary to check the condition with 580a9501d74SSong Liu * the lock held. 581a9501d74SSong Liu */ 582a9501d74SSong Liu has_null_flush = io->has_null_flush; 583a9501d74SSong Liu has_flush_payload = io->has_flush_payload; 584a9501d74SSong Liu 585ea17481fSSong Liu if (log->need_cache_flush && !list_empty(&io->stripe_list)) 5863848c0bcSChristoph Hellwig r5l_move_to_end_ios(log); 58756fef7c6SChristoph Hellwig else 58856fef7c6SChristoph Hellwig r5l_log_run_stripes(log); 5893bddb7f8SSong Liu if (!list_empty(&log->running_ios)) { 5903bddb7f8SSong Liu /* 5913bddb7f8SSong Liu * FLUSH/FUA io_unit is deferred because of ordering, now we 5923bddb7f8SSong Liu * can dispatch it 5933bddb7f8SSong Liu */ 5943bddb7f8SSong Liu io_deferred = list_first_entry(&log->running_ios, 5953bddb7f8SSong Liu struct r5l_io_unit, log_sibling); 5963bddb7f8SSong Liu if (io_deferred->io_deferred) 5973bddb7f8SSong Liu schedule_work(&log->deferred_io_work); 5983bddb7f8SSong Liu } 5993bddb7f8SSong Liu 600509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 601509ffec7SChristoph Hellwig 60256fef7c6SChristoph Hellwig if (log->need_cache_flush) 603f6bed0efSShaohua Li md_wakeup_thread(log->rdev->mddev->thread); 6043bddb7f8SSong Liu 605a9501d74SSong Liu /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ 606a9501d74SSong Liu if (has_null_flush) { 6073bddb7f8SSong Liu struct bio *bi; 6083bddb7f8SSong Liu 6093bddb7f8SSong Liu 
WARN_ON(bio_list_empty(&io->flush_barriers)); 6103bddb7f8SSong Liu while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 6113bddb7f8SSong Liu bio_endio(bi); 612a9501d74SSong Liu if (atomic_dec_and_test(&io->pending_stripe)) { 613a9501d74SSong Liu __r5l_stripe_write_finished(io); 614a9501d74SSong Liu return; 6153bddb7f8SSong Liu } 616ea17481fSSong Liu } 617a9501d74SSong Liu } 618a9501d74SSong Liu /* decrease pending_stripe for flush payload */ 619a9501d74SSong Liu if (has_flush_payload) 620a9501d74SSong Liu if (atomic_dec_and_test(&io->pending_stripe)) 6213bddb7f8SSong Liu __r5l_stripe_write_finished(io); 6223bddb7f8SSong Liu } 6233bddb7f8SSong Liu 6243bddb7f8SSong Liu static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 6253bddb7f8SSong Liu { 6263bddb7f8SSong Liu unsigned long flags; 6273bddb7f8SSong Liu 6283bddb7f8SSong Liu spin_lock_irqsave(&log->io_list_lock, flags); 6293bddb7f8SSong Liu __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 6303bddb7f8SSong Liu spin_unlock_irqrestore(&log->io_list_lock, flags); 6313bddb7f8SSong Liu 632bb3338d3SSong Liu /* 633bb3338d3SSong Liu * In case of journal device failures, submit_bio will get error 634bb3338d3SSong Liu * and calls endio, then active stripes will continue write 635bb3338d3SSong Liu * process. Therefore, it is not necessary to check Faulty bit 636bb3338d3SSong Liu * of journal device here. 637bb3338d3SSong Liu * 638bb3338d3SSong Liu * We can't check split_bio after current_bio is submitted. If 639bb3338d3SSong Liu * io->split_bio is null, after current_bio is submitted, current_bio 640bb3338d3SSong Liu * might already be completed and the io_unit is freed. We submit 641bb3338d3SSong Liu * split_bio first to avoid the issue. 
642bb3338d3SSong Liu */ 643bb3338d3SSong Liu if (io->split_bio) { 6443bddb7f8SSong Liu if (io->has_flush) 64520737738SShaohua Li io->split_bio->bi_opf |= REQ_PREFLUSH; 6463bddb7f8SSong Liu if (io->has_fua) 64720737738SShaohua Li io->split_bio->bi_opf |= REQ_FUA; 6483bddb7f8SSong Liu submit_bio(io->split_bio); 6493bddb7f8SSong Liu } 6503bddb7f8SSong Liu 651bb3338d3SSong Liu if (io->has_flush) 652bb3338d3SSong Liu io->current_bio->bi_opf |= REQ_PREFLUSH; 653bb3338d3SSong Liu if (io->has_fua) 654bb3338d3SSong Liu io->current_bio->bi_opf |= REQ_FUA; 655bb3338d3SSong Liu submit_bio(io->current_bio); 656bb3338d3SSong Liu } 657bb3338d3SSong Liu 6583bddb7f8SSong Liu /* deferred io_unit will be dispatched here */ 6593bddb7f8SSong Liu static void r5l_submit_io_async(struct work_struct *work) 6603bddb7f8SSong Liu { 6613bddb7f8SSong Liu struct r5l_log *log = container_of(work, struct r5l_log, 6623bddb7f8SSong Liu deferred_io_work); 6633bddb7f8SSong Liu struct r5l_io_unit *io = NULL; 6643bddb7f8SSong Liu unsigned long flags; 6653bddb7f8SSong Liu 6663bddb7f8SSong Liu spin_lock_irqsave(&log->io_list_lock, flags); 6673bddb7f8SSong Liu if (!list_empty(&log->running_ios)) { 6683bddb7f8SSong Liu io = list_first_entry(&log->running_ios, struct r5l_io_unit, 6693bddb7f8SSong Liu log_sibling); 6703bddb7f8SSong Liu if (!io->io_deferred) 6713bddb7f8SSong Liu io = NULL; 6723bddb7f8SSong Liu else 6733bddb7f8SSong Liu io->io_deferred = 0; 6743bddb7f8SSong Liu } 6753bddb7f8SSong Liu spin_unlock_irqrestore(&log->io_list_lock, flags); 6763bddb7f8SSong Liu if (io) 6773bddb7f8SSong Liu r5l_do_submit_io(log, io); 678f6bed0efSShaohua Li } 679f6bed0efSShaohua Li 6802e38a37fSSong Liu static void r5c_disable_writeback_async(struct work_struct *work) 6812e38a37fSSong Liu { 6822e38a37fSSong Liu struct r5l_log *log = container_of(work, struct r5l_log, 6832e38a37fSSong Liu disable_writeback_work); 6842e38a37fSSong Liu struct mddev *mddev = log->rdev->mddev; 6854d5324f7SNeilBrown struct r5conf *conf = 
mddev->private; 6864d5324f7SNeilBrown int locked = 0; 6872e38a37fSSong Liu 6882e38a37fSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 6892e38a37fSSong Liu return; 6902e38a37fSSong Liu pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", 6912e38a37fSSong Liu mdname(mddev)); 69270d466f7SSong Liu 69370d466f7SSong Liu /* wait superblock change before suspend */ 69470d466f7SSong Liu wait_event(mddev->sb_wait, 6954d5324f7SNeilBrown conf->log == NULL || 6964d5324f7SNeilBrown (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && 6974d5324f7SNeilBrown (locked = mddev_trylock(mddev)))); 6984d5324f7SNeilBrown if (locked) { 6992e38a37fSSong Liu mddev_suspend(mddev); 7002e38a37fSSong Liu log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 7012e38a37fSSong Liu mddev_resume(mddev); 7024d5324f7SNeilBrown mddev_unlock(mddev); 7034d5324f7SNeilBrown } 7042e38a37fSSong Liu } 7052e38a37fSSong Liu 706f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log) 707f6bed0efSShaohua Li { 708f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 709f6bed0efSShaohua Li struct r5l_meta_block *block; 710509ffec7SChristoph Hellwig unsigned long flags; 711f6bed0efSShaohua Li u32 crc; 7123bddb7f8SSong Liu bool do_submit = true; 713f6bed0efSShaohua Li 714f6bed0efSShaohua Li if (!io) 715f6bed0efSShaohua Li return; 716f6bed0efSShaohua Li 717f6bed0efSShaohua Li block = page_address(io->meta_page); 718f6bed0efSShaohua Li block->meta_size = cpu_to_le32(io->meta_offset); 7195cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 720f6bed0efSShaohua Li block->checksum = cpu_to_le32(crc); 721f6bed0efSShaohua Li 722f6bed0efSShaohua Li log->current_io = NULL; 723509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 7243bddb7f8SSong Liu if (io->has_flush || io->has_fua) { 7253bddb7f8SSong Liu if (io != list_first_entry(&log->running_ios, 7263bddb7f8SSong Liu struct r5l_io_unit, log_sibling)) { 
7273bddb7f8SSong Liu io->io_deferred = 1; 7283bddb7f8SSong Liu do_submit = false; 7293bddb7f8SSong Liu } 7303bddb7f8SSong Liu } 731509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 7323bddb7f8SSong Liu if (do_submit) 7333bddb7f8SSong Liu r5l_do_submit_io(log, io); 734f6bed0efSShaohua Li } 735f6bed0efSShaohua Li 7366143e2ceSChristoph Hellwig static struct bio *r5l_bio_alloc(struct r5l_log *log) 737b349feb3SChristoph Hellwig { 738afeee514SKent Overstreet struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs); 739b349feb3SChristoph Hellwig 740796a5cf0SMike Christie bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 74174d46992SChristoph Hellwig bio_set_dev(bio, log->rdev->bdev); 7421e932a37SChristoph Hellwig bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 743b349feb3SChristoph Hellwig 744b349feb3SChristoph Hellwig return bio; 745b349feb3SChristoph Hellwig } 746b349feb3SChristoph Hellwig 747c1b99198SChristoph Hellwig static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 748c1b99198SChristoph Hellwig { 749c1b99198SChristoph Hellwig log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 750c1b99198SChristoph Hellwig 751a39f7afdSSong Liu r5c_update_log_state(log); 752c1b99198SChristoph Hellwig /* 753c1b99198SChristoph Hellwig * If we filled up the log device start from the beginning again, 754c1b99198SChristoph Hellwig * which will require a new bio. 755c1b99198SChristoph Hellwig * 756c1b99198SChristoph Hellwig * Note: for this to work properly the log size needs to me a multiple 757c1b99198SChristoph Hellwig * of BLOCK_SECTORS. 
758c1b99198SChristoph Hellwig */ 759c1b99198SChristoph Hellwig if (log->log_start == 0) 7606143e2ceSChristoph Hellwig io->need_split_bio = true; 761c1b99198SChristoph Hellwig 762c1b99198SChristoph Hellwig io->log_end = log->log_start; 763c1b99198SChristoph Hellwig } 764c1b99198SChristoph Hellwig 765f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 766f6bed0efSShaohua Li { 767f6bed0efSShaohua Li struct r5l_io_unit *io; 768f6bed0efSShaohua Li struct r5l_meta_block *block; 769f6bed0efSShaohua Li 770afeee514SKent Overstreet io = mempool_alloc(&log->io_pool, GFP_ATOMIC); 7715036c390SChristoph Hellwig if (!io) 7725036c390SChristoph Hellwig return NULL; 7735036c390SChristoph Hellwig memset(io, 0, sizeof(*io)); 7745036c390SChristoph Hellwig 77551039cd0SChristoph Hellwig io->log = log; 77651039cd0SChristoph Hellwig INIT_LIST_HEAD(&io->log_sibling); 77751039cd0SChristoph Hellwig INIT_LIST_HEAD(&io->stripe_list); 7783bddb7f8SSong Liu bio_list_init(&io->flush_barriers); 77951039cd0SChristoph Hellwig io->state = IO_UNIT_RUNNING; 780f6bed0efSShaohua Li 781afeee514SKent Overstreet io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO); 782f6bed0efSShaohua Li block = page_address(io->meta_page); 783e8deb638SChristoph Hellwig clear_page(block); 784f6bed0efSShaohua Li block->magic = cpu_to_le32(R5LOG_MAGIC); 785f6bed0efSShaohua Li block->version = R5LOG_VERSION; 786f6bed0efSShaohua Li block->seq = cpu_to_le64(log->seq); 787f6bed0efSShaohua Li block->position = cpu_to_le64(log->log_start); 788f6bed0efSShaohua Li 789f6bed0efSShaohua Li io->log_start = log->log_start; 790f6bed0efSShaohua Li io->meta_offset = sizeof(struct r5l_meta_block); 7912b8ef16eSChristoph Hellwig io->seq = log->seq++; 792f6bed0efSShaohua Li 7936143e2ceSChristoph Hellwig io->current_bio = r5l_bio_alloc(log); 7946143e2ceSChristoph Hellwig io->current_bio->bi_end_io = r5l_log_endio; 7956143e2ceSChristoph Hellwig io->current_bio->bi_private = io; 796b349feb3SChristoph Hellwig 
bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 797f6bed0efSShaohua Li 798c1b99198SChristoph Hellwig r5_reserve_log_entry(log, io); 799f6bed0efSShaohua Li 800f6bed0efSShaohua Li spin_lock_irq(&log->io_list_lock); 801f6bed0efSShaohua Li list_add_tail(&io->log_sibling, &log->running_ios); 802f6bed0efSShaohua Li spin_unlock_irq(&log->io_list_lock); 803f6bed0efSShaohua Li 804f6bed0efSShaohua Li return io; 805f6bed0efSShaohua Li } 806f6bed0efSShaohua Li 807f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 808f6bed0efSShaohua Li { 80922581f58SChristoph Hellwig if (log->current_io && 81022581f58SChristoph Hellwig log->current_io->meta_offset + payload_size > PAGE_SIZE) 811f6bed0efSShaohua Li r5l_submit_current_io(log); 812f6bed0efSShaohua Li 8135036c390SChristoph Hellwig if (!log->current_io) { 814f6bed0efSShaohua Li log->current_io = r5l_new_meta(log); 8155036c390SChristoph Hellwig if (!log->current_io) 8165036c390SChristoph Hellwig return -ENOMEM; 8175036c390SChristoph Hellwig } 8185036c390SChristoph Hellwig 819f6bed0efSShaohua Li return 0; 820f6bed0efSShaohua Li } 821f6bed0efSShaohua Li 822f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 823f6bed0efSShaohua Li sector_t location, 824f6bed0efSShaohua Li u32 checksum1, u32 checksum2, 825f6bed0efSShaohua Li bool checksum2_valid) 826f6bed0efSShaohua Li { 827f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 828f6bed0efSShaohua Li struct r5l_payload_data_parity *payload; 829f6bed0efSShaohua Li 830f6bed0efSShaohua Li payload = page_address(io->meta_page) + io->meta_offset; 831f6bed0efSShaohua Li payload->header.type = cpu_to_le16(type); 832f6bed0efSShaohua Li payload->header.flags = cpu_to_le16(0); 833f6bed0efSShaohua Li payload->size = cpu_to_le32((1 + !!checksum2_valid) << 834f6bed0efSShaohua Li (PAGE_SHIFT - 9)); 835f6bed0efSShaohua Li payload->location = cpu_to_le64(location); 836f6bed0efSShaohua Li payload->checksum[0] 
= cpu_to_le32(checksum1); 837f6bed0efSShaohua Li if (checksum2_valid) 838f6bed0efSShaohua Li payload->checksum[1] = cpu_to_le32(checksum2); 839f6bed0efSShaohua Li 840f6bed0efSShaohua Li io->meta_offset += sizeof(struct r5l_payload_data_parity) + 841f6bed0efSShaohua Li sizeof(__le32) * (1 + !!checksum2_valid); 842f6bed0efSShaohua Li } 843f6bed0efSShaohua Li 844f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 845f6bed0efSShaohua Li { 846f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 847f6bed0efSShaohua Li 8486143e2ceSChristoph Hellwig if (io->need_split_bio) { 8493bddb7f8SSong Liu BUG_ON(io->split_bio); 8503bddb7f8SSong Liu io->split_bio = io->current_bio; 8516143e2ceSChristoph Hellwig io->current_bio = r5l_bio_alloc(log); 8523bddb7f8SSong Liu bio_chain(io->current_bio, io->split_bio); 8533bddb7f8SSong Liu io->need_split_bio = false; 854f6bed0efSShaohua Li } 855f6bed0efSShaohua Li 8566143e2ceSChristoph Hellwig if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 8576143e2ceSChristoph Hellwig BUG(); 8586143e2ceSChristoph Hellwig 859c1b99198SChristoph Hellwig r5_reserve_log_entry(log, io); 860f6bed0efSShaohua Li } 861f6bed0efSShaohua Li 862ea17481fSSong Liu static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) 863ea17481fSSong Liu { 864ea17481fSSong Liu struct mddev *mddev = log->rdev->mddev; 865ea17481fSSong Liu struct r5conf *conf = mddev->private; 866ea17481fSSong Liu struct r5l_io_unit *io; 867ea17481fSSong Liu struct r5l_payload_flush *payload; 868ea17481fSSong Liu int meta_size; 869ea17481fSSong Liu 870ea17481fSSong Liu /* 871ea17481fSSong Liu * payload_flush requires extra writes to the journal. 
872ea17481fSSong Liu * To avoid handling the extra IO in quiesce, just skip 873ea17481fSSong Liu * flush_payload 874ea17481fSSong Liu */ 875ea17481fSSong Liu if (conf->quiesce) 876ea17481fSSong Liu return; 877ea17481fSSong Liu 878ea17481fSSong Liu mutex_lock(&log->io_mutex); 879ea17481fSSong Liu meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64); 880ea17481fSSong Liu 881ea17481fSSong Liu if (r5l_get_meta(log, meta_size)) { 882ea17481fSSong Liu mutex_unlock(&log->io_mutex); 883ea17481fSSong Liu return; 884ea17481fSSong Liu } 885ea17481fSSong Liu 886ea17481fSSong Liu /* current implementation is one stripe per flush payload */ 887ea17481fSSong Liu io = log->current_io; 888ea17481fSSong Liu payload = page_address(io->meta_page) + io->meta_offset; 889ea17481fSSong Liu payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH); 890ea17481fSSong Liu payload->header.flags = cpu_to_le16(0); 891ea17481fSSong Liu payload->size = cpu_to_le32(sizeof(__le64)); 892ea17481fSSong Liu payload->flush_stripes[0] = cpu_to_le64(sect); 893ea17481fSSong Liu io->meta_offset += meta_size; 894a9501d74SSong Liu /* multiple flush payloads count as one pending_stripe */ 895a9501d74SSong Liu if (!io->has_flush_payload) { 896a9501d74SSong Liu io->has_flush_payload = 1; 897a9501d74SSong Liu atomic_inc(&io->pending_stripe); 898a9501d74SSong Liu } 899ea17481fSSong Liu mutex_unlock(&log->io_mutex); 900ea17481fSSong Liu } 901ea17481fSSong Liu 9025036c390SChristoph Hellwig static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 903f6bed0efSShaohua Li int data_pages, int parity_pages) 904f6bed0efSShaohua Li { 905f6bed0efSShaohua Li int i; 906f6bed0efSShaohua Li int meta_size; 9075036c390SChristoph Hellwig int ret; 908f6bed0efSShaohua Li struct r5l_io_unit *io; 909f6bed0efSShaohua Li 910f6bed0efSShaohua Li meta_size = 911f6bed0efSShaohua Li ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 912f6bed0efSShaohua Li * data_pages) + 913f6bed0efSShaohua Li sizeof(struct 
r5l_payload_data_parity) + 914f6bed0efSShaohua Li sizeof(__le32) * parity_pages; 915f6bed0efSShaohua Li 9165036c390SChristoph Hellwig ret = r5l_get_meta(log, meta_size); 9175036c390SChristoph Hellwig if (ret) 9185036c390SChristoph Hellwig return ret; 9195036c390SChristoph Hellwig 920f6bed0efSShaohua Li io = log->current_io; 921f6bed0efSShaohua Li 9223bddb7f8SSong Liu if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 9233bddb7f8SSong Liu io->has_flush = 1; 9243bddb7f8SSong Liu 925f6bed0efSShaohua Li for (i = 0; i < sh->disks; i++) { 9261e6d690bSSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 9271e6d690bSSong Liu test_bit(R5_InJournal, &sh->dev[i].flags)) 928f6bed0efSShaohua Li continue; 929f6bed0efSShaohua Li if (i == sh->pd_idx || i == sh->qd_idx) 930f6bed0efSShaohua Li continue; 9313bddb7f8SSong Liu if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 9323bddb7f8SSong Liu log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 9333bddb7f8SSong Liu io->has_fua = 1; 9343bddb7f8SSong Liu /* 9353bddb7f8SSong Liu * we need to flush journal to make sure recovery can 9363bddb7f8SSong Liu * reach the data with fua flag 9373bddb7f8SSong Liu */ 9383bddb7f8SSong Liu io->has_flush = 1; 9393bddb7f8SSong Liu } 940f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 941f6bed0efSShaohua Li raid5_compute_blocknr(sh, i, 0), 942f6bed0efSShaohua Li sh->dev[i].log_checksum, 0, false); 943f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[i].page); 944f6bed0efSShaohua Li } 945f6bed0efSShaohua Li 9462ded3703SSong Liu if (parity_pages == 2) { 947f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 948f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 949f6bed0efSShaohua Li sh->dev[sh->qd_idx].log_checksum, true); 950f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 951f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 9522ded3703SSong Liu } else if (parity_pages == 1) { 
953f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 954f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 955f6bed0efSShaohua Li 0, false); 956f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 9572ded3703SSong Liu } else /* Just writing data, not parity, in caching phase */ 9582ded3703SSong Liu BUG_ON(parity_pages != 0); 959f6bed0efSShaohua Li 960f6bed0efSShaohua Li list_add_tail(&sh->log_list, &io->stripe_list); 961f6bed0efSShaohua Li atomic_inc(&io->pending_stripe); 962f6bed0efSShaohua Li sh->log_io = io; 9635036c390SChristoph Hellwig 964a39f7afdSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 965a39f7afdSSong Liu return 0; 966a39f7afdSSong Liu 967a39f7afdSSong Liu if (sh->log_start == MaxSector) { 968a39f7afdSSong Liu BUG_ON(!list_empty(&sh->r5c)); 969a39f7afdSSong Liu sh->log_start = io->log_start; 970a39f7afdSSong Liu spin_lock_irq(&log->stripe_in_journal_lock); 971a39f7afdSSong Liu list_add_tail(&sh->r5c, 972a39f7afdSSong Liu &log->stripe_in_journal_list); 973a39f7afdSSong Liu spin_unlock_irq(&log->stripe_in_journal_lock); 974a39f7afdSSong Liu atomic_inc(&log->stripe_in_journal_count); 975a39f7afdSSong Liu } 9765036c390SChristoph Hellwig return 0; 977f6bed0efSShaohua Li } 978f6bed0efSShaohua Li 979a39f7afdSSong Liu /* add stripe to no_space_stripes, and then wake up reclaim */ 980a39f7afdSSong Liu static inline void r5l_add_no_space_stripe(struct r5l_log *log, 981a39f7afdSSong Liu struct stripe_head *sh) 982a39f7afdSSong Liu { 983a39f7afdSSong Liu spin_lock(&log->no_space_stripes_lock); 984a39f7afdSSong Liu list_add_tail(&sh->log_list, &log->no_space_stripes); 985a39f7afdSSong Liu spin_unlock(&log->no_space_stripes_lock); 986a39f7afdSSong Liu } 987a39f7afdSSong Liu 988f6bed0efSShaohua Li /* 989f6bed0efSShaohua Li * running in raid5d, where reclaim could wait for raid5d too (when it flushes 990f6bed0efSShaohua Li * data from log to raid disks), so we shouldn't wait for reclaim 
 * here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int write_disks = 0;
	int data_pages, parity_pages;
	int reserve;
	int i;
	int ret = 0;
	bool wake_reclaim = false;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	/* caller must only log stripes that are in writing-out phase */
	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));

	/*
	 * Count the pages that must go to the journal and compute their
	 * checksums.  Pages already in the journal (R5_InJournal) are
	 * skipped.
	 */
	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;

		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data, in sectors */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		if (!r5l_has_free_space(log, reserve)) {
			r5l_add_no_space_stripe(log, sh);
			wake_reclaim = true;
		} else {
			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
			if (ret) {
				/* io_unit allocation failed; retried from
				 * r5l_run_no_mem_stripe() */
				spin_lock_irq(&log->io_list_lock);
				list_add_tail(&sh->log_list,
					      &log->no_mem_stripes);
				spin_unlock_irq(&log->io_list_lock);
			}
		}
	} else {	/* R5C_JOURNAL_MODE_WRITE_BACK */
		/*
		 * log space critical, do not process stripes that are
		 * not in cache yet (sh->log_start == MaxSector).
		 */
		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
		    sh->log_start == MaxSector) {
			r5l_add_no_space_stripe(log, sh);
			wake_reclaim = true;
			reserve = 0;
		} else if (!r5l_has_free_space(log, reserve)) {
			/*
			 * NOTE(review): a stripe sitting at the current
			 * checkpoint with no free space means reclaim can
			 * never make progress, hence the BUG().
			 */
			if (sh->log_start == log->last_checkpoint)
				BUG();
			else
				r5l_add_no_space_stripe(log, sh);
		} else {
			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
			if (ret) {
				spin_lock_irq(&log->io_list_lock);
				list_add_tail(&sh->log_list,
					      &log->no_mem_stripes);
				spin_unlock_irq(&log->io_list_lock);
			}
		}
	}

	mutex_unlock(&log->io_mutex);
	if (wake_reclaim)
		r5l_wake_reclaim(log, reserve);
	return 0;
}

/* submit the currently accumulating io_unit, if any */
void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

/*
 * Handle a PREFLUSH request against the journal.
 * Returns 0 if the flush was fully handled here (bio completed),
 * -EAGAIN if the caller should continue processing the bio.
 */
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		/*
		 * in write through (journal only)
		 * we flush log disk cache first, then write stripe data to
		 * raid disks. So if bio is finished, the log disk cache is
		 * flushed already. The recovery guarantees we can recover
		 * the bio from log disk, so we don't need to flush again
		 */
		if (bio->bi_iter.bi_size == 0) {
			bio_endio(bio);
			return 0;
		}
		bio->bi_opf &= ~REQ_PREFLUSH;
	} else {
		/* write back (with cache) */
		if (bio->bi_iter.bi_size == 0) {
			/*
			 * An empty flush is turned into a flush-only
			 * io_unit carrying the bio as a flush barrier.
			 */
			mutex_lock(&log->io_mutex);
			r5l_get_meta(log, 0);
			bio_list_add(&log->current_io->flush_barriers, bio);
			log->current_io->has_flush = 1;
			log->current_io->has_null_flush = 1;
			atomic_inc(&log->current_io->pending_stripe);
			r5l_submit_current_io(log);
			mutex_unlock(&log->io_mutex);
			return 0;
		}
	}
	return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

/*
 * calculate new last_checkpoint
 * for write through mode, returns log->next_checkpoint
 * for write back, returns log_start of first sh in stripe_in_journal_list
 */
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
	struct stripe_head *sh;
	struct r5l_log *log = conf->log;
	sector_t new_cp;
	unsigned long flags;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return log->next_checkpoint;

	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
	if (list_empty(&conf->log->stripe_in_journal_list)) {
		/* all stripes flushed */
		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
		return log->next_checkpoint;
	}
	sh = list_first_entry(&conf->log->stripe_in_journal_list,
			      struct stripe_head, r5c);
	new_cp = sh->log_start;
	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
	return new_cp;
}

/* sectors between last_checkpoint and the next checkpoint candidate */
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;

	return r5l_ring_distance(log, log->last_checkpoint,
				 r5c_calculate_new_cp(conf));
}

/*
 * Retry one stripe that previously failed io_unit allocation
 * (see no_mem_stripes in r5l_write_stripe).  Caller holds io_list_lock.
 */
static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *sh;

	lockdep_assert_held(&log->io_list_lock);

	if (!list_empty(&log->no_mem_stripes)) {
		sh = list_first_entry(&log->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

/*
 * Free finished io_units in list order, advancing log->next_checkpoint
 * past each one.  Returns true if at least one io_unit was completed.
 * Caller holds io_list_lock.
 */
static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;

		list_del(&io->log_sibling);
		mempool_free(io, &log->io_pool);
		r5l_run_no_mem_stripe(log);

		found = true;
	}

	return found;
}

/*
 * Last reference to an io_unit dropped: mark it IO_UNIT_STRIPE_END,
 * reap finished io_units and wake reclaim if enough space is reclaimable
 * (or the log is tight).
 */
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	struct r5conf *conf = log->rdev->mddev->private;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space ||
	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

/* a stripe finished writing to the raid disks: drop its io_unit reference */
void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

/*
 * Completion of the journal-device cache flush: run the stripes of all
 * flushing io_units and move them to finished_ios.  A flush error marks
 * the journal device faulty.
 */
static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_status)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting dispatch IO to raid.
 * io_unit(meta) consists of a log. There is one situation we want to avoid. A
 * broken meta in the middle of a log causes recovery to fail to find meta at
 * the head of log. If operations require meta at the head persistent in log,
 * we must make sure meta before it persistent in log too. A case is:
 *
 * stripe data/parity is in log, we start write stripe to raid disks. stripe
 * data/parity must be persistent in log before we do the write to raid disks.
 *
 * The solution is we strictly maintain io_unit list order. In this case, we
 * only write stripes of an io_unit to raid disks till the io_unit is the first
 * one whose data/parity is in log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	bio_set_dev(&log->flush_bio, log->rdev->bdev);
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	submit_bio(&log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure
	 * superblock is updated to new log tail. Updating superblock (either
	 * directly call md_update_sb() or depend on md thread) must hold
	 * reconfig mutex. On the other hand, raid5_quiesce is called with
	 * reconfig_mutex held. The first step of raid5_quiesce() is waiting
	 * for all IO to finish, hence waiting for the reclaim thread, while
	 * the reclaim thread is calling this function and waiting for the
	 * reconfig mutex. So there is a deadlock. We workaround this issue
	 * with a trylock.
	 * FIXME: we could miss discard if we can't take reconfig mutex
	 */
	set_mask_bits(&mddev->sb_flags, 0,
		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		/* reclaimed region wraps around the end of the log device */
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}

/*
 * r5c_flush_stripe
moves stripe from cached list to handle_list. When called, 1360a39f7afdSSong Liu * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. 1361a39f7afdSSong Liu * 1362a39f7afdSSong Liu * must hold conf->device_lock 1363a39f7afdSSong Liu */ 1364a39f7afdSSong Liu static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) 1365a39f7afdSSong Liu { 1366a39f7afdSSong Liu BUG_ON(list_empty(&sh->lru)); 1367a39f7afdSSong Liu BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 1368a39f7afdSSong Liu BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 1369a39f7afdSSong Liu 1370a39f7afdSSong Liu /* 1371a39f7afdSSong Liu * The stripe is not ON_RELEASE_LIST, so it is safe to call 1372a39f7afdSSong Liu * raid5_release_stripe() while holding conf->device_lock 1373a39f7afdSSong Liu */ 1374a39f7afdSSong Liu BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 1375efa4b77bSShaohua Li lockdep_assert_held(&conf->device_lock); 1376a39f7afdSSong Liu 1377a39f7afdSSong Liu list_del_init(&sh->lru); 1378a39f7afdSSong Liu atomic_inc(&sh->count); 1379a39f7afdSSong Liu 1380a39f7afdSSong Liu set_bit(STRIPE_HANDLE, &sh->state); 1381a39f7afdSSong Liu atomic_inc(&conf->active_stripes); 1382a39f7afdSSong Liu r5c_make_stripe_write_out(sh); 1383a39f7afdSSong Liu 1384e33fbb9cSShaohua Li if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 1385e33fbb9cSShaohua Li atomic_inc(&conf->r5c_flushing_partial_stripes); 1386e33fbb9cSShaohua Li else 1387e33fbb9cSShaohua Li atomic_inc(&conf->r5c_flushing_full_stripes); 1388a39f7afdSSong Liu raid5_release_stripe(sh); 1389a39f7afdSSong Liu } 1390a39f7afdSSong Liu 1391a39f7afdSSong Liu /* 1392a39f7afdSSong Liu * if num == 0, flush all full stripes 1393a39f7afdSSong Liu * if num > 0, flush all full stripes. If less than num full stripes are 1394a39f7afdSSong Liu * flushed, flush some partial stripes until totally num stripes are 1395a39f7afdSSong Liu * flushed or there is no more cached stripes. 
1396a39f7afdSSong Liu */ 1397a39f7afdSSong Liu void r5c_flush_cache(struct r5conf *conf, int num) 1398a39f7afdSSong Liu { 1399a39f7afdSSong Liu int count; 1400a39f7afdSSong Liu struct stripe_head *sh, *next; 1401a39f7afdSSong Liu 1402efa4b77bSShaohua Li lockdep_assert_held(&conf->device_lock); 1403a39f7afdSSong Liu if (!conf->log) 1404a39f7afdSSong Liu return; 1405a39f7afdSSong Liu 1406a39f7afdSSong Liu count = 0; 1407a39f7afdSSong Liu list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 1408a39f7afdSSong Liu r5c_flush_stripe(conf, sh); 1409a39f7afdSSong Liu count++; 1410a39f7afdSSong Liu } 1411a39f7afdSSong Liu 1412a39f7afdSSong Liu if (count >= num) 1413a39f7afdSSong Liu return; 1414a39f7afdSSong Liu list_for_each_entry_safe(sh, next, 1415a39f7afdSSong Liu &conf->r5c_partial_stripe_list, lru) { 1416a39f7afdSSong Liu r5c_flush_stripe(conf, sh); 1417a39f7afdSSong Liu if (++count >= num) 1418a39f7afdSSong Liu break; 1419a39f7afdSSong Liu } 1420a39f7afdSSong Liu } 1421a39f7afdSSong Liu 1422a39f7afdSSong Liu static void r5c_do_reclaim(struct r5conf *conf) 1423a39f7afdSSong Liu { 1424a39f7afdSSong Liu struct r5l_log *log = conf->log; 1425a39f7afdSSong Liu struct stripe_head *sh; 1426a39f7afdSSong Liu int count = 0; 1427a39f7afdSSong Liu unsigned long flags; 1428a39f7afdSSong Liu int total_cached; 1429a39f7afdSSong Liu int stripes_to_flush; 1430e33fbb9cSShaohua Li int flushing_partial, flushing_full; 1431a39f7afdSSong Liu 1432a39f7afdSSong Liu if (!r5c_is_writeback(log)) 1433a39f7afdSSong Liu return; 1434a39f7afdSSong Liu 1435e33fbb9cSShaohua Li flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); 1436e33fbb9cSShaohua Li flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); 1437a39f7afdSSong Liu total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 1438e33fbb9cSShaohua Li atomic_read(&conf->r5c_cached_full_stripes) - 1439e33fbb9cSShaohua Li flushing_full - flushing_partial; 1440a39f7afdSSong Liu 1441a39f7afdSSong Liu 
if (total_cached > conf->min_nr_stripes * 3 / 4 || 1442a39f7afdSSong Liu atomic_read(&conf->empty_inactive_list_nr) > 0) 1443a39f7afdSSong Liu /* 1444a39f7afdSSong Liu * if stripe cache pressure high, flush all full stripes and 1445a39f7afdSSong Liu * some partial stripes 1446a39f7afdSSong Liu */ 1447a39f7afdSSong Liu stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 1448a39f7afdSSong Liu else if (total_cached > conf->min_nr_stripes * 1 / 2 || 1449e33fbb9cSShaohua Li atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > 145084890c03SShaohua Li R5C_FULL_STRIPE_FLUSH_BATCH(conf)) 1451a39f7afdSSong Liu /* 1452a39f7afdSSong Liu * if stripe cache pressure moderate, or if there is many full 1453a39f7afdSSong Liu * stripes,flush all full stripes 1454a39f7afdSSong Liu */ 1455a39f7afdSSong Liu stripes_to_flush = 0; 1456a39f7afdSSong Liu else 1457a39f7afdSSong Liu /* no need to flush */ 1458a39f7afdSSong Liu stripes_to_flush = -1; 1459a39f7afdSSong Liu 1460a39f7afdSSong Liu if (stripes_to_flush >= 0) { 1461a39f7afdSSong Liu spin_lock_irqsave(&conf->device_lock, flags); 1462a39f7afdSSong Liu r5c_flush_cache(conf, stripes_to_flush); 1463a39f7afdSSong Liu spin_unlock_irqrestore(&conf->device_lock, flags); 1464a39f7afdSSong Liu } 1465a39f7afdSSong Liu 1466a39f7afdSSong Liu /* if log space is tight, flush stripes on stripe_in_journal_list */ 1467a39f7afdSSong Liu if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 1468a39f7afdSSong Liu spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1469a39f7afdSSong Liu spin_lock(&conf->device_lock); 1470a39f7afdSSong Liu list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 1471a39f7afdSSong Liu /* 1472a39f7afdSSong Liu * stripes on stripe_in_journal_list could be in any 1473a39f7afdSSong Liu * state of the stripe_cache state machine. In this 1474a39f7afdSSong Liu * case, we only want to flush stripe on 1475a39f7afdSSong Liu * r5c_cached_full/partial_stripes. 
The following 1476a39f7afdSSong Liu * condition makes sure the stripe is on one of the 1477a39f7afdSSong Liu * two lists. 1478a39f7afdSSong Liu */ 1479a39f7afdSSong Liu if (!list_empty(&sh->lru) && 1480a39f7afdSSong Liu !test_bit(STRIPE_HANDLE, &sh->state) && 1481a39f7afdSSong Liu atomic_read(&sh->count) == 0) { 1482a39f7afdSSong Liu r5c_flush_stripe(conf, sh); 1483a39f7afdSSong Liu if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 1484a39f7afdSSong Liu break; 1485a39f7afdSSong Liu } 1486e8fd52eeSShaohua Li } 1487a39f7afdSSong Liu spin_unlock(&conf->device_lock); 1488a39f7afdSSong Liu spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1489a39f7afdSSong Liu } 1490f687a33eSSong Liu 1491f687a33eSSong Liu if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 1492f687a33eSSong Liu r5l_run_no_space_stripes(log); 1493f687a33eSSong Liu 1494a39f7afdSSong Liu md_wakeup_thread(conf->mddev->thread); 1495a39f7afdSSong Liu } 14964b482044SShaohua Li 14970576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log) 14980576b1c6SShaohua Li { 1499a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 15000576b1c6SShaohua Li sector_t reclaim_target = xchg(&log->reclaim_target, 0); 150117036461SChristoph Hellwig sector_t reclaimable; 150217036461SChristoph Hellwig sector_t next_checkpoint; 1503a39f7afdSSong Liu bool write_super; 15040576b1c6SShaohua Li 15050576b1c6SShaohua Li spin_lock_irq(&log->io_list_lock); 1506a39f7afdSSong Liu write_super = r5l_reclaimable_space(log) > log->max_free_space || 1507a39f7afdSSong Liu reclaim_target != 0 || !list_empty(&log->no_space_stripes); 15080576b1c6SShaohua Li /* 15090576b1c6SShaohua Li * move proper io_unit to reclaim list. We should not change the order. 
15100576b1c6SShaohua Li * reclaimable/unreclaimable io_unit can be mixed in the list, we 15110576b1c6SShaohua Li * shouldn't reuse space of an unreclaimable io_unit 15120576b1c6SShaohua Li */ 15130576b1c6SShaohua Li while (1) { 151417036461SChristoph Hellwig reclaimable = r5l_reclaimable_space(log); 151517036461SChristoph Hellwig if (reclaimable >= reclaim_target || 15160576b1c6SShaohua Li (list_empty(&log->running_ios) && 15170576b1c6SShaohua Li list_empty(&log->io_end_ios) && 1518a8c34f91SShaohua Li list_empty(&log->flushing_ios) && 151904732f74SChristoph Hellwig list_empty(&log->finished_ios))) 15200576b1c6SShaohua Li break; 15210576b1c6SShaohua Li 152217036461SChristoph Hellwig md_wakeup_thread(log->rdev->mddev->thread); 152317036461SChristoph Hellwig wait_event_lock_irq(log->iounit_wait, 152417036461SChristoph Hellwig r5l_reclaimable_space(log) > reclaimable, 152517036461SChristoph Hellwig log->io_list_lock); 15260576b1c6SShaohua Li } 152717036461SChristoph Hellwig 1528a39f7afdSSong Liu next_checkpoint = r5c_calculate_new_cp(conf); 15290576b1c6SShaohua Li spin_unlock_irq(&log->io_list_lock); 15300576b1c6SShaohua Li 1531a39f7afdSSong Liu if (reclaimable == 0 || !write_super) 15320576b1c6SShaohua Li return; 15330576b1c6SShaohua Li 15340576b1c6SShaohua Li /* 15350576b1c6SShaohua Li * write_super will flush cache of each raid disk. 
We must write super 15360576b1c6SShaohua Li * here, because the log area might be reused soon and we don't want to 15370576b1c6SShaohua Li * confuse recovery 15380576b1c6SShaohua Li */ 15394b482044SShaohua Li r5l_write_super_and_discard_space(log, next_checkpoint); 15400576b1c6SShaohua Li 15410576b1c6SShaohua Li mutex_lock(&log->io_mutex); 154217036461SChristoph Hellwig log->last_checkpoint = next_checkpoint; 1543a39f7afdSSong Liu r5c_update_log_state(log); 15440576b1c6SShaohua Li mutex_unlock(&log->io_mutex); 15450576b1c6SShaohua Li 154617036461SChristoph Hellwig r5l_run_no_space_stripes(log); 15470576b1c6SShaohua Li } 15480576b1c6SShaohua Li 15490576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread) 15500576b1c6SShaohua Li { 15510576b1c6SShaohua Li struct mddev *mddev = thread->mddev; 15520576b1c6SShaohua Li struct r5conf *conf = mddev->private; 15530576b1c6SShaohua Li struct r5l_log *log = conf->log; 15540576b1c6SShaohua Li 15550576b1c6SShaohua Li if (!log) 15560576b1c6SShaohua Li return; 1557a39f7afdSSong Liu r5c_do_reclaim(conf); 15580576b1c6SShaohua Li r5l_do_reclaim(log); 15590576b1c6SShaohua Li } 15600576b1c6SShaohua Li 1561a39f7afdSSong Liu void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1562f6bed0efSShaohua Li { 15630576b1c6SShaohua Li unsigned long target; 15640576b1c6SShaohua Li unsigned long new = (unsigned long)space; /* overflow in theory */ 15650576b1c6SShaohua Li 1566a39f7afdSSong Liu if (!log) 1567a39f7afdSSong Liu return; 15680576b1c6SShaohua Li do { 15690576b1c6SShaohua Li target = log->reclaim_target; 15700576b1c6SShaohua Li if (new < target) 15710576b1c6SShaohua Li return; 15720576b1c6SShaohua Li } while (cmpxchg(&log->reclaim_target, target, new) != target); 15730576b1c6SShaohua Li md_wakeup_thread(log->reclaim_thread); 1574f6bed0efSShaohua Li } 1575f6bed0efSShaohua Li 1576b03e0ccbSNeilBrown void r5l_quiesce(struct r5l_log *log, int quiesce) 1577e6c033f7SShaohua Li { 15784b482044SShaohua Li struct mddev 
*mddev; 1579b03e0ccbSNeilBrown 1580b03e0ccbSNeilBrown if (quiesce) { 15814b482044SShaohua Li /* make sure r5l_write_super_and_discard_space exits */ 15824b482044SShaohua Li mddev = log->rdev->mddev; 15834b482044SShaohua Li wake_up(&mddev->sb_wait); 1584ce1ccd07SShaohua Li kthread_park(log->reclaim_thread->tsk); 1585a39f7afdSSong Liu r5l_wake_reclaim(log, MaxSector); 1586e6c033f7SShaohua Li r5l_do_reclaim(log); 1587b03e0ccbSNeilBrown } else 1588b03e0ccbSNeilBrown kthread_unpark(log->reclaim_thread->tsk); 1589e6c033f7SShaohua Li } 1590e6c033f7SShaohua Li 15916e74a9cfSShaohua Li bool r5l_log_disk_error(struct r5conf *conf) 15926e74a9cfSShaohua Li { 1593f6b6ec5cSShaohua Li struct r5l_log *log; 1594f6b6ec5cSShaohua Li bool ret; 15957dde2ad3SShaohua Li /* don't allow write if journal disk is missing */ 1596f6b6ec5cSShaohua Li rcu_read_lock(); 1597f6b6ec5cSShaohua Li log = rcu_dereference(conf->log); 1598f6b6ec5cSShaohua Li 1599f6b6ec5cSShaohua Li if (!log) 1600f6b6ec5cSShaohua Li ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1601f6b6ec5cSShaohua Li else 1602f6b6ec5cSShaohua Li ret = test_bit(Faulty, &log->rdev->flags); 1603f6b6ec5cSShaohua Li rcu_read_unlock(); 1604f6b6ec5cSShaohua Li return ret; 16056e74a9cfSShaohua Li } 16066e74a9cfSShaohua Li 1607effe6ee7SSong Liu #define R5L_RECOVERY_PAGE_POOL_SIZE 256 1608effe6ee7SSong Liu 1609355810d1SShaohua Li struct r5l_recovery_ctx { 1610355810d1SShaohua Li struct page *meta_page; /* current meta */ 1611355810d1SShaohua Li sector_t meta_total_blocks; /* total size of current meta and data */ 1612355810d1SShaohua Li sector_t pos; /* recovery position */ 1613355810d1SShaohua Li u64 seq; /* recovery position seq */ 1614b4c625c6SSong Liu int data_parity_stripes; /* number of data_parity stripes */ 1615b4c625c6SSong Liu int data_only_stripes; /* number of data_only stripes */ 1616b4c625c6SSong Liu struct list_head cached_list; 1617effe6ee7SSong Liu 1618effe6ee7SSong Liu /* 1619effe6ee7SSong Liu * read ahead page pool 
(ra_pool) 1620effe6ee7SSong Liu * in recovery, log is read sequentially. It is not efficient to 1621effe6ee7SSong Liu * read every page with sync_page_io(). The read ahead page pool 1622effe6ee7SSong Liu * reads multiple pages with one IO, so further log read can 1623effe6ee7SSong Liu * just copy data from the pool. 1624effe6ee7SSong Liu */ 1625effe6ee7SSong Liu struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE]; 1626effe6ee7SSong Liu sector_t pool_offset; /* offset of first page in the pool */ 1627effe6ee7SSong Liu int total_pages; /* total allocated pages */ 1628effe6ee7SSong Liu int valid_pages; /* pages with valid data */ 1629effe6ee7SSong Liu struct bio *ra_bio; /* bio to do the read ahead */ 1630355810d1SShaohua Li }; 1631355810d1SShaohua Li 1632effe6ee7SSong Liu static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, 1633effe6ee7SSong Liu struct r5l_recovery_ctx *ctx) 1634effe6ee7SSong Liu { 1635effe6ee7SSong Liu struct page *page; 1636effe6ee7SSong Liu 1637afeee514SKent Overstreet ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs); 1638effe6ee7SSong Liu if (!ctx->ra_bio) 1639effe6ee7SSong Liu return -ENOMEM; 1640effe6ee7SSong Liu 1641effe6ee7SSong Liu ctx->valid_pages = 0; 1642effe6ee7SSong Liu ctx->total_pages = 0; 1643effe6ee7SSong Liu while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) { 1644effe6ee7SSong Liu page = alloc_page(GFP_KERNEL); 1645effe6ee7SSong Liu 1646effe6ee7SSong Liu if (!page) 1647effe6ee7SSong Liu break; 1648effe6ee7SSong Liu ctx->ra_pool[ctx->total_pages] = page; 1649effe6ee7SSong Liu ctx->total_pages += 1; 1650effe6ee7SSong Liu } 1651effe6ee7SSong Liu 1652effe6ee7SSong Liu if (ctx->total_pages == 0) { 1653effe6ee7SSong Liu bio_put(ctx->ra_bio); 1654effe6ee7SSong Liu return -ENOMEM; 1655effe6ee7SSong Liu } 1656effe6ee7SSong Liu 1657effe6ee7SSong Liu ctx->pool_offset = 0; 1658effe6ee7SSong Liu return 0; 1659effe6ee7SSong Liu } 1660effe6ee7SSong Liu 1661effe6ee7SSong Liu static void 
r5l_recovery_free_ra_pool(struct r5l_log *log, 1662effe6ee7SSong Liu struct r5l_recovery_ctx *ctx) 1663effe6ee7SSong Liu { 1664effe6ee7SSong Liu int i; 1665effe6ee7SSong Liu 1666effe6ee7SSong Liu for (i = 0; i < ctx->total_pages; ++i) 1667effe6ee7SSong Liu put_page(ctx->ra_pool[i]); 1668effe6ee7SSong Liu bio_put(ctx->ra_bio); 1669effe6ee7SSong Liu } 1670effe6ee7SSong Liu 1671effe6ee7SSong Liu /* 1672effe6ee7SSong Liu * fetch ctx->valid_pages pages from offset 1673effe6ee7SSong Liu * In normal cases, ctx->valid_pages == ctx->total_pages after the call. 1674effe6ee7SSong Liu * However, if the offset is close to the end of the journal device, 1675effe6ee7SSong Liu * ctx->valid_pages could be smaller than ctx->total_pages 1676effe6ee7SSong Liu */ 1677effe6ee7SSong Liu static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, 1678effe6ee7SSong Liu struct r5l_recovery_ctx *ctx, 1679effe6ee7SSong Liu sector_t offset) 1680effe6ee7SSong Liu { 1681effe6ee7SSong Liu bio_reset(ctx->ra_bio); 168274d46992SChristoph Hellwig bio_set_dev(ctx->ra_bio, log->rdev->bdev); 1683effe6ee7SSong Liu bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); 1684effe6ee7SSong Liu ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; 1685effe6ee7SSong Liu 1686effe6ee7SSong Liu ctx->valid_pages = 0; 1687effe6ee7SSong Liu ctx->pool_offset = offset; 1688effe6ee7SSong Liu 1689effe6ee7SSong Liu while (ctx->valid_pages < ctx->total_pages) { 1690effe6ee7SSong Liu bio_add_page(ctx->ra_bio, 1691effe6ee7SSong Liu ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0); 1692effe6ee7SSong Liu ctx->valid_pages += 1; 1693effe6ee7SSong Liu 1694effe6ee7SSong Liu offset = r5l_ring_add(log, offset, BLOCK_SECTORS); 1695effe6ee7SSong Liu 1696effe6ee7SSong Liu if (offset == 0) /* reached end of the device */ 1697effe6ee7SSong Liu break; 1698effe6ee7SSong Liu } 1699effe6ee7SSong Liu 1700effe6ee7SSong Liu return submit_bio_wait(ctx->ra_bio); 1701effe6ee7SSong Liu } 1702effe6ee7SSong Liu 1703effe6ee7SSong Liu /* 
1704effe6ee7SSong Liu * try read a page from the read ahead page pool, if the page is not in the 1705effe6ee7SSong Liu * pool, call r5l_recovery_fetch_ra_pool 1706effe6ee7SSong Liu */ 1707effe6ee7SSong Liu static int r5l_recovery_read_page(struct r5l_log *log, 1708effe6ee7SSong Liu struct r5l_recovery_ctx *ctx, 1709effe6ee7SSong Liu struct page *page, 1710effe6ee7SSong Liu sector_t offset) 1711effe6ee7SSong Liu { 1712effe6ee7SSong Liu int ret; 1713effe6ee7SSong Liu 1714effe6ee7SSong Liu if (offset < ctx->pool_offset || 1715effe6ee7SSong Liu offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) { 1716effe6ee7SSong Liu ret = r5l_recovery_fetch_ra_pool(log, ctx, offset); 1717effe6ee7SSong Liu if (ret) 1718effe6ee7SSong Liu return ret; 1719effe6ee7SSong Liu } 1720effe6ee7SSong Liu 1721effe6ee7SSong Liu BUG_ON(offset < ctx->pool_offset || 1722effe6ee7SSong Liu offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS); 1723effe6ee7SSong Liu 1724effe6ee7SSong Liu memcpy(page_address(page), 1725effe6ee7SSong Liu page_address(ctx->ra_pool[(offset - ctx->pool_offset) >> 1726effe6ee7SSong Liu BLOCK_SECTOR_SHIFT]), 1727effe6ee7SSong Liu PAGE_SIZE); 1728effe6ee7SSong Liu return 0; 1729effe6ee7SSong Liu } 1730effe6ee7SSong Liu 17319ed988f5SSong Liu static int r5l_recovery_read_meta_block(struct r5l_log *log, 1732355810d1SShaohua Li struct r5l_recovery_ctx *ctx) 1733355810d1SShaohua Li { 1734355810d1SShaohua Li struct page *page = ctx->meta_page; 1735355810d1SShaohua Li struct r5l_meta_block *mb; 1736355810d1SShaohua Li u32 crc, stored_crc; 1737effe6ee7SSong Liu int ret; 1738355810d1SShaohua Li 1739effe6ee7SSong Liu ret = r5l_recovery_read_page(log, ctx, page, ctx->pos); 1740effe6ee7SSong Liu if (ret != 0) 1741effe6ee7SSong Liu return ret; 1742355810d1SShaohua Li 1743355810d1SShaohua Li mb = page_address(page); 1744355810d1SShaohua Li stored_crc = le32_to_cpu(mb->checksum); 1745355810d1SShaohua Li mb->checksum = 0; 1746355810d1SShaohua Li 1747355810d1SShaohua Li if 
(le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1748355810d1SShaohua Li le64_to_cpu(mb->seq) != ctx->seq || 1749355810d1SShaohua Li mb->version != R5LOG_VERSION || 1750355810d1SShaohua Li le64_to_cpu(mb->position) != ctx->pos) 1751355810d1SShaohua Li return -EINVAL; 1752355810d1SShaohua Li 17535cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1754355810d1SShaohua Li if (stored_crc != crc) 1755355810d1SShaohua Li return -EINVAL; 1756355810d1SShaohua Li 1757355810d1SShaohua Li if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1758355810d1SShaohua Li return -EINVAL; 1759355810d1SShaohua Li 1760355810d1SShaohua Li ctx->meta_total_blocks = BLOCK_SECTORS; 1761355810d1SShaohua Li 1762355810d1SShaohua Li return 0; 1763355810d1SShaohua Li } 1764355810d1SShaohua Li 17659ed988f5SSong Liu static void 17669ed988f5SSong Liu r5l_recovery_create_empty_meta_block(struct r5l_log *log, 17679ed988f5SSong Liu struct page *page, 17689ed988f5SSong Liu sector_t pos, u64 seq) 1769355810d1SShaohua Li { 1770355810d1SShaohua Li struct r5l_meta_block *mb; 1771355810d1SShaohua Li 1772355810d1SShaohua Li mb = page_address(page); 17739ed988f5SSong Liu clear_page(mb); 1774355810d1SShaohua Li mb->magic = cpu_to_le32(R5LOG_MAGIC); 1775355810d1SShaohua Li mb->version = R5LOG_VERSION; 1776355810d1SShaohua Li mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1777355810d1SShaohua Li mb->seq = cpu_to_le64(seq); 1778355810d1SShaohua Li mb->position = cpu_to_le64(pos); 1779355810d1SShaohua Li } 1780355810d1SShaohua Li 1781355810d1SShaohua Li static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1782355810d1SShaohua Li u64 seq) 1783355810d1SShaohua Li { 1784355810d1SShaohua Li struct page *page; 1785355810d1SShaohua Li struct r5l_meta_block *mb; 1786355810d1SShaohua Li 17879ed988f5SSong Liu page = alloc_page(GFP_KERNEL); 1788355810d1SShaohua Li if (!page) 1789355810d1SShaohua Li return -ENOMEM; 17909ed988f5SSong Liu r5l_recovery_create_empty_meta_block(log, page, 
pos, seq); 1791355810d1SShaohua Li mb = page_address(page); 17925c88f403SSong Liu mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 17935c88f403SSong Liu mb, PAGE_SIZE)); 1794796a5cf0SMike Christie if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 17955a8948f8SJan Kara REQ_SYNC | REQ_FUA, false)) { 1796355810d1SShaohua Li __free_page(page); 1797355810d1SShaohua Li return -EIO; 1798355810d1SShaohua Li } 1799355810d1SShaohua Li __free_page(page); 1800355810d1SShaohua Li return 0; 1801355810d1SShaohua Li } 1802355810d1SShaohua Li 1803b4c625c6SSong Liu /* 1804b4c625c6SSong Liu * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1805b4c625c6SSong Liu * to mark valid (potentially not flushed) data in the journal. 1806b4c625c6SSong Liu * 1807b4c625c6SSong Liu * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1808b4c625c6SSong Liu * so there should not be any mismatch here. 1809b4c625c6SSong Liu */ 1810b4c625c6SSong Liu static void r5l_recovery_load_data(struct r5l_log *log, 1811b4c625c6SSong Liu struct stripe_head *sh, 1812b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1813b4c625c6SSong Liu struct r5l_payload_data_parity *payload, 1814b4c625c6SSong Liu sector_t log_offset) 1815f6bed0efSShaohua Li { 1816b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1817b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1818b4c625c6SSong Liu int dd_idx; 1819355810d1SShaohua Li 1820b4c625c6SSong Liu raid5_compute_sector(conf, 1821b4c625c6SSong Liu le64_to_cpu(payload->location), 0, 1822b4c625c6SSong Liu &dd_idx, sh); 1823effe6ee7SSong Liu r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset); 1824b4c625c6SSong Liu sh->dev[dd_idx].log_checksum = 1825b4c625c6SSong Liu le32_to_cpu(payload->checksum[0]); 1826b4c625c6SSong Liu ctx->meta_total_blocks += BLOCK_SECTORS; 1827b4c625c6SSong Liu 1828b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 1829b4c625c6SSong Liu 
set_bit(STRIPE_R5C_CACHING, &sh->state); 1830b4c625c6SSong Liu } 1831b4c625c6SSong Liu 1832b4c625c6SSong Liu static void r5l_recovery_load_parity(struct r5l_log *log, 1833b4c625c6SSong Liu struct stripe_head *sh, 1834b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1835b4c625c6SSong Liu struct r5l_payload_data_parity *payload, 1836b4c625c6SSong Liu sector_t log_offset) 1837b4c625c6SSong Liu { 1838b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1839b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1840b4c625c6SSong Liu 1841b4c625c6SSong Liu ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 1842effe6ee7SSong Liu r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset); 1843b4c625c6SSong Liu sh->dev[sh->pd_idx].log_checksum = 1844b4c625c6SSong Liu le32_to_cpu(payload->checksum[0]); 1845b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 1846b4c625c6SSong Liu 1847b4c625c6SSong Liu if (sh->qd_idx >= 0) { 1848effe6ee7SSong Liu r5l_recovery_read_page( 1849effe6ee7SSong Liu log, ctx, sh->dev[sh->qd_idx].page, 1850effe6ee7SSong Liu r5l_ring_add(log, log_offset, BLOCK_SECTORS)); 1851b4c625c6SSong Liu sh->dev[sh->qd_idx].log_checksum = 1852b4c625c6SSong Liu le32_to_cpu(payload->checksum[1]); 1853b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 1854b4c625c6SSong Liu } 1855b4c625c6SSong Liu clear_bit(STRIPE_R5C_CACHING, &sh->state); 1856b4c625c6SSong Liu } 1857b4c625c6SSong Liu 1858b4c625c6SSong Liu static void r5l_recovery_reset_stripe(struct stripe_head *sh) 1859b4c625c6SSong Liu { 1860b4c625c6SSong Liu int i; 1861b4c625c6SSong Liu 1862b4c625c6SSong Liu sh->state = 0; 1863b4c625c6SSong Liu sh->log_start = MaxSector; 1864b4c625c6SSong Liu for (i = sh->disks; i--; ) 1865b4c625c6SSong Liu sh->dev[i].flags = 0; 1866b4c625c6SSong Liu } 1867b4c625c6SSong Liu 1868b4c625c6SSong Liu static void 1869b4c625c6SSong Liu r5l_recovery_replay_one_stripe(struct r5conf *conf, 1870b4c625c6SSong Liu struct stripe_head 
*sh, 1871b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1872b4c625c6SSong Liu { 1873b4c625c6SSong Liu struct md_rdev *rdev, *rrdev; 1874b4c625c6SSong Liu int disk_index; 1875b4c625c6SSong Liu int data_count = 0; 1876b4c625c6SSong Liu 1877b4c625c6SSong Liu for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1878b4c625c6SSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1879b4c625c6SSong Liu continue; 1880b4c625c6SSong Liu if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) 1881b4c625c6SSong Liu continue; 1882b4c625c6SSong Liu data_count++; 1883b4c625c6SSong Liu } 1884b4c625c6SSong Liu 1885b4c625c6SSong Liu /* 1886b4c625c6SSong Liu * stripes that only have parity must have been flushed 1887b4c625c6SSong Liu * before the crash that we are now recovering from, so 1888b4c625c6SSong Liu * there is nothing more to recovery. 1889b4c625c6SSong Liu */ 1890b4c625c6SSong Liu if (data_count == 0) 1891b4c625c6SSong Liu goto out; 1892b4c625c6SSong Liu 1893b4c625c6SSong Liu for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1894b4c625c6SSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1895b4c625c6SSong Liu continue; 1896b4c625c6SSong Liu 1897b4c625c6SSong Liu /* in case device is broken */ 1898b4c625c6SSong Liu rcu_read_lock(); 1899b4c625c6SSong Liu rdev = rcu_dereference(conf->disks[disk_index].rdev); 1900b4c625c6SSong Liu if (rdev) { 1901b4c625c6SSong Liu atomic_inc(&rdev->nr_pending); 1902b4c625c6SSong Liu rcu_read_unlock(); 1903b4c625c6SSong Liu sync_page_io(rdev, sh->sector, PAGE_SIZE, 1904b4c625c6SSong Liu sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1905b4c625c6SSong Liu false); 1906b4c625c6SSong Liu rdev_dec_pending(rdev, rdev->mddev); 1907b4c625c6SSong Liu rcu_read_lock(); 1908b4c625c6SSong Liu } 1909b4c625c6SSong Liu rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1910b4c625c6SSong Liu if (rrdev) { 1911b4c625c6SSong Liu atomic_inc(&rrdev->nr_pending); 1912b4c625c6SSong Liu rcu_read_unlock(); 
			/* mirror the replayed block to the replacement device as well */
			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rrdev, rrdev->mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	ctx->data_parity_stripes++;
out:
	r5l_recovery_reset_stripe(sh);
}

/*
 * Get a stripe at @stripe_sect for recovery and reset its recovery state.
 * Returns NULL when no stripe can be obtained; with @noblock set the
 * caller is expected to free up stripes (e.g. by replaying) and retry.
 */
static struct stripe_head *
r5c_recovery_alloc_stripe(
		struct r5conf *conf,
		sector_t stripe_sect,
		int noblock)
{
	struct stripe_head *sh;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
	if (!sh)
		return NULL;  /* no more stripe available */

	r5l_recovery_reset_stripe(sh);

	return sh;
}

/* Find the cached stripe at sector @sect on @list; NULL if not present. */
static struct stripe_head *
r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
{
	struct stripe_head *sh;

	list_for_each_entry(sh, list, lru)
		if (sh->sector == sect)
			return sh;
	return NULL;
}

/*
 * Drop every stripe on @cached_stripe_list without replaying it
 * (used on the -ENOMEM error path of the recovery scan).
 */
static void
r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
			  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
		r5l_recovery_reset_stripe(sh);
		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
	}
}

/*
 * Replay and release each data-parity stripe (STRIPE_R5C_CACHING clear)
 * on @cached_stripe_list; data-only stripes stay on the list.
 */
static void
r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
			    struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
		if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
			r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
			list_del_init(&sh->lru);
			raid5_release_stripe(sh);
		}
}

/*
 * Read one page at @log_offset and compare its crc32c (seeded with the
 * log's uuid checksum) against @log_checksum.
 * If they match return 0; otherwise return -EINVAL.
 */
static int
r5l_recovery_verify_data_checksum(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx,
				  struct page *page,
				  sector_t log_offset, __le32 log_checksum)
{
	void *addr;
	u32 checksum;

	r5l_recovery_read_page(log, ctx, page, log_offset);
	addr = kmap_atomic(page);
	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
	kunmap_atomic(addr);
	return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
}

/*
 * before loading data to stripe cache, we need to verify checksum for all
 * data; if there is a mismatch for any data page, we drop all data in the
 * meta block
 */
static int
r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb = page_address(ctx->meta_page);
	sector_t mb_offset = sizeof(struct r5l_meta_block);
	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
	struct page *page;
	struct r5l_payload_data_parity *payload;
	struct r5l_payload_flush *payload_flush;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* walk every payload in this meta block, verifying as we go */
	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		payload = (void *)mb + mb_offset;
		payload_flush = (void *)mb + mb_offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			if (r5l_recovery_verify_data_checksum(
				    log, ctx, page, log_offset,
				    payload->checksum[0]) < 0)
				goto mismatch;
		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
			if (r5l_recovery_verify_data_checksum(
				    log, ctx, page, log_offset,
				    payload->checksum[0]) < 0)
				goto mismatch;
			if (conf->max_degraded == 2 && /* q for RAID 6 */
			    r5l_recovery_verify_data_checksum(
				    log, ctx, page,
				    r5l_ring_add(log, log_offset,
						 BLOCK_SECTORS),
				    payload->checksum[1]) < 0)
				goto mismatch;
		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
			/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
		} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
			goto mismatch;

		/* advance past this payload; FLUSH payloads carry no log pages */
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
			mb_offset += sizeof(struct r5l_payload_flush) +
				le32_to_cpu(payload_flush->size);
		} else {
			/* DATA or PARITY payload */
			log_offset = r5l_ring_add(log, log_offset,
						  le32_to_cpu(payload->size));
			mb_offset += sizeof(struct r5l_payload_data_parity) +
				sizeof(__le32) *
				(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		}

	}

	put_page(page);
	return 0;

mismatch:
	put_page(page);
	return -EINVAL;
}

/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for unknown payload type
 * -EAGAIN for checksum mismatch of data page
 * -ENOMEM for run out of memory (alloc_page failed or run out of stripes)
 */
static int
r5c_recovery_analyze_meta_block(struct r5l_log *log,
				struct r5l_recovery_ctx *ctx,
				struct list_head *cached_stripe_list)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb;
	struct r5l_payload_data_parity *payload;
	struct r5l_payload_flush *payload_flush;
	int mb_offset;
	sector_t log_offset;
	sector_t stripe_sect;
	struct stripe_head *sh;
	int ret;

	/*
	 * for mismatch in data blocks, we will drop all data in this mb, but
	 * we will still read next mb for other data with FLUSH flag, as
	 * io_unit could finish out of order.
	 */
	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
	if (ret == -EINVAL)
		return -EAGAIN;
	else if (ret)
		return ret;   /* -ENOMEM due to alloc_page() failure */

	mb = page_address(ctx->meta_page);
	mb_offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + mb_offset;
		payload_flush = (void *)mb + mb_offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
			int i, count;

			/*
			 * A FLUSH payload lists stripe sectors whose cached
			 * state can be forgotten; drop any matching stripe
			 * from the cached list.
			 */
			count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
			for (i = 0; i < count; ++i) {
				stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
				sh = r5c_recovery_lookup_stripe(cached_stripe_list,
								stripe_sect);
				if (sh) {
					WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
					r5l_recovery_reset_stripe(sh);
					list_del_init(&sh->lru);
					raid5_release_stripe(sh);
				}
			}

			mb_offset += sizeof(struct r5l_payload_flush) +
				le32_to_cpu(payload_flush->size);
			continue;
		}

		/* DATA or PARITY payload */
		stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
			raid5_compute_sector(
				conf, le64_to_cpu(payload->location), 0, &dd,
				NULL)
			: le64_to_cpu(payload->location);

		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
						stripe_sect);

		if (!sh) {
			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
			/*
			 * cannot get stripe from raid5_get_active_stripe
			 * try replay some stripes
			 */
			if (!sh) {
				r5c_recovery_replay_stripes(
					cached_stripe_list, ctx);
				sh = r5c_recovery_alloc_stripe(
					conf, stripe_sect, 1);
			}
			if (!sh) {
				/* last resort: grow the stripe cache, then block */
				int new_size = conf->min_nr_stripes * 2;
				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
					mdname(mddev),
					new_size);
				ret = raid5_set_cache_size(mddev, new_size);
				if (conf->min_nr_stripes <= new_size / 2) {
					pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
						mdname(mddev),
						ret,
						new_size,
						conf->min_nr_stripes,
						conf->max_nr_stripes);
					return -ENOMEM;
				}
				sh = r5c_recovery_alloc_stripe(
					conf, stripe_sect, 0);
			}
			if (!sh) {
				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
					mdname(mddev));
				return -ENOMEM;
			}
			list_add_tail(&sh->lru, cached_stripe_list);
		}

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			/*
			 * Data seen after this stripe's parity: the earlier
			 * content was already flushed to the array, so replay
			 * it now and start caching afresh.
			 */
			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
				r5l_recovery_replay_one_stripe(conf, sh, ctx);
				list_move_tail(&sh->lru, cached_stripe_list);
			}
			r5l_recovery_load_data(log, sh, ctx, payload,
					       log_offset);
		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			r5l_recovery_load_parity(log, sh, ctx, payload,
						 log_offset);
		else
			return -EINVAL;

		log_offset = r5l_ring_add(log, log_offset,
					  le32_to_cpu(payload->size));

		mb_offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
	}

	return 0;
}

/*
 * Load the stripe into cache. The stripe will be written out later by
 * the stripe cache state machine.
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log,
					 struct stripe_head *sh)
{
	struct r5dev *dev;
	int i;

	for (i = sh->disks; i--; ) {
		dev = sh->dev + i;
		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
			/* journal now holds the latest copy of this block */
			set_bit(R5_InJournal, &dev->flags);
			set_bit(R5_UPTODATE, &dev->flags);
		}
	}
}

/*
 * Scan through the log for all to-be-flushed data
 *
 * For stripes with data and parity, namely Data-Parity stripe
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripe
 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as these data are already flushed to
 * the array.
 *
 * At the end of the scan, we return the new journal_tail, which points to
 * first data-only stripe on the journal device, or next invalid meta block.
 */
static int r5c_recovery_flush_log(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh;
	int ret = 0;

	/* scan through the log */
	while (1) {
		if (r5l_recovery_read_meta_block(log, ctx))
			break;

		ret = r5c_recovery_analyze_meta_block(log, ctx,
						      &ctx->cached_list);
		/*
		 * -EAGAIN means mismatch in data block, in this case, we still
		 * try scan the next metablock
		 */
		if (ret && ret != -EAGAIN)
			break;	/* ret == -EINVAL or -ENOMEM */
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}

	if (ret == -ENOMEM) {
		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
		return ret;
	}

	/* replay data-parity stripes */
	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);

	/* load data-only stripes to stripe cache */
	list_for_each_entry(sh, &ctx->cached_list, lru) {
		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5c_recovery_load_one_stripe(log, sh);
		ctx->data_only_stripes++;
	}

	return 0;
}

/*
 * we did a recovery. Now ctx.pos points to an invalid meta block.
 * New log will start here. but we can't let superblock point to last valid
 * meta block. The log might look like:
 * | meta 1| meta 2| meta 3|
 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
 * superblock points to meta 1, we write a new valid meta 2n.  if crash
 * happens again, new recovery will start from meta 1. Since meta 2n is
 * valid now, recovery will think meta 3 is valid, which is wrong.
 * The solution is we create a new meta in meta2 with its seq == meta
 * 1's seq + 10000 and let superblock points to meta2. The same recovery
 * will not think meta 3 is a valid meta, because its seq doesn't match
 */

/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *          ^                            ^
 *          |- log->last_checkpoint      |- ctx->pos
 *          |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase seq number by 10000 to avoid
 * confusing next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *          ^                            ^
 *          |- log->last_checkpoint      |- ctx->pos+1
 *          |- log->last_cp_seq          |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because data only
 * parities are not yet secured in RAID. To save these data only parities, we
 * rewrite them from seq+11.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *          ^                                          ^
 *          |- log->last_checkpoint                    |- ctx->pos+n
 *          |- log->last_cp_seq                        |- ctx->seq+10000+n
 *
 * If failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
 *
 * Once data only stripes are rewritten to journal, we move log_tail
 *
 *   -----------------------------------------------------------------
 *   |     old log        |    data only stripes    | invalid log  |
 *   -----------------------------------------------------------------
 *                        ^                          ^
 *                        |- log->last_checkpoint    |- ctx->pos+n
 *                        |- log->last_cp_seq        |- ctx->seq+10000+n
 *
 * Then we can safely start the state machine. If failure happens from this
 * point on, the recovery will start from new log->last_checkpoint.
 */
static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh;
	struct mddev *mddev = log->rdev->mddev;
	struct page *page;
	sector_t next_checkpoint = MaxSector;

	/* scratch page used to build each stripe's new meta block */
	page = alloc_page(GFP_KERNEL);
	if (!page) {
		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
		       mdname(mddev));
		return -ENOMEM;
	}

	WARN_ON(list_empty(&ctx->cached_list));

	list_for_each_entry(sh, &ctx->cached_list, lru) {
		struct r5l_meta_block *mb;
		int i;
		int offset;
		sector_t write_pos;

		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5l_recovery_create_empty_meta_block(log, page,
						     ctx->pos, ctx->seq);
		mb = page_address(page);
		offset = le32_to_cpu(mb->meta_size);
		write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

		/* append one DATA payload per in-journal block of the stripe */
		for (i = sh->disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			struct r5l_payload_data_parity *payload;
			void *addr;

			if (test_bit(R5_InJournal, &dev->flags)) {
				payload = (void *)mb + offset;
				payload->header.type = cpu_to_le16(
					R5LOG_PAYLOAD_DATA);
				payload->size = cpu_to_le32(BLOCK_SECTORS);
				payload->location = cpu_to_le64(
					raid5_compute_blocknr(sh, i, 0));
				addr = kmap_atomic(dev->page);
				payload->checksum[0] = cpu_to_le32(
					crc32c_le(log->uuid_checksum, addr,
						  PAGE_SIZE));
				kunmap_atomic(addr);
				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
					     dev->page, REQ_OP_WRITE, 0, false);
				write_pos = r5l_ring_add(log, write_pos,
							 BLOCK_SECTORS);
				offset += sizeof(__le32) +
					sizeof(struct r5l_payload_data_parity);

			}
		}
		mb->meta_size = cpu_to_le32(offset);
		mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
						     mb, PAGE_SIZE));
		/* the meta block is written last, with FUA, after its data */
		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
			     REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
		sh->log_start = ctx->pos;
		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
		atomic_inc(&log->stripe_in_journal_count);
		ctx->pos = write_pos;
		ctx->seq += 1;
		next_checkpoint = sh->log_start;
	}
	log->next_checkpoint = next_checkpoint;
	__free_page(page);
	return 0;
}

/*
 * Write the cached data-only stripes out to the RAID disks by running the
 * stripe state machine in write-back mode, then switch back to
 * write-through once all active stripes have drained.
 */
static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
						 struct r5l_recovery_ctx *ctx)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct stripe_head *sh, *next;
	bool cleared_pending =
		false;

	if (ctx->data_only_stripes == 0)
		return;

	/*
	 * Temporarily clear MD_SB_CHANGE_PENDING and restore it afterwards;
	 * NOTE(review): presumably so the stripe writes below are not
	 * stalled behind a pending superblock update — confirm.
	 */
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		cleared_pending = true;
		clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
	}
	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;

	/* hand every cached stripe to the state machine for write-out */
	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
		r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
	}

	/* reuse conf->wait_for_quiescent in recovery */
	wait_event(conf->wait_for_quiescent,
		   atomic_read(&conf->active_stripes) == 0);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	if (cleared_pending)
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
}

/*
 * Run journal recovery: scan the log from the last checkpoint, replay
 * data-parity stripes, rewrite data-only stripes behind a new checkpoint,
 * update the superblock, and finally flush the data-only stripes to the
 * array. Returns 0 on success or a negative errno.
 */
static int r5l_recovery_log(struct r5l_log *log)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5l_recovery_ctx *ctx;
	int ret;
	sector_t pos;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->pos = log->last_checkpoint;
	ctx->seq = log->last_cp_seq;
	INIT_LIST_HEAD(&ctx->cached_list);
	ctx->meta_page = alloc_page(GFP_KERNEL);

	if (!ctx->meta_page) {
		ret = -ENOMEM;
		goto meta_page;
	}

	if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
		ret = -ENOMEM;
		goto ra_pool;
	}

	ret = r5c_recovery_flush_log(log, ctx);

	if (ret)
		goto error;

	pos = ctx->pos;
	/* jump well past any seq the old log could contain (see comment above) */
	ctx->seq += 10000;

	if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
		pr_info("md/raid:%s: starting from clean shutdown\n",
			 mdname(mddev));
	else
		pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
			 mdname(mddev), ctx->data_only_stripes,
			 ctx->data_parity_stripes);

	if (ctx->data_only_stripes == 0) {
		log->next_checkpoint = ctx->pos;
		r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
		ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
	} else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
		       mdname(mddev));
		ret = -EIO;
		goto error;
	}

	log->log_start = ctx->pos;
	log->seq = ctx->seq;
	log->last_checkpoint = pos;
	r5l_write_super(log, pos);

	r5c_recovery_flush_data_only_stripes(log, ctx);
	ret = 0;
error:
	r5l_recovery_free_ra_pool(log, ctx);
ra_pool:
	__free_page(ctx->meta_page);
meta_page:
	kfree(ctx);
	return ret;
}

/* Point the superblock's journal_tail at @cp and mark the sb for rewrite. */
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
}

/* sysfs: show both journal modes, with the active one in brackets. */
static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
{
	struct r5conf *conf;
	int ret;

	ret = mddev_lock(mddev);
	if (ret)
		return ret;

	conf = mddev->private;
	if (!conf || !conf->log) {
		mddev_unlock(mddev);
		return 0;
	}

	switch (conf->log->r5c_journal_mode) {
	case R5C_JOURNAL_MODE_WRITE_THROUGH:
		ret = snprintf(
			page, PAGE_SIZE, "[%s] %s\n",
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
		break;
	case R5C_JOURNAL_MODE_WRITE_BACK:
		ret = snprintf(
			page, PAGE_SIZE, "%s [%s]\n",
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
		break;
	default:
		ret = 0;
	}
	mddev_unlock(mddev);
	return ret;
}

/*
 * Set journal cache mode on @mddev (external API initially needed by dm-raid).
 *
 * @mode as defined in 'enum r5c_journal_mode'.
 *
 */
int r5c_journal_mode_set(struct mddev *mddev, int mode)
{
	struct r5conf *conf;

	if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
	    mode > R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	conf = mddev->private;
	if (!conf || !conf->log)
		return -ENODEV;

	/* write-back is not allowed while the array is degraded */
	if (raid5_calc_degraded(conf) > 0 &&
	    mode == R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	/* quiesce the array around the mode switch */
	mddev_suspend(mddev);
	conf->log->r5c_journal_mode = mode;
	mddev_resume(mddev);

	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
		 mdname(mddev), mode, r5c_journal_mode_str[mode]);
	return 0;
}
EXPORT_SYMBOL(r5c_journal_mode_set);

/*
 * sysfs: parse a journal mode name (optionally newline-terminated) and
 * apply it via r5c_journal_mode_set(). An unrecognized name leaves mode
 * at -1, which r5c_journal_mode_set() rejects with -EINVAL.
 */
static ssize_t r5c_journal_mode_store(struct mddev *mddev,
				      const char *page, size_t length)
{
	int mode = ARRAY_SIZE(r5c_journal_mode_str);
	size_t len = length;
	int ret;

	if (len < 2)
		return -EINVAL;

	if (page[len - 1] == '\n')
		len--;

	while (mode--)
		if (strlen(r5c_journal_mode_str[mode]) == len &&
		    !strncmp(page, r5c_journal_mode_str[mode], len))
			break;
	ret = mddev_lock(mddev);
	if (ret)
		return ret;
	ret = r5c_journal_mode_set(mddev, mode);
	mddev_unlock(mddev);
	return ret ?: length;
}

struct md_sysfs_entry
r5c_journal_mode = __ATTR(journal_mode, 0644,
			  r5c_journal_mode_show, r5c_journal_mode_store);

/*
 * Try to handle write operation in caching phase. This function should only
 * be called in write-back mode.
 *
 * If all outstanding writes can be handled in caching phase, returns 0.
 * If writes require write-out phase, call r5c_make_stripe_write_out()
 * and return -EAGAIN.
 */
int r5c_try_caching_write(struct r5conf *conf,
			  struct stripe_head *sh,
			  struct stripe_head_state *s,
			  int disks)
{
	struct r5l_log *log = conf->log;
	int i;
	struct r5dev *dev;
	int to_cache = 0;	/* number of devices whose write will be cached */
	void **pslot;
	sector_t tree_index;
	int ret;
	uintptr_t refcount;

	/* caller must only invoke this in write-back mode */
	BUG_ON(!r5c_is_writeback(log));

	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/*
		 * There are two different scenarios here:
		 *  1. The stripe has some data cached, and it is sent to
		 *     write-out phase for reclaim
		 *  2. The stripe is clean, and this is the first write
		 *
		 * For 1, return -EAGAIN, so we continue with
		 * handle_stripe_dirtying().
		 *
		 * For 2, set STRIPE_R5C_CACHING and continue with caching
		 * write.
		 */

		/* case 1: anything injournal or anything in written */
		if (s->injournal > 0 || s->written > 0)
			return -EAGAIN;
		/* case 2 */
		set_bit(STRIPE_R5C_CACHING, &sh->state);
	}

	/*
	 * When run in degraded mode, array is set to write-through mode.
	 * This check helps drain pending write safely in the transition to
	 * write-through mode.
	 *
	 * When a stripe is syncing, the write is also handled in write
	 * through mode.
	 */
	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
		r5c_make_stripe_write_out(sh);
		return -EAGAIN;
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		/* if non-overwrite, use writing-out phase */
		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
		    !test_bit(R5_InJournal, &dev->flags)) {
			r5c_make_stripe_write_out(sh);
			return -EAGAIN;
		}
	}

	/* if the stripe is not counted in big_stripe_tree, add it now */
	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
	    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		tree_index = r5c_tree_index(conf, sh->sector);
		spin_lock(&log->tree_lock);
		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
					       tree_index);
		if (pslot) {
			/*
			 * Slot already exists: the refcount is stored in the
			 * slot value itself, above R5C_RADIX_COUNT_SHIFT;
			 * bump it by one.
			 */
			refcount = (uintptr_t)radix_tree_deref_slot_protected(
				pslot, &log->tree_lock) >>
				R5C_RADIX_COUNT_SHIFT;
			radix_tree_replace_slot(
				&log->big_stripe_tree, pslot,
				(void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
		} else {
			/*
			 * this radix_tree_insert can fail safely, so no
			 * need to call radix_tree_preload()
			 */
			ret = radix_tree_insert(
				&log->big_stripe_tree, tree_index,
				(void *)(1 << R5C_RADIX_COUNT_SHIFT));
			if (ret) {
				/* no memory for the tree node: fall back to
				 * write-out phase for this stripe */
				spin_unlock(&log->tree_lock);
				r5c_make_stripe_write_out(sh);
				return -EAGAIN;
			}
		}
		spin_unlock(&log->tree_lock);

		/*
		 * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
		 * counted in the radix tree
		 */
		set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
		atomic_inc(&conf->r5c_cached_partial_stripes);
	}

	/* mark every device with pending writes for draining into the cache */
	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->towrite) {
			set_bit(R5_Wantwrite, &dev->flags);
			set_bit(R5_Wantdrain, &dev->flags);
			set_bit(R5_LOCKED, &dev->flags);
			to_cache++;
		}
	}

	if (to_cache) {
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		/*
		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
		 * r5c_handle_data_cached()
		 */
		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	}

	return 0;
}

/*
 * free extra pages (orig_page) we allocated for prexor
 *
 * The extra pages either came from conf->disks[].extra_page (shared, at most
 * one stripe may use them at a time, guarded by R5C_EXTRA_PAGE_IN_USE) or
 * were allocated per-stripe and must be released here.
 */
void r5c_release_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	bool using_disk_info_extra_page;

	/* dev[0] is representative: all devs use the same page source */
	using_disk_info_extra_page =
		sh->dev[0].orig_page == conf->disks[0].extra_page;

	for (i = sh->disks; i--; )
		if (sh->dev[i].page != sh->dev[i].orig_page) {
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

			/* shared extra pages are not refcounted per stripe */
			if (!using_disk_info_extra_page)
				put_page(p);
		}

	if (using_disk_info_extra_page) {
		/* hand the shared extra pages back and kick waiters */
		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
		md_wakeup_thread(conf->mddev->thread);
	}
}

/*
 * Switch every device of @sh to the shared per-disk extra_page as its
 * orig_page (dropping any previously allocated extra page first).
 */
void r5c_use_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	struct r5dev *dev;

	for (i = sh->disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->orig_page != dev->page)
			put_page(dev->orig_page);
		dev->orig_page = conf->disks[i].extra_page;
	}
}

/*
 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe is committed to RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	struct r5l_log *log = conf->log;
	int i;
	int do_wakeup = 0;
	sector_t tree_index;
	void **pslot;
	uintptr_t refcount;

	/* nothing to do if there is no journal or the parity was not logged */
	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	/* the rest of the bookkeeping only applies to write-back mode */
	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

	/*
	 * analyse_stripe() runs before r5c_finish_stripe_write_out(),
	 * We updated R5_InJournal, so we also update s->injournal.
	 */
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);

	/* remove the stripe from the in-journal list */
	spin_lock_irq(&log->stripe_in_journal_lock);
	list_del_init(&sh->r5c);
	spin_unlock_irq(&log->stripe_in_journal_lock);
	sh->log_start = MaxSector;

	atomic_dec(&log->stripe_in_journal_count);
	r5c_update_log_state(log);

	/* stop counting this stripe in big_stripe_tree */
	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
	    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		tree_index = r5c_tree_index(conf, sh->sector);
		spin_lock(&log->tree_lock);
		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
					       tree_index);
		BUG_ON(pslot == NULL);
		/* refcount lives in the slot value, above the shift */
		refcount = (uintptr_t)radix_tree_deref_slot_protected(
			pslot, &log->tree_lock) >>
			R5C_RADIX_COUNT_SHIFT;
		if (refcount == 1)
			radix_tree_delete(&log->big_stripe_tree, tree_index);
		else
			radix_tree_replace_slot(
				&log->big_stripe_tree, pslot,
				(void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
		spin_unlock(&log->tree_lock);
	}

	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
		atomic_dec(&conf->r5c_flushing_partial_stripes);
		atomic_dec(&conf->r5c_cached_partial_stripes);
	}

	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
		atomic_dec(&conf->r5c_flushing_full_stripes);
		atomic_dec(&conf->r5c_cached_full_stripes);
	}

	r5l_append_flush_payload(log, sh->sector);
	/* stripe is flushed to raid disks, we can do resync now */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
		set_bit(STRIPE_HANDLE, &sh->state);
}

/*
 * Log the data pages of @sh (caching phase). Computes per-page checksums,
 * reserves journal space and hands the stripe to r5l_log_stripe(); stripes
 * that cannot proceed are queued on no_space/no_mem lists instead.
 * Always returns 0 -- failures are handled via the retry lists.
 */
int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int pages = 0;
	int reserve;
	int i;
	int ret = 0;

	BUG_ON(!log);

	/* checksum every page that is about to be written to the journal */
	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);

	/*
	 * The stripe must enter state machine again to call endio, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector)
		r5l_add_no_space_stripe(log, sh);
	else if (!r5l_has_free_space(log, reserve)) {
		/* a stripe sitting at the checkpoint must never be queued
		 * for space: that would deadlock reclaim */
		if (sh->log_start == log->last_checkpoint)
			BUG();
		else
			r5l_add_no_space_stripe(log, sh);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			/* out of io_units: park the stripe for retry */
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}

/* check whether this big stripe is in write back cache.
 * Caller must hold the RCU read lock; the lookup is lock-free.
 */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
{
	struct r5l_log *log = conf->log;
	sector_t tree_index;
	void *slot;

	if (!log)
		return false;

	WARN_ON_ONCE(!rcu_read_lock_held());
	tree_index = r5c_tree_index(conf, sect);
	slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
	return slot != NULL;
}

/*
 * Read the meta block at the recorded journal tail and either resume the
 * log from it (running recovery) or, if it is missing/corrupt, write a
 * fresh empty meta block and start the log from scratch.
 */
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret = 0;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	/* any validation failure means the journal must be re-created */
	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	/* checksum is computed with the checksum field zeroed */
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure super points to correct address. Log might have
		 * data very soon. If super hasn't correct log tail address,
		 * recovery can't find the log
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	/* reclaim threshold: 1/4 of device size, capped at 10G (in sectors) */
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	if (create_super) {
		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
		log->seq = log->last_cp_seq + 1;
		log->next_checkpoint = cp;
	} else
		ret = r5l_recovery_log(log);

	r5c_update_log_state(log);
	return ret;
ioerr:
	__free_page(page);
	return ret;
}

/*
 * Start the journal: load/recover the log. On failure the whole log is
 * torn down via r5l_exit_log(). A NULL @log (no journal) is a no-op success.
 */
int r5l_start(struct r5l_log *log)
{
	int ret;

	if (!log)
		return 0;

	ret = r5l_load_log(log);
	if (ret) {
		struct mddev *mddev = log->rdev->mddev;
		struct r5conf *conf = mddev->private;

		r5l_exit_log(conf);
	}
	return ret;
}

/*
 * Called when @rdev fails: if the array became degraded (or the journal
 * device itself failed) while in write-back mode, schedule the async
 * switch back to write-through.
 */
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;

	if ((raid5_calc_degraded(conf) > 0 ||
	     test_bit(Journal, &rdev->flags)) &&
	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
		schedule_work(&log->disable_writeback_work);
}

/*
 * Allocate and initialize the r5l_log for journal device @rdev, then
 * publish it via RCU on conf->log. Resources are released in reverse
 * order through the goto-cleanup chain on failure.
 */
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;
	char b[BDEVNAME_SIZE];
	int ret;

	pr_debug("md/raid:%s: using device %s as journal\n",
		 mdname(conf->mddev), bdevname(rdev->bdev, b));

	/* the journal format assumes 4k blocks == PAGE_SIZE */
	if (PAGE_SIZE != 4096)
		return -EINVAL;

	/*
	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
	 * raid_disks r5l_payload_data_parity.
	 *
	 * Write journal and cache does not work for very big array
	 * (raid_disks > 203)
	 */
	if (sizeof(struct r5l_meta_block) +
	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
	     conf->raid_disks) > PAGE_SIZE) {
		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
		       mdname(conf->mddev), conf->raid_disks);
		return -EINVAL;
	}

	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	/* only issue explicit flushes if the device has a volatile cache */
	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio, NULL, 0);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
	if (ret)
		goto io_pool;

	ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
	if (ret)
		goto io_bs;

	ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
	if (ret)
		goto out_mempool;

	spin_lock_init(&log->tree_lock);
	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;

	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
	spin_lock_init(&log->stripe_in_journal_lock);
	atomic_set(&log->stripe_in_journal_count, 0);

	rcu_assign_pointer(conf->log, log);

	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

reclaim_thread:
	mempool_exit(&log->meta_pool);
out_mempool:
	bioset_exit(&log->bs);
io_bs:
	mempool_exit(&log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	/*
	 * NOTE(review): the captured ret (-ENOMEM from the pool inits) is
	 * discarded here and every failure reports -EINVAL -- callers only
	 * check for non-zero, but returning ret would be more informative;
	 * confirm before changing.
	 */
	return -EINVAL;
}

/*
 * Tear down the journal: unpublish conf->log (RCU), wait for readers,
 * flush/stop the async workers, then free resources in reverse init order.
 */
void r5l_exit_log(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;

	conf->log = NULL;
	synchronize_rcu();

	/* Ensure disable_writeback_work wakes up and exits */
	wake_up(&conf->mddev->sb_wait);
	flush_work(&log->disable_writeback_work);
	md_unregister_thread(&log->reclaim_thread);
	mempool_exit(&log->meta_pool);
	bioset_exit(&log->bs);
	mempool_exit(&log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}