1f6bed0efSShaohua Li /* 2f6bed0efSShaohua Li * Copyright (C) 2015 Shaohua Li <shli@fb.com> 3b4c625c6SSong Liu * Copyright (C) 2016 Song Liu <songliubraving@fb.com> 4f6bed0efSShaohua Li * 5f6bed0efSShaohua Li * This program is free software; you can redistribute it and/or modify it 6f6bed0efSShaohua Li * under the terms and conditions of the GNU General Public License, 7f6bed0efSShaohua Li * version 2, as published by the Free Software Foundation. 8f6bed0efSShaohua Li * 9f6bed0efSShaohua Li * This program is distributed in the hope it will be useful, but WITHOUT 10f6bed0efSShaohua Li * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11f6bed0efSShaohua Li * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12f6bed0efSShaohua Li * more details. 13f6bed0efSShaohua Li * 14f6bed0efSShaohua Li */ 15f6bed0efSShaohua Li #include <linux/kernel.h> 16f6bed0efSShaohua Li #include <linux/wait.h> 17f6bed0efSShaohua Li #include <linux/blkdev.h> 18f6bed0efSShaohua Li #include <linux/slab.h> 19f6bed0efSShaohua Li #include <linux/raid/md_p.h> 205cb2fbd6SShaohua Li #include <linux/crc32c.h> 21f6bed0efSShaohua Li #include <linux/random.h> 22ce1ccd07SShaohua Li #include <linux/kthread.h> 2303b047f4SSong Liu #include <linux/types.h> 24f6bed0efSShaohua Li #include "md.h" 25f6bed0efSShaohua Li #include "raid5.h" 261e6d690bSSong Liu #include "bitmap.h" 27f6bed0efSShaohua Li 28f6bed0efSShaohua Li /* 29f6bed0efSShaohua Li * metadata/data stored in disk with 4k size unit (a block) regardless 30f6bed0efSShaohua Li * underneath hardware sector size. only works with PAGE_SIZE == 4096 31f6bed0efSShaohua Li */ 32f6bed0efSShaohua Li #define BLOCK_SECTORS (8) 33f6bed0efSShaohua Li 340576b1c6SShaohua Li /* 35a39f7afdSSong Liu * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 36a39f7afdSSong Liu * 37a39f7afdSSong Liu * In write through mode, the reclaim runs every log->max_free_space. 
38a39f7afdSSong Liu * This can prevent the recovery scans for too long 390576b1c6SShaohua Li */ 400576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 410576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 420576b1c6SShaohua Li 43a39f7afdSSong Liu /* wake up reclaim thread periodically */ 44a39f7afdSSong Liu #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) 45a39f7afdSSong Liu /* start flush with these full stripes */ 46a39f7afdSSong Liu #define R5C_FULL_STRIPE_FLUSH_BATCH 256 47a39f7afdSSong Liu /* reclaim stripes in groups */ 48a39f7afdSSong Liu #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) 49a39f7afdSSong Liu 50c38d29b3SChristoph Hellwig /* 51c38d29b3SChristoph Hellwig * We only need 2 bios per I/O unit to make progress, but ensure we 52c38d29b3SChristoph Hellwig * have a few more available to not get too tight. 53c38d29b3SChristoph Hellwig */ 54c38d29b3SChristoph Hellwig #define R5L_POOL_SIZE 4 55c38d29b3SChristoph Hellwig 562ded3703SSong Liu /* 572ded3703SSong Liu * r5c journal modes of the array: write-back or write-through. 582ded3703SSong Liu * write-through mode has identical behavior as existing log only 592ded3703SSong Liu * implementation. 
602ded3703SSong Liu */ 612ded3703SSong Liu enum r5c_journal_mode { 622ded3703SSong Liu R5C_JOURNAL_MODE_WRITE_THROUGH = 0, 632ded3703SSong Liu R5C_JOURNAL_MODE_WRITE_BACK = 1, 642ded3703SSong Liu }; 652ded3703SSong Liu 662c7da14bSSong Liu static char *r5c_journal_mode_str[] = {"write-through", 672c7da14bSSong Liu "write-back"}; 682ded3703SSong Liu /* 692ded3703SSong Liu * raid5 cache state machine 702ded3703SSong Liu * 719b69173eSJackieLiu * With the RAID cache, each stripe works in two phases: 722ded3703SSong Liu * - caching phase 732ded3703SSong Liu * - writing-out phase 742ded3703SSong Liu * 752ded3703SSong Liu * These two phases are controlled by bit STRIPE_R5C_CACHING: 762ded3703SSong Liu * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase 772ded3703SSong Liu * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase 782ded3703SSong Liu * 792ded3703SSong Liu * When there is no journal, or the journal is in write-through mode, 802ded3703SSong Liu * the stripe is always in writing-out phase. 812ded3703SSong Liu * 822ded3703SSong Liu * For write-back journal, the stripe is sent to caching phase on write 832ded3703SSong Liu * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off 842ded3703SSong Liu * the write-out phase by clearing STRIPE_R5C_CACHING. 852ded3703SSong Liu * 862ded3703SSong Liu * Stripes in caching phase do not write the raid disks. Instead, all 872ded3703SSong Liu * writes are committed from the log device. 
Therefore, a stripe in 882ded3703SSong Liu * caching phase handles writes as: 892ded3703SSong Liu * - write to log device 902ded3703SSong Liu * - return IO 912ded3703SSong Liu * 922ded3703SSong Liu * Stripes in writing-out phase handle writes as: 932ded3703SSong Liu * - calculate parity 942ded3703SSong Liu * - write pending data and parity to journal 952ded3703SSong Liu * - write data and parity to raid disks 962ded3703SSong Liu * - return IO for pending writes 972ded3703SSong Liu */ 982ded3703SSong Liu 99f6bed0efSShaohua Li struct r5l_log { 100f6bed0efSShaohua Li struct md_rdev *rdev; 101f6bed0efSShaohua Li 102f6bed0efSShaohua Li u32 uuid_checksum; 103f6bed0efSShaohua Li 104f6bed0efSShaohua Li sector_t device_size; /* log device size, round to 105f6bed0efSShaohua Li * BLOCK_SECTORS */ 1060576b1c6SShaohua Li sector_t max_free_space; /* reclaim run if free space is at 1070576b1c6SShaohua Li * this size */ 108f6bed0efSShaohua Li 109f6bed0efSShaohua Li sector_t last_checkpoint; /* log tail. where recovery scan 110f6bed0efSShaohua Li * starts from */ 111f6bed0efSShaohua Li u64 last_cp_seq; /* log tail sequence */ 112f6bed0efSShaohua Li 113f6bed0efSShaohua Li sector_t log_start; /* log head. 
where new data appends */ 114f6bed0efSShaohua Li u64 seq; /* log head sequence */ 115f6bed0efSShaohua Li 11617036461SChristoph Hellwig sector_t next_checkpoint; 11717036461SChristoph Hellwig 118f6bed0efSShaohua Li struct mutex io_mutex; 119f6bed0efSShaohua Li struct r5l_io_unit *current_io; /* current io_unit accepting new data */ 120f6bed0efSShaohua Li 121f6bed0efSShaohua Li spinlock_t io_list_lock; 122f6bed0efSShaohua Li struct list_head running_ios; /* io_units which are still running, 123f6bed0efSShaohua Li * and have not yet been completely 124f6bed0efSShaohua Li * written to the log */ 125f6bed0efSShaohua Li struct list_head io_end_ios; /* io_units which have been completely 126f6bed0efSShaohua Li * written to the log but not yet written 127f6bed0efSShaohua Li * to the RAID */ 128a8c34f91SShaohua Li struct list_head flushing_ios; /* io_units which are waiting for log 129a8c34f91SShaohua Li * cache flush */ 13004732f74SChristoph Hellwig struct list_head finished_ios; /* io_units which settle down in log disk */ 131a8c34f91SShaohua Li struct bio flush_bio; 132f6bed0efSShaohua Li 1335036c390SChristoph Hellwig struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ 1345036c390SChristoph Hellwig 135f6bed0efSShaohua Li struct kmem_cache *io_kc; 1365036c390SChristoph Hellwig mempool_t *io_pool; 137c38d29b3SChristoph Hellwig struct bio_set *bs; 138e8deb638SChristoph Hellwig mempool_t *meta_pool; 139f6bed0efSShaohua Li 1400576b1c6SShaohua Li struct md_thread *reclaim_thread; 1410576b1c6SShaohua Li unsigned long reclaim_target; /* number of space that need to be 1420576b1c6SShaohua Li * reclaimed. 
if it's 0, reclaim spaces 1430576b1c6SShaohua Li * used by io_units which are in 1440576b1c6SShaohua Li * IO_UNIT_STRIPE_END state (eg, reclaim 1450576b1c6SShaohua Li * dones't wait for specific io_unit 1460576b1c6SShaohua Li * switching to IO_UNIT_STRIPE_END 1470576b1c6SShaohua Li * state) */ 1480fd22b45SShaohua Li wait_queue_head_t iounit_wait; 1490576b1c6SShaohua Li 150f6bed0efSShaohua Li struct list_head no_space_stripes; /* pending stripes, log has no space */ 151f6bed0efSShaohua Li spinlock_t no_space_stripes_lock; 15256fef7c6SChristoph Hellwig 15356fef7c6SChristoph Hellwig bool need_cache_flush; 1542ded3703SSong Liu 1552ded3703SSong Liu /* for r5c_cache */ 1562ded3703SSong Liu enum r5c_journal_mode r5c_journal_mode; 157a39f7afdSSong Liu 158a39f7afdSSong Liu /* all stripes in r5cache, in the order of seq at sh->log_start */ 159a39f7afdSSong Liu struct list_head stripe_in_journal_list; 160a39f7afdSSong Liu 161a39f7afdSSong Liu spinlock_t stripe_in_journal_lock; 162a39f7afdSSong Liu atomic_t stripe_in_journal_count; 1633bddb7f8SSong Liu 1643bddb7f8SSong Liu /* to submit async io_units, to fulfill ordering of flush */ 1653bddb7f8SSong Liu struct work_struct deferred_io_work; 1662e38a37fSSong Liu /* to disable write back during in degraded mode */ 1672e38a37fSSong Liu struct work_struct disable_writeback_work; 16803b047f4SSong Liu 16903b047f4SSong Liu /* to for chunk_aligned_read in writeback mode, details below */ 17003b047f4SSong Liu spinlock_t tree_lock; 17103b047f4SSong Liu struct radix_tree_root big_stripe_tree; 172f6bed0efSShaohua Li }; 173f6bed0efSShaohua Li 174f6bed0efSShaohua Li /* 17503b047f4SSong Liu * Enable chunk_aligned_read() with write back cache. 17603b047f4SSong Liu * 17703b047f4SSong Liu * Each chunk may contain more than one stripe (for example, a 256kB 17803b047f4SSong Liu * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For 17903b047f4SSong Liu * chunk_aligned_read, these stripes are grouped into one "big_stripe". 
18003b047f4SSong Liu * For each big_stripe, we count how many stripes of this big_stripe 18103b047f4SSong Liu * are in the write back cache. These data are tracked in a radix tree 18203b047f4SSong Liu * (big_stripe_tree). We use radix_tree item pointer as the counter. 18303b047f4SSong Liu * r5c_tree_index() is used to calculate keys for the radix tree. 18403b047f4SSong Liu * 18503b047f4SSong Liu * chunk_aligned_read() calls r5c_big_stripe_cached() to look up 18603b047f4SSong Liu * big_stripe of each chunk in the tree. If this big_stripe is in the 18703b047f4SSong Liu * tree, chunk_aligned_read() aborts. This look up is protected by 18803b047f4SSong Liu * rcu_read_lock(). 18903b047f4SSong Liu * 19003b047f4SSong Liu * It is necessary to remember whether a stripe is counted in 19103b047f4SSong Liu * big_stripe_tree. Instead of adding new flag, we reuses existing flags: 19203b047f4SSong Liu * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these 19303b047f4SSong Liu * two flags are set, the stripe is counted in big_stripe_tree. This 19403b047f4SSong Liu * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to 19503b047f4SSong Liu * r5c_try_caching_write(); and moving clear_bit of 19603b047f4SSong Liu * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to 19703b047f4SSong Liu * r5c_finish_stripe_write_out(). 19803b047f4SSong Liu */ 19903b047f4SSong Liu 20003b047f4SSong Liu /* 20103b047f4SSong Liu * radix tree requests lowest 2 bits of data pointer to be 2b'00. 20203b047f4SSong Liu * So it is necessary to left shift the counter by 2 bits before using it 20303b047f4SSong Liu * as data pointer of the tree. 
20403b047f4SSong Liu */ 20503b047f4SSong Liu #define R5C_RADIX_COUNT_SHIFT 2 20603b047f4SSong Liu 20703b047f4SSong Liu /* 20803b047f4SSong Liu * calculate key for big_stripe_tree 20903b047f4SSong Liu * 21003b047f4SSong Liu * sect: align_bi->bi_iter.bi_sector or sh->sector 21103b047f4SSong Liu */ 21203b047f4SSong Liu static inline sector_t r5c_tree_index(struct r5conf *conf, 21303b047f4SSong Liu sector_t sect) 21403b047f4SSong Liu { 21503b047f4SSong Liu sector_t offset; 21603b047f4SSong Liu 21703b047f4SSong Liu offset = sector_div(sect, conf->chunk_sectors); 21803b047f4SSong Liu return sect; 21903b047f4SSong Liu } 22003b047f4SSong Liu 22103b047f4SSong Liu /* 222f6bed0efSShaohua Li * an IO range starts from a meta data block and end at the next meta data 223f6bed0efSShaohua Li * block. The io unit's the meta data block tracks data/parity followed it. io 224f6bed0efSShaohua Li * unit is written to log disk with normal write, as we always flush log disk 225f6bed0efSShaohua Li * first and then start move data to raid disks, there is no requirement to 226f6bed0efSShaohua Li * write io unit with FLUSH/FUA 227f6bed0efSShaohua Li */ 228f6bed0efSShaohua Li struct r5l_io_unit { 229f6bed0efSShaohua Li struct r5l_log *log; 230f6bed0efSShaohua Li 231f6bed0efSShaohua Li struct page *meta_page; /* store meta block */ 232f6bed0efSShaohua Li int meta_offset; /* current offset in meta_page */ 233f6bed0efSShaohua Li 234f6bed0efSShaohua Li struct bio *current_bio;/* current_bio accepting new data */ 235f6bed0efSShaohua Li 236f6bed0efSShaohua Li atomic_t pending_stripe;/* how many stripes not flushed to raid */ 237f6bed0efSShaohua Li u64 seq; /* seq number of the metablock */ 238f6bed0efSShaohua Li sector_t log_start; /* where the io_unit starts */ 239f6bed0efSShaohua Li sector_t log_end; /* where the io_unit ends */ 240f6bed0efSShaohua Li struct list_head log_sibling; /* log->running_ios */ 241f6bed0efSShaohua Li struct list_head stripe_list; /* stripes added to the io_unit */ 
242f6bed0efSShaohua Li 243f6bed0efSShaohua Li int state; 2446143e2ceSChristoph Hellwig bool need_split_bio; 2453bddb7f8SSong Liu struct bio *split_bio; 2463bddb7f8SSong Liu 2473bddb7f8SSong Liu unsigned int has_flush:1; /* include flush request */ 2483bddb7f8SSong Liu unsigned int has_fua:1; /* include fua request */ 2493bddb7f8SSong Liu unsigned int has_null_flush:1; /* include empty flush request */ 2503bddb7f8SSong Liu /* 2513bddb7f8SSong Liu * io isn't sent yet, flush/fua request can only be submitted till it's 2523bddb7f8SSong Liu * the first IO in running_ios list 2533bddb7f8SSong Liu */ 2543bddb7f8SSong Liu unsigned int io_deferred:1; 2553bddb7f8SSong Liu 2563bddb7f8SSong Liu struct bio_list flush_barriers; /* size == 0 flush bios */ 257f6bed0efSShaohua Li }; 258f6bed0efSShaohua Li 259f6bed0efSShaohua Li /* r5l_io_unit state */ 260f6bed0efSShaohua Li enum r5l_io_unit_state { 261f6bed0efSShaohua Li IO_UNIT_RUNNING = 0, /* accepting new IO */ 262f6bed0efSShaohua Li IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 263f6bed0efSShaohua Li * don't accepting new bio */ 264f6bed0efSShaohua Li IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 265a8c34f91SShaohua Li IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 266f6bed0efSShaohua Li }; 267f6bed0efSShaohua Li 2682ded3703SSong Liu bool r5c_is_writeback(struct r5l_log *log) 2692ded3703SSong Liu { 2702ded3703SSong Liu return (log != NULL && 2712ded3703SSong Liu log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 2722ded3703SSong Liu } 2732ded3703SSong Liu 274f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 275f6bed0efSShaohua Li { 276f6bed0efSShaohua Li start += inc; 277f6bed0efSShaohua Li if (start >= log->device_size) 278f6bed0efSShaohua Li start = start - log->device_size; 279f6bed0efSShaohua Li return start; 280f6bed0efSShaohua Li } 281f6bed0efSShaohua Li 282f6bed0efSShaohua Li static sector_t r5l_ring_distance(struct 
r5l_log *log, sector_t start, 283f6bed0efSShaohua Li sector_t end) 284f6bed0efSShaohua Li { 285f6bed0efSShaohua Li if (end >= start) 286f6bed0efSShaohua Li return end - start; 287f6bed0efSShaohua Li else 288f6bed0efSShaohua Li return end + log->device_size - start; 289f6bed0efSShaohua Li } 290f6bed0efSShaohua Li 291f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 292f6bed0efSShaohua Li { 293f6bed0efSShaohua Li sector_t used_size; 294f6bed0efSShaohua Li 295f6bed0efSShaohua Li used_size = r5l_ring_distance(log, log->last_checkpoint, 296f6bed0efSShaohua Li log->log_start); 297f6bed0efSShaohua Li 298f6bed0efSShaohua Li return log->device_size > used_size + size; 299f6bed0efSShaohua Li } 300f6bed0efSShaohua Li 301f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 302f6bed0efSShaohua Li enum r5l_io_unit_state state) 303f6bed0efSShaohua Li { 304f6bed0efSShaohua Li if (WARN_ON(io->state >= state)) 305f6bed0efSShaohua Li return; 306f6bed0efSShaohua Li io->state = state; 307f6bed0efSShaohua Li } 308f6bed0efSShaohua Li 3091e6d690bSSong Liu static void 3101e6d690bSSong Liu r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, 3111e6d690bSSong Liu struct bio_list *return_bi) 3121e6d690bSSong Liu { 3131e6d690bSSong Liu struct bio *wbi, *wbi2; 3141e6d690bSSong Liu 3151e6d690bSSong Liu wbi = dev->written; 3161e6d690bSSong Liu dev->written = NULL; 3171e6d690bSSong Liu while (wbi && wbi->bi_iter.bi_sector < 3181e6d690bSSong Liu dev->sector + STRIPE_SECTORS) { 3191e6d690bSSong Liu wbi2 = r5_next_bio(wbi, dev->sector); 3201e6d690bSSong Liu if (!raid5_dec_bi_active_stripes(wbi)) { 3211e6d690bSSong Liu md_write_end(conf->mddev); 3221e6d690bSSong Liu bio_list_add(return_bi, wbi); 3231e6d690bSSong Liu } 3241e6d690bSSong Liu wbi = wbi2; 3251e6d690bSSong Liu } 3261e6d690bSSong Liu } 3271e6d690bSSong Liu 3281e6d690bSSong Liu void r5c_handle_cached_data_endio(struct r5conf *conf, 3291e6d690bSSong Liu struct 
stripe_head *sh, int disks, struct bio_list *return_bi) 3301e6d690bSSong Liu { 3311e6d690bSSong Liu int i; 3321e6d690bSSong Liu 3331e6d690bSSong Liu for (i = sh->disks; i--; ) { 3341e6d690bSSong Liu if (sh->dev[i].written) { 3351e6d690bSSong Liu set_bit(R5_UPTODATE, &sh->dev[i].flags); 3361e6d690bSSong Liu r5c_return_dev_pending_writes(conf, &sh->dev[i], 3371e6d690bSSong Liu return_bi); 3381e6d690bSSong Liu bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3391e6d690bSSong Liu STRIPE_SECTORS, 3401e6d690bSSong Liu !test_bit(STRIPE_DEGRADED, &sh->state), 3411e6d690bSSong Liu 0); 3421e6d690bSSong Liu } 3431e6d690bSSong Liu } 3441e6d690bSSong Liu } 3451e6d690bSSong Liu 346a39f7afdSSong Liu /* Check whether we should flush some stripes to free up stripe cache */ 347a39f7afdSSong Liu void r5c_check_stripe_cache_usage(struct r5conf *conf) 348a39f7afdSSong Liu { 349a39f7afdSSong Liu int total_cached; 350a39f7afdSSong Liu 351a39f7afdSSong Liu if (!r5c_is_writeback(conf->log)) 352a39f7afdSSong Liu return; 353a39f7afdSSong Liu 354a39f7afdSSong Liu total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 355a39f7afdSSong Liu atomic_read(&conf->r5c_cached_full_stripes); 356a39f7afdSSong Liu 357a39f7afdSSong Liu /* 358a39f7afdSSong Liu * The following condition is true for either of the following: 359a39f7afdSSong Liu * - stripe cache pressure high: 360a39f7afdSSong Liu * total_cached > 3/4 min_nr_stripes || 361a39f7afdSSong Liu * empty_inactive_list_nr > 0 362a39f7afdSSong Liu * - stripe cache pressure moderate: 363a39f7afdSSong Liu * total_cached > 1/2 min_nr_stripes 364a39f7afdSSong Liu */ 365a39f7afdSSong Liu if (total_cached > conf->min_nr_stripes * 1 / 2 || 366a39f7afdSSong Liu atomic_read(&conf->empty_inactive_list_nr) > 0) 367a39f7afdSSong Liu r5l_wake_reclaim(conf->log, 0); 368a39f7afdSSong Liu } 369a39f7afdSSong Liu 370a39f7afdSSong Liu /* 371a39f7afdSSong Liu * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 372a39f7afdSSong Liu * stripes in 
the cache 373a39f7afdSSong Liu */ 374a39f7afdSSong Liu void r5c_check_cached_full_stripe(struct r5conf *conf) 375a39f7afdSSong Liu { 376a39f7afdSSong Liu if (!r5c_is_writeback(conf->log)) 377a39f7afdSSong Liu return; 378a39f7afdSSong Liu 379a39f7afdSSong Liu /* 380a39f7afdSSong Liu * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 381a39f7afdSSong Liu * or a full stripe (chunk size / 4k stripes). 382a39f7afdSSong Liu */ 383a39f7afdSSong Liu if (atomic_read(&conf->r5c_cached_full_stripes) >= 384a39f7afdSSong Liu min(R5C_FULL_STRIPE_FLUSH_BATCH, 385a39f7afdSSong Liu conf->chunk_sectors >> STRIPE_SHIFT)) 386a39f7afdSSong Liu r5l_wake_reclaim(conf->log, 0); 387a39f7afdSSong Liu } 388a39f7afdSSong Liu 389a39f7afdSSong Liu /* 390a39f7afdSSong Liu * Total log space (in sectors) needed to flush all data in cache 391a39f7afdSSong Liu * 39239b99586SSong Liu * To avoid deadlock due to log space, it is necessary to reserve log 39339b99586SSong Liu * space to flush critical stripes (stripes that occupying log space near 39439b99586SSong Liu * last_checkpoint). This function helps check how much log space is 39539b99586SSong Liu * required to flush all cached stripes. 396a39f7afdSSong Liu * 39739b99586SSong Liu * To reduce log space requirements, two mechanisms are used to give cache 39839b99586SSong Liu * flush higher priorities: 39939b99586SSong Liu * 1. In handle_stripe_dirtying() and schedule_reconstruction(), 40039b99586SSong Liu * stripes ALREADY in journal can be flushed w/o pending writes; 40139b99586SSong Liu * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal 40239b99586SSong Liu * can be delayed (r5l_add_no_space_stripe). 403a39f7afdSSong Liu * 40439b99586SSong Liu * In cache flush, the stripe goes through 1 and then 2. For a stripe that 40539b99586SSong Liu * already passed 1, flushing it requires at most (conf->max_degraded + 1) 40639b99586SSong Liu * pages of journal space. 
For stripes that has not passed 1, flushing it 40739b99586SSong Liu * requires (conf->raid_disks + 1) pages of journal space. There are at 40839b99586SSong Liu * most (conf->group_cnt + 1) stripe that passed 1. So total journal space 40939b99586SSong Liu * required to flush all cached stripes (in pages) is: 41039b99586SSong Liu * 41139b99586SSong Liu * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + 41239b99586SSong Liu * (group_cnt + 1) * (raid_disks + 1) 41339b99586SSong Liu * or 41439b99586SSong Liu * (stripe_in_journal_count) * (max_degraded + 1) + 41539b99586SSong Liu * (group_cnt + 1) * (raid_disks - max_degraded) 416a39f7afdSSong Liu */ 417a39f7afdSSong Liu static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 418a39f7afdSSong Liu { 419a39f7afdSSong Liu struct r5l_log *log = conf->log; 420a39f7afdSSong Liu 421a39f7afdSSong Liu if (!r5c_is_writeback(log)) 422a39f7afdSSong Liu return 0; 423a39f7afdSSong Liu 42439b99586SSong Liu return BLOCK_SECTORS * 42539b99586SSong Liu ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + 42639b99586SSong Liu (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); 427a39f7afdSSong Liu } 428a39f7afdSSong Liu 429a39f7afdSSong Liu /* 430a39f7afdSSong Liu * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 431a39f7afdSSong Liu * 432a39f7afdSSong Liu * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 433a39f7afdSSong Liu * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 434a39f7afdSSong Liu * device is less than 2x of reclaim_required_space. 
435a39f7afdSSong Liu */ 436a39f7afdSSong Liu static inline void r5c_update_log_state(struct r5l_log *log) 437a39f7afdSSong Liu { 438a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 439a39f7afdSSong Liu sector_t free_space; 440a39f7afdSSong Liu sector_t reclaim_space; 441f687a33eSSong Liu bool wake_reclaim = false; 442a39f7afdSSong Liu 443a39f7afdSSong Liu if (!r5c_is_writeback(log)) 444a39f7afdSSong Liu return; 445a39f7afdSSong Liu 446a39f7afdSSong Liu free_space = r5l_ring_distance(log, log->log_start, 447a39f7afdSSong Liu log->last_checkpoint); 448a39f7afdSSong Liu reclaim_space = r5c_log_required_to_flush_cache(conf); 449a39f7afdSSong Liu if (free_space < 2 * reclaim_space) 450a39f7afdSSong Liu set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 451f687a33eSSong Liu else { 452f687a33eSSong Liu if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 453f687a33eSSong Liu wake_reclaim = true; 454a39f7afdSSong Liu clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 455f687a33eSSong Liu } 456a39f7afdSSong Liu if (free_space < 3 * reclaim_space) 457a39f7afdSSong Liu set_bit(R5C_LOG_TIGHT, &conf->cache_state); 458a39f7afdSSong Liu else 459a39f7afdSSong Liu clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 460f687a33eSSong Liu 461f687a33eSSong Liu if (wake_reclaim) 462f687a33eSSong Liu r5l_wake_reclaim(log, 0); 463a39f7afdSSong Liu } 464a39f7afdSSong Liu 4652ded3703SSong Liu /* 4662ded3703SSong Liu * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 4672ded3703SSong Liu * This function should only be called in write-back mode. 
4682ded3703SSong Liu */ 469a39f7afdSSong Liu void r5c_make_stripe_write_out(struct stripe_head *sh) 4702ded3703SSong Liu { 4712ded3703SSong Liu struct r5conf *conf = sh->raid_conf; 4722ded3703SSong Liu struct r5l_log *log = conf->log; 4732ded3703SSong Liu 4742ded3703SSong Liu BUG_ON(!r5c_is_writeback(log)); 4752ded3703SSong Liu 4762ded3703SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 4772ded3703SSong Liu clear_bit(STRIPE_R5C_CACHING, &sh->state); 4781e6d690bSSong Liu 4791e6d690bSSong Liu if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4801e6d690bSSong Liu atomic_inc(&conf->preread_active_stripes); 4811e6d690bSSong Liu } 4821e6d690bSSong Liu 4831e6d690bSSong Liu static void r5c_handle_data_cached(struct stripe_head *sh) 4841e6d690bSSong Liu { 4851e6d690bSSong Liu int i; 4861e6d690bSSong Liu 4871e6d690bSSong Liu for (i = sh->disks; i--; ) 4881e6d690bSSong Liu if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 4891e6d690bSSong Liu set_bit(R5_InJournal, &sh->dev[i].flags); 4901e6d690bSSong Liu clear_bit(R5_LOCKED, &sh->dev[i].flags); 4911e6d690bSSong Liu } 4921e6d690bSSong Liu clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 4931e6d690bSSong Liu } 4941e6d690bSSong Liu 4951e6d690bSSong Liu /* 4961e6d690bSSong Liu * this journal write must contain full parity, 4971e6d690bSSong Liu * it may also contain some data pages 4981e6d690bSSong Liu */ 4991e6d690bSSong Liu static void r5c_handle_parity_cached(struct stripe_head *sh) 5001e6d690bSSong Liu { 5011e6d690bSSong Liu int i; 5021e6d690bSSong Liu 5031e6d690bSSong Liu for (i = sh->disks; i--; ) 5041e6d690bSSong Liu if (test_bit(R5_InJournal, &sh->dev[i].flags)) 5051e6d690bSSong Liu set_bit(R5_Wantwrite, &sh->dev[i].flags); 5062ded3703SSong Liu } 5072ded3703SSong Liu 5082ded3703SSong Liu /* 5092ded3703SSong Liu * Setting proper flags after writing (or flushing) data and/or parity to the 5102ded3703SSong Liu * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 
5112ded3703SSong Liu */ 5122ded3703SSong Liu static void r5c_finish_cache_stripe(struct stripe_head *sh) 5132ded3703SSong Liu { 5142ded3703SSong Liu struct r5l_log *log = sh->raid_conf->log; 5152ded3703SSong Liu 5162ded3703SSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 5172ded3703SSong Liu BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 5182ded3703SSong Liu /* 5192ded3703SSong Liu * Set R5_InJournal for parity dev[pd_idx]. This means 5202ded3703SSong Liu * all data AND parity in the journal. For RAID 6, it is 5212ded3703SSong Liu * NOT necessary to set the flag for dev[qd_idx], as the 5222ded3703SSong Liu * two parities are written out together. 5232ded3703SSong Liu */ 5242ded3703SSong Liu set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 5251e6d690bSSong Liu } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 5261e6d690bSSong Liu r5c_handle_data_cached(sh); 5271e6d690bSSong Liu } else { 5281e6d690bSSong Liu r5c_handle_parity_cached(sh); 5291e6d690bSSong Liu set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 5301e6d690bSSong Liu } 5312ded3703SSong Liu } 5322ded3703SSong Liu 533d8858f43SChristoph Hellwig static void r5l_io_run_stripes(struct r5l_io_unit *io) 534d8858f43SChristoph Hellwig { 535d8858f43SChristoph Hellwig struct stripe_head *sh, *next; 536d8858f43SChristoph Hellwig 537d8858f43SChristoph Hellwig list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 538d8858f43SChristoph Hellwig list_del_init(&sh->log_list); 5392ded3703SSong Liu 5402ded3703SSong Liu r5c_finish_cache_stripe(sh); 5412ded3703SSong Liu 542d8858f43SChristoph Hellwig set_bit(STRIPE_HANDLE, &sh->state); 543d8858f43SChristoph Hellwig raid5_release_stripe(sh); 544d8858f43SChristoph Hellwig } 545d8858f43SChristoph Hellwig } 546d8858f43SChristoph Hellwig 54756fef7c6SChristoph Hellwig static void r5l_log_run_stripes(struct r5l_log *log) 54856fef7c6SChristoph Hellwig { 54956fef7c6SChristoph Hellwig struct r5l_io_unit *io, *next; 55056fef7c6SChristoph Hellwig 
55156fef7c6SChristoph Hellwig assert_spin_locked(&log->io_list_lock); 55256fef7c6SChristoph Hellwig 55356fef7c6SChristoph Hellwig list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 55456fef7c6SChristoph Hellwig /* don't change list order */ 55556fef7c6SChristoph Hellwig if (io->state < IO_UNIT_IO_END) 55656fef7c6SChristoph Hellwig break; 55756fef7c6SChristoph Hellwig 55856fef7c6SChristoph Hellwig list_move_tail(&io->log_sibling, &log->finished_ios); 55956fef7c6SChristoph Hellwig r5l_io_run_stripes(io); 56056fef7c6SChristoph Hellwig } 56156fef7c6SChristoph Hellwig } 56256fef7c6SChristoph Hellwig 5633848c0bcSChristoph Hellwig static void r5l_move_to_end_ios(struct r5l_log *log) 5643848c0bcSChristoph Hellwig { 5653848c0bcSChristoph Hellwig struct r5l_io_unit *io, *next; 5663848c0bcSChristoph Hellwig 5673848c0bcSChristoph Hellwig assert_spin_locked(&log->io_list_lock); 5683848c0bcSChristoph Hellwig 5693848c0bcSChristoph Hellwig list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 5703848c0bcSChristoph Hellwig /* don't change list order */ 5713848c0bcSChristoph Hellwig if (io->state < IO_UNIT_IO_END) 5723848c0bcSChristoph Hellwig break; 5733848c0bcSChristoph Hellwig list_move_tail(&io->log_sibling, &log->io_end_ios); 5743848c0bcSChristoph Hellwig } 5753848c0bcSChristoph Hellwig } 5763848c0bcSChristoph Hellwig 5773bddb7f8SSong Liu static void __r5l_stripe_write_finished(struct r5l_io_unit *io); 578f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio) 579f6bed0efSShaohua Li { 580f6bed0efSShaohua Li struct r5l_io_unit *io = bio->bi_private; 5813bddb7f8SSong Liu struct r5l_io_unit *io_deferred; 582f6bed0efSShaohua Li struct r5l_log *log = io->log; 583509ffec7SChristoph Hellwig unsigned long flags; 584f6bed0efSShaohua Li 5856e74a9cfSShaohua Li if (bio->bi_error) 5866e74a9cfSShaohua Li md_error(log->rdev->mddev, log->rdev); 5876e74a9cfSShaohua Li 588f6bed0efSShaohua Li bio_put(bio); 589e8deb638SChristoph Hellwig 
mempool_free(io->meta_page, log->meta_pool); 590f6bed0efSShaohua Li 591509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 592509ffec7SChristoph Hellwig __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 59356fef7c6SChristoph Hellwig if (log->need_cache_flush) 5943848c0bcSChristoph Hellwig r5l_move_to_end_ios(log); 59556fef7c6SChristoph Hellwig else 59656fef7c6SChristoph Hellwig r5l_log_run_stripes(log); 5973bddb7f8SSong Liu if (!list_empty(&log->running_ios)) { 5983bddb7f8SSong Liu /* 5993bddb7f8SSong Liu * FLUSH/FUA io_unit is deferred because of ordering, now we 6003bddb7f8SSong Liu * can dispatch it 6013bddb7f8SSong Liu */ 6023bddb7f8SSong Liu io_deferred = list_first_entry(&log->running_ios, 6033bddb7f8SSong Liu struct r5l_io_unit, log_sibling); 6043bddb7f8SSong Liu if (io_deferred->io_deferred) 6053bddb7f8SSong Liu schedule_work(&log->deferred_io_work); 6063bddb7f8SSong Liu } 6073bddb7f8SSong Liu 608509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 609509ffec7SChristoph Hellwig 61056fef7c6SChristoph Hellwig if (log->need_cache_flush) 611f6bed0efSShaohua Li md_wakeup_thread(log->rdev->mddev->thread); 6123bddb7f8SSong Liu 6133bddb7f8SSong Liu if (io->has_null_flush) { 6143bddb7f8SSong Liu struct bio *bi; 6153bddb7f8SSong Liu 6163bddb7f8SSong Liu WARN_ON(bio_list_empty(&io->flush_barriers)); 6173bddb7f8SSong Liu while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 6183bddb7f8SSong Liu bio_endio(bi); 6193bddb7f8SSong Liu atomic_dec(&io->pending_stripe); 6203bddb7f8SSong Liu } 6213bddb7f8SSong Liu if (atomic_read(&io->pending_stripe) == 0) 6223bddb7f8SSong Liu __r5l_stripe_write_finished(io); 6233bddb7f8SSong Liu } 6243bddb7f8SSong Liu } 6253bddb7f8SSong Liu 6263bddb7f8SSong Liu static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 6273bddb7f8SSong Liu { 6283bddb7f8SSong Liu unsigned long flags; 6293bddb7f8SSong Liu 6303bddb7f8SSong Liu spin_lock_irqsave(&log->io_list_lock, flags); 
6313bddb7f8SSong Liu __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 6323bddb7f8SSong Liu spin_unlock_irqrestore(&log->io_list_lock, flags); 6333bddb7f8SSong Liu 6343bddb7f8SSong Liu if (io->has_flush) 63520737738SShaohua Li io->current_bio->bi_opf |= REQ_PREFLUSH; 6363bddb7f8SSong Liu if (io->has_fua) 63720737738SShaohua Li io->current_bio->bi_opf |= REQ_FUA; 6383bddb7f8SSong Liu submit_bio(io->current_bio); 6393bddb7f8SSong Liu 6403bddb7f8SSong Liu if (!io->split_bio) 6413bddb7f8SSong Liu return; 6423bddb7f8SSong Liu 6433bddb7f8SSong Liu if (io->has_flush) 64420737738SShaohua Li io->split_bio->bi_opf |= REQ_PREFLUSH; 6453bddb7f8SSong Liu if (io->has_fua) 64620737738SShaohua Li io->split_bio->bi_opf |= REQ_FUA; 6473bddb7f8SSong Liu submit_bio(io->split_bio); 6483bddb7f8SSong Liu } 6493bddb7f8SSong Liu 6503bddb7f8SSong Liu /* deferred io_unit will be dispatched here */ 6513bddb7f8SSong Liu static void r5l_submit_io_async(struct work_struct *work) 6523bddb7f8SSong Liu { 6533bddb7f8SSong Liu struct r5l_log *log = container_of(work, struct r5l_log, 6543bddb7f8SSong Liu deferred_io_work); 6553bddb7f8SSong Liu struct r5l_io_unit *io = NULL; 6563bddb7f8SSong Liu unsigned long flags; 6573bddb7f8SSong Liu 6583bddb7f8SSong Liu spin_lock_irqsave(&log->io_list_lock, flags); 6593bddb7f8SSong Liu if (!list_empty(&log->running_ios)) { 6603bddb7f8SSong Liu io = list_first_entry(&log->running_ios, struct r5l_io_unit, 6613bddb7f8SSong Liu log_sibling); 6623bddb7f8SSong Liu if (!io->io_deferred) 6633bddb7f8SSong Liu io = NULL; 6643bddb7f8SSong Liu else 6653bddb7f8SSong Liu io->io_deferred = 0; 6663bddb7f8SSong Liu } 6673bddb7f8SSong Liu spin_unlock_irqrestore(&log->io_list_lock, flags); 6683bddb7f8SSong Liu if (io) 6693bddb7f8SSong Liu r5l_do_submit_io(log, io); 670f6bed0efSShaohua Li } 671f6bed0efSShaohua Li 6722e38a37fSSong Liu static void r5c_disable_writeback_async(struct work_struct *work) 6732e38a37fSSong Liu { 6742e38a37fSSong Liu struct r5l_log *log = container_of(work, 
struct r5l_log, 6752e38a37fSSong Liu disable_writeback_work); 6762e38a37fSSong Liu struct mddev *mddev = log->rdev->mddev; 6772e38a37fSSong Liu 6782e38a37fSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 6792e38a37fSSong Liu return; 6802e38a37fSSong Liu pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", 6812e38a37fSSong Liu mdname(mddev)); 6822e38a37fSSong Liu mddev_suspend(mddev); 6832e38a37fSSong Liu log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 6842e38a37fSSong Liu mddev_resume(mddev); 6852e38a37fSSong Liu } 6862e38a37fSSong Liu 687f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log) 688f6bed0efSShaohua Li { 689f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 6903bddb7f8SSong Liu struct bio *bio; 691f6bed0efSShaohua Li struct r5l_meta_block *block; 692509ffec7SChristoph Hellwig unsigned long flags; 693f6bed0efSShaohua Li u32 crc; 6943bddb7f8SSong Liu bool do_submit = true; 695f6bed0efSShaohua Li 696f6bed0efSShaohua Li if (!io) 697f6bed0efSShaohua Li return; 698f6bed0efSShaohua Li 699f6bed0efSShaohua Li block = page_address(io->meta_page); 700f6bed0efSShaohua Li block->meta_size = cpu_to_le32(io->meta_offset); 7015cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 702f6bed0efSShaohua Li block->checksum = cpu_to_le32(crc); 7033bddb7f8SSong Liu bio = io->current_bio; 704f6bed0efSShaohua Li 705f6bed0efSShaohua Li log->current_io = NULL; 706509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 7073bddb7f8SSong Liu if (io->has_flush || io->has_fua) { 7083bddb7f8SSong Liu if (io != list_first_entry(&log->running_ios, 7093bddb7f8SSong Liu struct r5l_io_unit, log_sibling)) { 7103bddb7f8SSong Liu io->io_deferred = 1; 7113bddb7f8SSong Liu do_submit = false; 7123bddb7f8SSong Liu } 7133bddb7f8SSong Liu } 714509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 7153bddb7f8SSong Liu if (do_submit) 7163bddb7f8SSong Liu 
r5l_do_submit_io(log, io); 717f6bed0efSShaohua Li } 718f6bed0efSShaohua Li 7196143e2ceSChristoph Hellwig static struct bio *r5l_bio_alloc(struct r5l_log *log) 720b349feb3SChristoph Hellwig { 721c38d29b3SChristoph Hellwig struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 722b349feb3SChristoph Hellwig 723796a5cf0SMike Christie bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 724b349feb3SChristoph Hellwig bio->bi_bdev = log->rdev->bdev; 7251e932a37SChristoph Hellwig bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 726b349feb3SChristoph Hellwig 727b349feb3SChristoph Hellwig return bio; 728b349feb3SChristoph Hellwig } 729b349feb3SChristoph Hellwig 730c1b99198SChristoph Hellwig static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 731c1b99198SChristoph Hellwig { 732c1b99198SChristoph Hellwig log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 733c1b99198SChristoph Hellwig 734a39f7afdSSong Liu r5c_update_log_state(log); 735c1b99198SChristoph Hellwig /* 736c1b99198SChristoph Hellwig * If we filled up the log device start from the beginning again, 737c1b99198SChristoph Hellwig * which will require a new bio. 738c1b99198SChristoph Hellwig * 739c1b99198SChristoph Hellwig * Note: for this to work properly the log size needs to me a multiple 740c1b99198SChristoph Hellwig * of BLOCK_SECTORS. 
741c1b99198SChristoph Hellwig */ 742c1b99198SChristoph Hellwig if (log->log_start == 0) 7436143e2ceSChristoph Hellwig io->need_split_bio = true; 744c1b99198SChristoph Hellwig 745c1b99198SChristoph Hellwig io->log_end = log->log_start; 746c1b99198SChristoph Hellwig } 747c1b99198SChristoph Hellwig 748f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 749f6bed0efSShaohua Li { 750f6bed0efSShaohua Li struct r5l_io_unit *io; 751f6bed0efSShaohua Li struct r5l_meta_block *block; 752f6bed0efSShaohua Li 7535036c390SChristoph Hellwig io = mempool_alloc(log->io_pool, GFP_ATOMIC); 7545036c390SChristoph Hellwig if (!io) 7555036c390SChristoph Hellwig return NULL; 7565036c390SChristoph Hellwig memset(io, 0, sizeof(*io)); 7575036c390SChristoph Hellwig 75851039cd0SChristoph Hellwig io->log = log; 75951039cd0SChristoph Hellwig INIT_LIST_HEAD(&io->log_sibling); 76051039cd0SChristoph Hellwig INIT_LIST_HEAD(&io->stripe_list); 7613bddb7f8SSong Liu bio_list_init(&io->flush_barriers); 76251039cd0SChristoph Hellwig io->state = IO_UNIT_RUNNING; 763f6bed0efSShaohua Li 764e8deb638SChristoph Hellwig io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 765f6bed0efSShaohua Li block = page_address(io->meta_page); 766e8deb638SChristoph Hellwig clear_page(block); 767f6bed0efSShaohua Li block->magic = cpu_to_le32(R5LOG_MAGIC); 768f6bed0efSShaohua Li block->version = R5LOG_VERSION; 769f6bed0efSShaohua Li block->seq = cpu_to_le64(log->seq); 770f6bed0efSShaohua Li block->position = cpu_to_le64(log->log_start); 771f6bed0efSShaohua Li 772f6bed0efSShaohua Li io->log_start = log->log_start; 773f6bed0efSShaohua Li io->meta_offset = sizeof(struct r5l_meta_block); 7742b8ef16eSChristoph Hellwig io->seq = log->seq++; 775f6bed0efSShaohua Li 7766143e2ceSChristoph Hellwig io->current_bio = r5l_bio_alloc(log); 7776143e2ceSChristoph Hellwig io->current_bio->bi_end_io = r5l_log_endio; 7786143e2ceSChristoph Hellwig io->current_bio->bi_private = io; 779b349feb3SChristoph Hellwig 
bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 780f6bed0efSShaohua Li 781c1b99198SChristoph Hellwig r5_reserve_log_entry(log, io); 782f6bed0efSShaohua Li 783f6bed0efSShaohua Li spin_lock_irq(&log->io_list_lock); 784f6bed0efSShaohua Li list_add_tail(&io->log_sibling, &log->running_ios); 785f6bed0efSShaohua Li spin_unlock_irq(&log->io_list_lock); 786f6bed0efSShaohua Li 787f6bed0efSShaohua Li return io; 788f6bed0efSShaohua Li } 789f6bed0efSShaohua Li 790f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 791f6bed0efSShaohua Li { 79222581f58SChristoph Hellwig if (log->current_io && 79322581f58SChristoph Hellwig log->current_io->meta_offset + payload_size > PAGE_SIZE) 794f6bed0efSShaohua Li r5l_submit_current_io(log); 795f6bed0efSShaohua Li 7965036c390SChristoph Hellwig if (!log->current_io) { 797f6bed0efSShaohua Li log->current_io = r5l_new_meta(log); 7985036c390SChristoph Hellwig if (!log->current_io) 7995036c390SChristoph Hellwig return -ENOMEM; 8005036c390SChristoph Hellwig } 8015036c390SChristoph Hellwig 802f6bed0efSShaohua Li return 0; 803f6bed0efSShaohua Li } 804f6bed0efSShaohua Li 805f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 806f6bed0efSShaohua Li sector_t location, 807f6bed0efSShaohua Li u32 checksum1, u32 checksum2, 808f6bed0efSShaohua Li bool checksum2_valid) 809f6bed0efSShaohua Li { 810f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 811f6bed0efSShaohua Li struct r5l_payload_data_parity *payload; 812f6bed0efSShaohua Li 813f6bed0efSShaohua Li payload = page_address(io->meta_page) + io->meta_offset; 814f6bed0efSShaohua Li payload->header.type = cpu_to_le16(type); 815f6bed0efSShaohua Li payload->header.flags = cpu_to_le16(0); 816f6bed0efSShaohua Li payload->size = cpu_to_le32((1 + !!checksum2_valid) << 817f6bed0efSShaohua Li (PAGE_SHIFT - 9)); 818f6bed0efSShaohua Li payload->location = cpu_to_le64(location); 819f6bed0efSShaohua Li payload->checksum[0] 
= cpu_to_le32(checksum1); 820f6bed0efSShaohua Li if (checksum2_valid) 821f6bed0efSShaohua Li payload->checksum[1] = cpu_to_le32(checksum2); 822f6bed0efSShaohua Li 823f6bed0efSShaohua Li io->meta_offset += sizeof(struct r5l_payload_data_parity) + 824f6bed0efSShaohua Li sizeof(__le32) * (1 + !!checksum2_valid); 825f6bed0efSShaohua Li } 826f6bed0efSShaohua Li 827f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 828f6bed0efSShaohua Li { 829f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 830f6bed0efSShaohua Li 8316143e2ceSChristoph Hellwig if (io->need_split_bio) { 8323bddb7f8SSong Liu BUG_ON(io->split_bio); 8333bddb7f8SSong Liu io->split_bio = io->current_bio; 8346143e2ceSChristoph Hellwig io->current_bio = r5l_bio_alloc(log); 8353bddb7f8SSong Liu bio_chain(io->current_bio, io->split_bio); 8363bddb7f8SSong Liu io->need_split_bio = false; 837f6bed0efSShaohua Li } 838f6bed0efSShaohua Li 8396143e2ceSChristoph Hellwig if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 8406143e2ceSChristoph Hellwig BUG(); 8416143e2ceSChristoph Hellwig 842c1b99198SChristoph Hellwig r5_reserve_log_entry(log, io); 843f6bed0efSShaohua Li } 844f6bed0efSShaohua Li 8455036c390SChristoph Hellwig static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 846f6bed0efSShaohua Li int data_pages, int parity_pages) 847f6bed0efSShaohua Li { 848f6bed0efSShaohua Li int i; 849f6bed0efSShaohua Li int meta_size; 8505036c390SChristoph Hellwig int ret; 851f6bed0efSShaohua Li struct r5l_io_unit *io; 852f6bed0efSShaohua Li 853f6bed0efSShaohua Li meta_size = 854f6bed0efSShaohua Li ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 855f6bed0efSShaohua Li * data_pages) + 856f6bed0efSShaohua Li sizeof(struct r5l_payload_data_parity) + 857f6bed0efSShaohua Li sizeof(__le32) * parity_pages; 858f6bed0efSShaohua Li 8595036c390SChristoph Hellwig ret = r5l_get_meta(log, meta_size); 8605036c390SChristoph Hellwig if (ret) 8615036c390SChristoph 
Hellwig return ret; 8625036c390SChristoph Hellwig 863f6bed0efSShaohua Li io = log->current_io; 864f6bed0efSShaohua Li 8653bddb7f8SSong Liu if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 8663bddb7f8SSong Liu io->has_flush = 1; 8673bddb7f8SSong Liu 868f6bed0efSShaohua Li for (i = 0; i < sh->disks; i++) { 8691e6d690bSSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 8701e6d690bSSong Liu test_bit(R5_InJournal, &sh->dev[i].flags)) 871f6bed0efSShaohua Li continue; 872f6bed0efSShaohua Li if (i == sh->pd_idx || i == sh->qd_idx) 873f6bed0efSShaohua Li continue; 8743bddb7f8SSong Liu if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 8753bddb7f8SSong Liu log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 8763bddb7f8SSong Liu io->has_fua = 1; 8773bddb7f8SSong Liu /* 8783bddb7f8SSong Liu * we need to flush journal to make sure recovery can 8793bddb7f8SSong Liu * reach the data with fua flag 8803bddb7f8SSong Liu */ 8813bddb7f8SSong Liu io->has_flush = 1; 8823bddb7f8SSong Liu } 883f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 884f6bed0efSShaohua Li raid5_compute_blocknr(sh, i, 0), 885f6bed0efSShaohua Li sh->dev[i].log_checksum, 0, false); 886f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[i].page); 887f6bed0efSShaohua Li } 888f6bed0efSShaohua Li 8892ded3703SSong Liu if (parity_pages == 2) { 890f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 891f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 892f6bed0efSShaohua Li sh->dev[sh->qd_idx].log_checksum, true); 893f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 894f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 8952ded3703SSong Liu } else if (parity_pages == 1) { 896f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 897f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 898f6bed0efSShaohua Li 0, false); 899f6bed0efSShaohua Li r5l_append_payload_page(log, 
sh->dev[sh->pd_idx].page); 9002ded3703SSong Liu } else /* Just writing data, not parity, in caching phase */ 9012ded3703SSong Liu BUG_ON(parity_pages != 0); 902f6bed0efSShaohua Li 903f6bed0efSShaohua Li list_add_tail(&sh->log_list, &io->stripe_list); 904f6bed0efSShaohua Li atomic_inc(&io->pending_stripe); 905f6bed0efSShaohua Li sh->log_io = io; 9065036c390SChristoph Hellwig 907a39f7afdSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 908a39f7afdSSong Liu return 0; 909a39f7afdSSong Liu 910a39f7afdSSong Liu if (sh->log_start == MaxSector) { 911a39f7afdSSong Liu BUG_ON(!list_empty(&sh->r5c)); 912a39f7afdSSong Liu sh->log_start = io->log_start; 913a39f7afdSSong Liu spin_lock_irq(&log->stripe_in_journal_lock); 914a39f7afdSSong Liu list_add_tail(&sh->r5c, 915a39f7afdSSong Liu &log->stripe_in_journal_list); 916a39f7afdSSong Liu spin_unlock_irq(&log->stripe_in_journal_lock); 917a39f7afdSSong Liu atomic_inc(&log->stripe_in_journal_count); 918a39f7afdSSong Liu } 9195036c390SChristoph Hellwig return 0; 920f6bed0efSShaohua Li } 921f6bed0efSShaohua Li 922a39f7afdSSong Liu /* add stripe to no_space_stripes, and then wake up reclaim */ 923a39f7afdSSong Liu static inline void r5l_add_no_space_stripe(struct r5l_log *log, 924a39f7afdSSong Liu struct stripe_head *sh) 925a39f7afdSSong Liu { 926a39f7afdSSong Liu spin_lock(&log->no_space_stripes_lock); 927a39f7afdSSong Liu list_add_tail(&sh->log_list, &log->no_space_stripes); 928a39f7afdSSong Liu spin_unlock(&log->no_space_stripes_lock); 929a39f7afdSSong Liu } 930a39f7afdSSong Liu 931f6bed0efSShaohua Li /* 932f6bed0efSShaohua Li * running in raid5d, where reclaim could wait for raid5d too (when it flushes 933f6bed0efSShaohua Li * data from log to raid disks), so we shouldn't wait for reclaim here 934f6bed0efSShaohua Li */ 935f6bed0efSShaohua Li int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 936f6bed0efSShaohua Li { 937a39f7afdSSong Liu struct r5conf *conf = sh->raid_conf; 938f6bed0efSShaohua Li 
int write_disks = 0; 939f6bed0efSShaohua Li int data_pages, parity_pages; 940f6bed0efSShaohua Li int reserve; 941f6bed0efSShaohua Li int i; 9425036c390SChristoph Hellwig int ret = 0; 943a39f7afdSSong Liu bool wake_reclaim = false; 944f6bed0efSShaohua Li 945f6bed0efSShaohua Li if (!log) 946f6bed0efSShaohua Li return -EAGAIN; 947f6bed0efSShaohua Li /* Don't support stripe batch */ 948f6bed0efSShaohua Li if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 949f6bed0efSShaohua Li test_bit(STRIPE_SYNCING, &sh->state)) { 950f6bed0efSShaohua Li /* the stripe is written to log, we start writing it to raid */ 951f6bed0efSShaohua Li clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 952f6bed0efSShaohua Li return -EAGAIN; 953f6bed0efSShaohua Li } 954f6bed0efSShaohua Li 9552ded3703SSong Liu WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 9562ded3703SSong Liu 957f6bed0efSShaohua Li for (i = 0; i < sh->disks; i++) { 958f6bed0efSShaohua Li void *addr; 959f6bed0efSShaohua Li 9601e6d690bSSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 9611e6d690bSSong Liu test_bit(R5_InJournal, &sh->dev[i].flags)) 962f6bed0efSShaohua Li continue; 9631e6d690bSSong Liu 964f6bed0efSShaohua Li write_disks++; 965f6bed0efSShaohua Li /* checksum is already calculated in last run */ 966f6bed0efSShaohua Li if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 967f6bed0efSShaohua Li continue; 968f6bed0efSShaohua Li addr = kmap_atomic(sh->dev[i].page); 9695cb2fbd6SShaohua Li sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 970f6bed0efSShaohua Li addr, PAGE_SIZE); 971f6bed0efSShaohua Li kunmap_atomic(addr); 972f6bed0efSShaohua Li } 973f6bed0efSShaohua Li parity_pages = 1 + !!(sh->qd_idx >= 0); 974f6bed0efSShaohua Li data_pages = write_disks - parity_pages; 975f6bed0efSShaohua Li 976f6bed0efSShaohua Li set_bit(STRIPE_LOG_TRAPPED, &sh->state); 977253f9fd4SShaohua Li /* 978253f9fd4SShaohua Li * The stripe must enter state machine again to finish the write, so 979253f9fd4SShaohua Li * 
don't delay. 980253f9fd4SShaohua Li */ 981253f9fd4SShaohua Li clear_bit(STRIPE_DELAYED, &sh->state); 982f6bed0efSShaohua Li atomic_inc(&sh->count); 983f6bed0efSShaohua Li 984f6bed0efSShaohua Li mutex_lock(&log->io_mutex); 985f6bed0efSShaohua Li /* meta + data */ 986f6bed0efSShaohua Li reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 987f6bed0efSShaohua Li 988a39f7afdSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 989a39f7afdSSong Liu if (!r5l_has_free_space(log, reserve)) { 990a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 991a39f7afdSSong Liu wake_reclaim = true; 9925036c390SChristoph Hellwig } else { 9935036c390SChristoph Hellwig ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 9945036c390SChristoph Hellwig if (ret) { 9955036c390SChristoph Hellwig spin_lock_irq(&log->io_list_lock); 996a39f7afdSSong Liu list_add_tail(&sh->log_list, 997a39f7afdSSong Liu &log->no_mem_stripes); 9985036c390SChristoph Hellwig spin_unlock_irq(&log->io_list_lock); 999f6bed0efSShaohua Li } 10005036c390SChristoph Hellwig } 1001a39f7afdSSong Liu } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 1002a39f7afdSSong Liu /* 1003a39f7afdSSong Liu * log space critical, do not process stripes that are 1004a39f7afdSSong Liu * not in cache yet (sh->log_start == MaxSector). 
1005a39f7afdSSong Liu */ 1006a39f7afdSSong Liu if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 1007a39f7afdSSong Liu sh->log_start == MaxSector) { 1008a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 1009a39f7afdSSong Liu wake_reclaim = true; 1010a39f7afdSSong Liu reserve = 0; 1011a39f7afdSSong Liu } else if (!r5l_has_free_space(log, reserve)) { 1012a39f7afdSSong Liu if (sh->log_start == log->last_checkpoint) 1013a39f7afdSSong Liu BUG(); 1014a39f7afdSSong Liu else 1015a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 1016a39f7afdSSong Liu } else { 1017a39f7afdSSong Liu ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 1018a39f7afdSSong Liu if (ret) { 1019a39f7afdSSong Liu spin_lock_irq(&log->io_list_lock); 1020a39f7afdSSong Liu list_add_tail(&sh->log_list, 1021a39f7afdSSong Liu &log->no_mem_stripes); 1022a39f7afdSSong Liu spin_unlock_irq(&log->io_list_lock); 1023a39f7afdSSong Liu } 1024a39f7afdSSong Liu } 1025a39f7afdSSong Liu } 1026f6bed0efSShaohua Li 10275036c390SChristoph Hellwig mutex_unlock(&log->io_mutex); 1028a39f7afdSSong Liu if (wake_reclaim) 1029a39f7afdSSong Liu r5l_wake_reclaim(log, reserve); 1030f6bed0efSShaohua Li return 0; 1031f6bed0efSShaohua Li } 1032f6bed0efSShaohua Li 1033f6bed0efSShaohua Li void r5l_write_stripe_run(struct r5l_log *log) 1034f6bed0efSShaohua Li { 1035f6bed0efSShaohua Li if (!log) 1036f6bed0efSShaohua Li return; 1037f6bed0efSShaohua Li mutex_lock(&log->io_mutex); 1038f6bed0efSShaohua Li r5l_submit_current_io(log); 1039f6bed0efSShaohua Li mutex_unlock(&log->io_mutex); 1040f6bed0efSShaohua Li } 1041f6bed0efSShaohua Li 1042828cbe98SShaohua Li int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 1043828cbe98SShaohua Li { 1044828cbe98SShaohua Li if (!log) 1045828cbe98SShaohua Li return -ENODEV; 10463bddb7f8SSong Liu 10473bddb7f8SSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 1048828cbe98SShaohua Li /* 10493bddb7f8SSong Liu * in write through (journal only) 10503bddb7f8SSong Liu * 
we flush log disk cache first, then write stripe data to 10513bddb7f8SSong Liu * raid disks. So if bio is finished, the log disk cache is 10523bddb7f8SSong Liu * flushed already. The recovery guarantees we can recovery 10533bddb7f8SSong Liu * the bio from log disk, so we don't need to flush again 1054828cbe98SShaohua Li */ 1055828cbe98SShaohua Li if (bio->bi_iter.bi_size == 0) { 1056828cbe98SShaohua Li bio_endio(bio); 1057828cbe98SShaohua Li return 0; 1058828cbe98SShaohua Li } 10591eff9d32SJens Axboe bio->bi_opf &= ~REQ_PREFLUSH; 10603bddb7f8SSong Liu } else { 10613bddb7f8SSong Liu /* write back (with cache) */ 10623bddb7f8SSong Liu if (bio->bi_iter.bi_size == 0) { 10633bddb7f8SSong Liu mutex_lock(&log->io_mutex); 10643bddb7f8SSong Liu r5l_get_meta(log, 0); 10653bddb7f8SSong Liu bio_list_add(&log->current_io->flush_barriers, bio); 10663bddb7f8SSong Liu log->current_io->has_flush = 1; 10673bddb7f8SSong Liu log->current_io->has_null_flush = 1; 10683bddb7f8SSong Liu atomic_inc(&log->current_io->pending_stripe); 10693bddb7f8SSong Liu r5l_submit_current_io(log); 10703bddb7f8SSong Liu mutex_unlock(&log->io_mutex); 10713bddb7f8SSong Liu return 0; 10723bddb7f8SSong Liu } 10733bddb7f8SSong Liu } 1074828cbe98SShaohua Li return -EAGAIN; 1075828cbe98SShaohua Li } 1076828cbe98SShaohua Li 1077f6bed0efSShaohua Li /* This will run after log space is reclaimed */ 1078f6bed0efSShaohua Li static void r5l_run_no_space_stripes(struct r5l_log *log) 1079f6bed0efSShaohua Li { 1080f6bed0efSShaohua Li struct stripe_head *sh; 1081f6bed0efSShaohua Li 1082f6bed0efSShaohua Li spin_lock(&log->no_space_stripes_lock); 1083f6bed0efSShaohua Li while (!list_empty(&log->no_space_stripes)) { 1084f6bed0efSShaohua Li sh = list_first_entry(&log->no_space_stripes, 1085f6bed0efSShaohua Li struct stripe_head, log_list); 1086f6bed0efSShaohua Li list_del_init(&sh->log_list); 1087f6bed0efSShaohua Li set_bit(STRIPE_HANDLE, &sh->state); 1088f6bed0efSShaohua Li raid5_release_stripe(sh); 1089f6bed0efSShaohua Li } 
1090f6bed0efSShaohua Li spin_unlock(&log->no_space_stripes_lock); 1091f6bed0efSShaohua Li } 1092f6bed0efSShaohua Li 1093a39f7afdSSong Liu /* 1094a39f7afdSSong Liu * calculate new last_checkpoint 1095a39f7afdSSong Liu * for write through mode, returns log->next_checkpoint 1096a39f7afdSSong Liu * for write back, returns log_start of first sh in stripe_in_journal_list 1097a39f7afdSSong Liu */ 1098a39f7afdSSong Liu static sector_t r5c_calculate_new_cp(struct r5conf *conf) 1099a39f7afdSSong Liu { 1100a39f7afdSSong Liu struct stripe_head *sh; 1101a39f7afdSSong Liu struct r5l_log *log = conf->log; 1102a39f7afdSSong Liu sector_t new_cp; 1103a39f7afdSSong Liu unsigned long flags; 1104a39f7afdSSong Liu 1105a39f7afdSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 1106a39f7afdSSong Liu return log->next_checkpoint; 1107a39f7afdSSong Liu 1108a39f7afdSSong Liu spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1109a39f7afdSSong Liu if (list_empty(&conf->log->stripe_in_journal_list)) { 1110a39f7afdSSong Liu /* all stripes flushed */ 1111d3014e21SDan Carpenter spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1112a39f7afdSSong Liu return log->next_checkpoint; 1113a39f7afdSSong Liu } 1114a39f7afdSSong Liu sh = list_first_entry(&conf->log->stripe_in_journal_list, 1115a39f7afdSSong Liu struct stripe_head, r5c); 1116a39f7afdSSong Liu new_cp = sh->log_start; 1117a39f7afdSSong Liu spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1118a39f7afdSSong Liu return new_cp; 1119a39f7afdSSong Liu } 1120a39f7afdSSong Liu 112117036461SChristoph Hellwig static sector_t r5l_reclaimable_space(struct r5l_log *log) 112217036461SChristoph Hellwig { 1123a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 1124a39f7afdSSong Liu 112517036461SChristoph Hellwig return r5l_ring_distance(log, log->last_checkpoint, 1126a39f7afdSSong Liu r5c_calculate_new_cp(conf)); 112717036461SChristoph Hellwig } 112817036461SChristoph Hellwig 11295036c390SChristoph 
Hellwig static void r5l_run_no_mem_stripe(struct r5l_log *log) 11305036c390SChristoph Hellwig { 11315036c390SChristoph Hellwig struct stripe_head *sh; 11325036c390SChristoph Hellwig 11335036c390SChristoph Hellwig assert_spin_locked(&log->io_list_lock); 11345036c390SChristoph Hellwig 11355036c390SChristoph Hellwig if (!list_empty(&log->no_mem_stripes)) { 11365036c390SChristoph Hellwig sh = list_first_entry(&log->no_mem_stripes, 11375036c390SChristoph Hellwig struct stripe_head, log_list); 11385036c390SChristoph Hellwig list_del_init(&sh->log_list); 11395036c390SChristoph Hellwig set_bit(STRIPE_HANDLE, &sh->state); 11405036c390SChristoph Hellwig raid5_release_stripe(sh); 11415036c390SChristoph Hellwig } 11425036c390SChristoph Hellwig } 11435036c390SChristoph Hellwig 114404732f74SChristoph Hellwig static bool r5l_complete_finished_ios(struct r5l_log *log) 114517036461SChristoph Hellwig { 114617036461SChristoph Hellwig struct r5l_io_unit *io, *next; 114717036461SChristoph Hellwig bool found = false; 114817036461SChristoph Hellwig 114917036461SChristoph Hellwig assert_spin_locked(&log->io_list_lock); 115017036461SChristoph Hellwig 115104732f74SChristoph Hellwig list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 115217036461SChristoph Hellwig /* don't change list order */ 115317036461SChristoph Hellwig if (io->state < IO_UNIT_STRIPE_END) 115417036461SChristoph Hellwig break; 115517036461SChristoph Hellwig 115617036461SChristoph Hellwig log->next_checkpoint = io->log_start; 115717036461SChristoph Hellwig 115817036461SChristoph Hellwig list_del(&io->log_sibling); 11595036c390SChristoph Hellwig mempool_free(io, log->io_pool); 11605036c390SChristoph Hellwig r5l_run_no_mem_stripe(log); 116117036461SChristoph Hellwig 116217036461SChristoph Hellwig found = true; 116317036461SChristoph Hellwig } 116417036461SChristoph Hellwig 116517036461SChristoph Hellwig return found; 116617036461SChristoph Hellwig } 116717036461SChristoph Hellwig 1168509ffec7SChristoph 
Hellwig static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 1169509ffec7SChristoph Hellwig { 1170509ffec7SChristoph Hellwig struct r5l_log *log = io->log; 1171a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 1172509ffec7SChristoph Hellwig unsigned long flags; 1173509ffec7SChristoph Hellwig 1174509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 1175509ffec7SChristoph Hellwig __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 117617036461SChristoph Hellwig 117704732f74SChristoph Hellwig if (!r5l_complete_finished_ios(log)) { 117885f2f9a4SShaohua Li spin_unlock_irqrestore(&log->io_list_lock, flags); 117985f2f9a4SShaohua Li return; 118085f2f9a4SShaohua Li } 1181509ffec7SChristoph Hellwig 1182a39f7afdSSong Liu if (r5l_reclaimable_space(log) > log->max_free_space || 1183a39f7afdSSong Liu test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 1184509ffec7SChristoph Hellwig r5l_wake_reclaim(log, 0); 1185509ffec7SChristoph Hellwig 1186509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 1187509ffec7SChristoph Hellwig wake_up(&log->iounit_wait); 1188509ffec7SChristoph Hellwig } 1189509ffec7SChristoph Hellwig 11900576b1c6SShaohua Li void r5l_stripe_write_finished(struct stripe_head *sh) 11910576b1c6SShaohua Li { 11920576b1c6SShaohua Li struct r5l_io_unit *io; 11930576b1c6SShaohua Li 11940576b1c6SShaohua Li io = sh->log_io; 11950576b1c6SShaohua Li sh->log_io = NULL; 11960576b1c6SShaohua Li 1197509ffec7SChristoph Hellwig if (io && atomic_dec_and_test(&io->pending_stripe)) 1198509ffec7SChristoph Hellwig __r5l_stripe_write_finished(io); 11990576b1c6SShaohua Li } 12000576b1c6SShaohua Li 1201a8c34f91SShaohua Li static void r5l_log_flush_endio(struct bio *bio) 1202a8c34f91SShaohua Li { 1203a8c34f91SShaohua Li struct r5l_log *log = container_of(bio, struct r5l_log, 1204a8c34f91SShaohua Li flush_bio); 1205a8c34f91SShaohua Li unsigned long flags; 1206a8c34f91SShaohua Li struct r5l_io_unit *io; 1207a8c34f91SShaohua Li 
12086e74a9cfSShaohua Li if (bio->bi_error) 12096e74a9cfSShaohua Li md_error(log->rdev->mddev, log->rdev); 12106e74a9cfSShaohua Li 1211a8c34f91SShaohua Li spin_lock_irqsave(&log->io_list_lock, flags); 1212d8858f43SChristoph Hellwig list_for_each_entry(io, &log->flushing_ios, log_sibling) 1213d8858f43SChristoph Hellwig r5l_io_run_stripes(io); 121404732f74SChristoph Hellwig list_splice_tail_init(&log->flushing_ios, &log->finished_ios); 1215a8c34f91SShaohua Li spin_unlock_irqrestore(&log->io_list_lock, flags); 1216a8c34f91SShaohua Li } 1217a8c34f91SShaohua Li 12180576b1c6SShaohua Li /* 12190576b1c6SShaohua Li * Starting dispatch IO to raid. 12200576b1c6SShaohua Li * io_unit(meta) consists of a log. There is one situation we want to avoid. A 12210576b1c6SShaohua Li * broken meta in the middle of a log causes recovery can't find meta at the 12220576b1c6SShaohua Li * head of log. If operations require meta at the head persistent in log, we 12230576b1c6SShaohua Li * must make sure meta before it persistent in log too. A case is: 12240576b1c6SShaohua Li * 12250576b1c6SShaohua Li * stripe data/parity is in log, we start write stripe to raid disks. stripe 12260576b1c6SShaohua Li * data/parity must be persistent in log before we do the write to raid disks. 12270576b1c6SShaohua Li * 12280576b1c6SShaohua Li * The solution is we restrictly maintain io_unit list order. In this case, we 12290576b1c6SShaohua Li * only write stripes of an io_unit to raid disks till the io_unit is the first 12300576b1c6SShaohua Li * one whose data/parity is in log. 
12310576b1c6SShaohua Li */ 12320576b1c6SShaohua Li void r5l_flush_stripe_to_raid(struct r5l_log *log) 12330576b1c6SShaohua Li { 1234a8c34f91SShaohua Li bool do_flush; 123556fef7c6SChristoph Hellwig 123656fef7c6SChristoph Hellwig if (!log || !log->need_cache_flush) 12370576b1c6SShaohua Li return; 12380576b1c6SShaohua Li 1239a8c34f91SShaohua Li spin_lock_irq(&log->io_list_lock); 1240a8c34f91SShaohua Li /* flush bio is running */ 1241a8c34f91SShaohua Li if (!list_empty(&log->flushing_ios)) { 1242a8c34f91SShaohua Li spin_unlock_irq(&log->io_list_lock); 12430576b1c6SShaohua Li return; 12440576b1c6SShaohua Li } 1245a8c34f91SShaohua Li list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); 1246a8c34f91SShaohua Li do_flush = !list_empty(&log->flushing_ios); 12470576b1c6SShaohua Li spin_unlock_irq(&log->io_list_lock); 1248a8c34f91SShaohua Li 1249a8c34f91SShaohua Li if (!do_flush) 1250a8c34f91SShaohua Li return; 1251a8c34f91SShaohua Li bio_reset(&log->flush_bio); 1252a8c34f91SShaohua Li log->flush_bio.bi_bdev = log->rdev->bdev; 1253a8c34f91SShaohua Li log->flush_bio.bi_end_io = r5l_log_flush_endio; 125470fd7614SChristoph Hellwig log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 12554e49ea4aSMike Christie submit_bio(&log->flush_bio); 12560576b1c6SShaohua Li } 12570576b1c6SShaohua Li 12580576b1c6SShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp); 12594b482044SShaohua Li static void r5l_write_super_and_discard_space(struct r5l_log *log, 12604b482044SShaohua Li sector_t end) 12614b482044SShaohua Li { 12624b482044SShaohua Li struct block_device *bdev = log->rdev->bdev; 12634b482044SShaohua Li struct mddev *mddev; 12644b482044SShaohua Li 12654b482044SShaohua Li r5l_write_super(log, end); 12664b482044SShaohua Li 12674b482044SShaohua Li if (!blk_queue_discard(bdev_get_queue(bdev))) 12684b482044SShaohua Li return; 12694b482044SShaohua Li 12704b482044SShaohua Li mddev = log->rdev->mddev; 12714b482044SShaohua Li /* 12728e018c21SShaohua Li * Discard could 
zero data, so before discard we must make sure 12738e018c21SShaohua Li * superblock is updated to new log tail. Updating superblock (either 12748e018c21SShaohua Li * directly call md_update_sb() or depend on md thread) must hold 12758e018c21SShaohua Li * reconfig mutex. On the other hand, raid5_quiesce is called with 12768e018c21SShaohua Li * reconfig_mutex hold. The first step of raid5_quiesce() is waitting 12778e018c21SShaohua Li * for all IO finish, hence waitting for reclaim thread, while reclaim 12788e018c21SShaohua Li * thread is calling this function and waitting for reconfig mutex. So 12798e018c21SShaohua Li * there is a deadlock. We workaround this issue with a trylock. 12808e018c21SShaohua Li * FIXME: we could miss discard if we can't take reconfig mutex 12814b482044SShaohua Li */ 12822953079cSShaohua Li set_mask_bits(&mddev->sb_flags, 0, 12832953079cSShaohua Li BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 12848e018c21SShaohua Li if (!mddev_trylock(mddev)) 12858e018c21SShaohua Li return; 12864b482044SShaohua Li md_update_sb(mddev, 1); 12878e018c21SShaohua Li mddev_unlock(mddev); 12884b482044SShaohua Li 12896e74a9cfSShaohua Li /* discard IO error really doesn't matter, ignore it */ 12904b482044SShaohua Li if (log->last_checkpoint < end) { 12914b482044SShaohua Li blkdev_issue_discard(bdev, 12924b482044SShaohua Li log->last_checkpoint + log->rdev->data_offset, 12934b482044SShaohua Li end - log->last_checkpoint, GFP_NOIO, 0); 12944b482044SShaohua Li } else { 12954b482044SShaohua Li blkdev_issue_discard(bdev, 12964b482044SShaohua Li log->last_checkpoint + log->rdev->data_offset, 12974b482044SShaohua Li log->device_size - log->last_checkpoint, 12984b482044SShaohua Li GFP_NOIO, 0); 12994b482044SShaohua Li blkdev_issue_discard(bdev, log->rdev->data_offset, end, 13004b482044SShaohua Li GFP_NOIO, 0); 13014b482044SShaohua Li } 13024b482044SShaohua Li } 13034b482044SShaohua Li 1304a39f7afdSSong Liu /* 1305a39f7afdSSong Liu * r5c_flush_stripe moves stripe from 
cached list to handle_list. When called, 1306a39f7afdSSong Liu * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. 1307a39f7afdSSong Liu * 1308a39f7afdSSong Liu * must hold conf->device_lock 1309a39f7afdSSong Liu */ 1310a39f7afdSSong Liu static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) 1311a39f7afdSSong Liu { 1312a39f7afdSSong Liu BUG_ON(list_empty(&sh->lru)); 1313a39f7afdSSong Liu BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 1314a39f7afdSSong Liu BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 1315a39f7afdSSong Liu 1316a39f7afdSSong Liu /* 1317a39f7afdSSong Liu * The stripe is not ON_RELEASE_LIST, so it is safe to call 1318a39f7afdSSong Liu * raid5_release_stripe() while holding conf->device_lock 1319a39f7afdSSong Liu */ 1320a39f7afdSSong Liu BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 1321a39f7afdSSong Liu assert_spin_locked(&conf->device_lock); 1322a39f7afdSSong Liu 1323a39f7afdSSong Liu list_del_init(&sh->lru); 1324a39f7afdSSong Liu atomic_inc(&sh->count); 1325a39f7afdSSong Liu 1326a39f7afdSSong Liu set_bit(STRIPE_HANDLE, &sh->state); 1327a39f7afdSSong Liu atomic_inc(&conf->active_stripes); 1328a39f7afdSSong Liu r5c_make_stripe_write_out(sh); 1329a39f7afdSSong Liu 1330*e33fbb9cSShaohua Li if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 1331*e33fbb9cSShaohua Li atomic_inc(&conf->r5c_flushing_partial_stripes); 1332*e33fbb9cSShaohua Li else 1333*e33fbb9cSShaohua Li atomic_inc(&conf->r5c_flushing_full_stripes); 1334a39f7afdSSong Liu raid5_release_stripe(sh); 1335a39f7afdSSong Liu } 1336a39f7afdSSong Liu 1337a39f7afdSSong Liu /* 1338a39f7afdSSong Liu * if num == 0, flush all full stripes 1339a39f7afdSSong Liu * if num > 0, flush all full stripes. If less than num full stripes are 1340a39f7afdSSong Liu * flushed, flush some partial stripes until totally num stripes are 1341a39f7afdSSong Liu * flushed or there is no more cached stripes. 
1342a39f7afdSSong Liu */ 1343a39f7afdSSong Liu void r5c_flush_cache(struct r5conf *conf, int num) 1344a39f7afdSSong Liu { 1345a39f7afdSSong Liu int count; 1346a39f7afdSSong Liu struct stripe_head *sh, *next; 1347a39f7afdSSong Liu 1348a39f7afdSSong Liu assert_spin_locked(&conf->device_lock); 1349a39f7afdSSong Liu if (!conf->log) 1350a39f7afdSSong Liu return; 1351a39f7afdSSong Liu 1352a39f7afdSSong Liu count = 0; 1353a39f7afdSSong Liu list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 1354a39f7afdSSong Liu r5c_flush_stripe(conf, sh); 1355a39f7afdSSong Liu count++; 1356a39f7afdSSong Liu } 1357a39f7afdSSong Liu 1358a39f7afdSSong Liu if (count >= num) 1359a39f7afdSSong Liu return; 1360a39f7afdSSong Liu list_for_each_entry_safe(sh, next, 1361a39f7afdSSong Liu &conf->r5c_partial_stripe_list, lru) { 1362a39f7afdSSong Liu r5c_flush_stripe(conf, sh); 1363a39f7afdSSong Liu if (++count >= num) 1364a39f7afdSSong Liu break; 1365a39f7afdSSong Liu } 1366a39f7afdSSong Liu } 1367a39f7afdSSong Liu 1368a39f7afdSSong Liu static void r5c_do_reclaim(struct r5conf *conf) 1369a39f7afdSSong Liu { 1370a39f7afdSSong Liu struct r5l_log *log = conf->log; 1371a39f7afdSSong Liu struct stripe_head *sh; 1372a39f7afdSSong Liu int count = 0; 1373a39f7afdSSong Liu unsigned long flags; 1374a39f7afdSSong Liu int total_cached; 1375a39f7afdSSong Liu int stripes_to_flush; 1376*e33fbb9cSShaohua Li int flushing_partial, flushing_full; 1377a39f7afdSSong Liu 1378a39f7afdSSong Liu if (!r5c_is_writeback(log)) 1379a39f7afdSSong Liu return; 1380a39f7afdSSong Liu 1381*e33fbb9cSShaohua Li flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); 1382*e33fbb9cSShaohua Li flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); 1383a39f7afdSSong Liu total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 1384*e33fbb9cSShaohua Li atomic_read(&conf->r5c_cached_full_stripes) - 1385*e33fbb9cSShaohua Li flushing_full - flushing_partial; 1386a39f7afdSSong Liu 1387a39f7afdSSong 
Liu if (total_cached > conf->min_nr_stripes * 3 / 4 || 1388a39f7afdSSong Liu atomic_read(&conf->empty_inactive_list_nr) > 0) 1389a39f7afdSSong Liu /* 1390a39f7afdSSong Liu * if stripe cache pressure high, flush all full stripes and 1391a39f7afdSSong Liu * some partial stripes 1392a39f7afdSSong Liu */ 1393a39f7afdSSong Liu stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 1394a39f7afdSSong Liu else if (total_cached > conf->min_nr_stripes * 1 / 2 || 1395*e33fbb9cSShaohua Li atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > 1396a39f7afdSSong Liu R5C_FULL_STRIPE_FLUSH_BATCH) 1397a39f7afdSSong Liu /* 1398a39f7afdSSong Liu * if stripe cache pressure moderate, or if there is many full 1399a39f7afdSSong Liu * stripes,flush all full stripes 1400a39f7afdSSong Liu */ 1401a39f7afdSSong Liu stripes_to_flush = 0; 1402a39f7afdSSong Liu else 1403a39f7afdSSong Liu /* no need to flush */ 1404a39f7afdSSong Liu stripes_to_flush = -1; 1405a39f7afdSSong Liu 1406a39f7afdSSong Liu if (stripes_to_flush >= 0) { 1407a39f7afdSSong Liu spin_lock_irqsave(&conf->device_lock, flags); 1408a39f7afdSSong Liu r5c_flush_cache(conf, stripes_to_flush); 1409a39f7afdSSong Liu spin_unlock_irqrestore(&conf->device_lock, flags); 1410a39f7afdSSong Liu } 1411a39f7afdSSong Liu 1412a39f7afdSSong Liu /* if log space is tight, flush stripes on stripe_in_journal_list */ 1413a39f7afdSSong Liu if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 1414a39f7afdSSong Liu spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1415a39f7afdSSong Liu spin_lock(&conf->device_lock); 1416a39f7afdSSong Liu list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 1417a39f7afdSSong Liu /* 1418a39f7afdSSong Liu * stripes on stripe_in_journal_list could be in any 1419a39f7afdSSong Liu * state of the stripe_cache state machine. In this 1420a39f7afdSSong Liu * case, we only want to flush stripe on 1421a39f7afdSSong Liu * r5c_cached_full/partial_stripes. 
The following 1422a39f7afdSSong Liu * condition makes sure the stripe is on one of the 1423a39f7afdSSong Liu * two lists. 1424a39f7afdSSong Liu */ 1425a39f7afdSSong Liu if (!list_empty(&sh->lru) && 1426a39f7afdSSong Liu !test_bit(STRIPE_HANDLE, &sh->state) && 1427a39f7afdSSong Liu atomic_read(&sh->count) == 0) { 1428a39f7afdSSong Liu r5c_flush_stripe(conf, sh); 1429a39f7afdSSong Liu if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 1430a39f7afdSSong Liu break; 1431a39f7afdSSong Liu } 1432e8fd52eeSShaohua Li } 1433a39f7afdSSong Liu spin_unlock(&conf->device_lock); 1434a39f7afdSSong Liu spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1435a39f7afdSSong Liu } 1436f687a33eSSong Liu 1437f687a33eSSong Liu if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 1438f687a33eSSong Liu r5l_run_no_space_stripes(log); 1439f687a33eSSong Liu 1440a39f7afdSSong Liu md_wakeup_thread(conf->mddev->thread); 1441a39f7afdSSong Liu } 14424b482044SShaohua Li 14430576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log) 14440576b1c6SShaohua Li { 1445a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 14460576b1c6SShaohua Li sector_t reclaim_target = xchg(&log->reclaim_target, 0); 144717036461SChristoph Hellwig sector_t reclaimable; 144817036461SChristoph Hellwig sector_t next_checkpoint; 1449a39f7afdSSong Liu bool write_super; 14500576b1c6SShaohua Li 14510576b1c6SShaohua Li spin_lock_irq(&log->io_list_lock); 1452a39f7afdSSong Liu write_super = r5l_reclaimable_space(log) > log->max_free_space || 1453a39f7afdSSong Liu reclaim_target != 0 || !list_empty(&log->no_space_stripes); 14540576b1c6SShaohua Li /* 14550576b1c6SShaohua Li * move proper io_unit to reclaim list. We should not change the order. 
14560576b1c6SShaohua Li * reclaimable/unreclaimable io_unit can be mixed in the list, we 14570576b1c6SShaohua Li * shouldn't reuse space of an unreclaimable io_unit 14580576b1c6SShaohua Li */ 14590576b1c6SShaohua Li while (1) { 146017036461SChristoph Hellwig reclaimable = r5l_reclaimable_space(log); 146117036461SChristoph Hellwig if (reclaimable >= reclaim_target || 14620576b1c6SShaohua Li (list_empty(&log->running_ios) && 14630576b1c6SShaohua Li list_empty(&log->io_end_ios) && 1464a8c34f91SShaohua Li list_empty(&log->flushing_ios) && 146504732f74SChristoph Hellwig list_empty(&log->finished_ios))) 14660576b1c6SShaohua Li break; 14670576b1c6SShaohua Li 146817036461SChristoph Hellwig md_wakeup_thread(log->rdev->mddev->thread); 146917036461SChristoph Hellwig wait_event_lock_irq(log->iounit_wait, 147017036461SChristoph Hellwig r5l_reclaimable_space(log) > reclaimable, 147117036461SChristoph Hellwig log->io_list_lock); 14720576b1c6SShaohua Li } 147317036461SChristoph Hellwig 1474a39f7afdSSong Liu next_checkpoint = r5c_calculate_new_cp(conf); 14750576b1c6SShaohua Li spin_unlock_irq(&log->io_list_lock); 14760576b1c6SShaohua Li 1477a39f7afdSSong Liu if (reclaimable == 0 || !write_super) 14780576b1c6SShaohua Li return; 14790576b1c6SShaohua Li 14800576b1c6SShaohua Li /* 14810576b1c6SShaohua Li * write_super will flush cache of each raid disk. 
We must write super 14820576b1c6SShaohua Li * here, because the log area might be reused soon and we don't want to 14830576b1c6SShaohua Li * confuse recovery 14840576b1c6SShaohua Li */ 14854b482044SShaohua Li r5l_write_super_and_discard_space(log, next_checkpoint); 14860576b1c6SShaohua Li 14870576b1c6SShaohua Li mutex_lock(&log->io_mutex); 148817036461SChristoph Hellwig log->last_checkpoint = next_checkpoint; 1489a39f7afdSSong Liu r5c_update_log_state(log); 14900576b1c6SShaohua Li mutex_unlock(&log->io_mutex); 14910576b1c6SShaohua Li 149217036461SChristoph Hellwig r5l_run_no_space_stripes(log); 14930576b1c6SShaohua Li } 14940576b1c6SShaohua Li 14950576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread) 14960576b1c6SShaohua Li { 14970576b1c6SShaohua Li struct mddev *mddev = thread->mddev; 14980576b1c6SShaohua Li struct r5conf *conf = mddev->private; 14990576b1c6SShaohua Li struct r5l_log *log = conf->log; 15000576b1c6SShaohua Li 15010576b1c6SShaohua Li if (!log) 15020576b1c6SShaohua Li return; 1503a39f7afdSSong Liu r5c_do_reclaim(conf); 15040576b1c6SShaohua Li r5l_do_reclaim(log); 15050576b1c6SShaohua Li } 15060576b1c6SShaohua Li 1507a39f7afdSSong Liu void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1508f6bed0efSShaohua Li { 15090576b1c6SShaohua Li unsigned long target; 15100576b1c6SShaohua Li unsigned long new = (unsigned long)space; /* overflow in theory */ 15110576b1c6SShaohua Li 1512a39f7afdSSong Liu if (!log) 1513a39f7afdSSong Liu return; 15140576b1c6SShaohua Li do { 15150576b1c6SShaohua Li target = log->reclaim_target; 15160576b1c6SShaohua Li if (new < target) 15170576b1c6SShaohua Li return; 15180576b1c6SShaohua Li } while (cmpxchg(&log->reclaim_target, target, new) != target); 15190576b1c6SShaohua Li md_wakeup_thread(log->reclaim_thread); 1520f6bed0efSShaohua Li } 1521f6bed0efSShaohua Li 1522e6c033f7SShaohua Li void r5l_quiesce(struct r5l_log *log, int state) 1523e6c033f7SShaohua Li { 15244b482044SShaohua Li struct mddev *mddev; 
1525e6c033f7SShaohua Li if (!log || state == 2) 1526e6c033f7SShaohua Li return; 1527ce1ccd07SShaohua Li if (state == 0) 1528ce1ccd07SShaohua Li kthread_unpark(log->reclaim_thread->tsk); 1529ce1ccd07SShaohua Li else if (state == 1) { 15304b482044SShaohua Li /* make sure r5l_write_super_and_discard_space exits */ 15314b482044SShaohua Li mddev = log->rdev->mddev; 15324b482044SShaohua Li wake_up(&mddev->sb_wait); 1533ce1ccd07SShaohua Li kthread_park(log->reclaim_thread->tsk); 1534a39f7afdSSong Liu r5l_wake_reclaim(log, MaxSector); 1535e6c033f7SShaohua Li r5l_do_reclaim(log); 1536e6c033f7SShaohua Li } 1537e6c033f7SShaohua Li } 1538e6c033f7SShaohua Li 15396e74a9cfSShaohua Li bool r5l_log_disk_error(struct r5conf *conf) 15406e74a9cfSShaohua Li { 1541f6b6ec5cSShaohua Li struct r5l_log *log; 1542f6b6ec5cSShaohua Li bool ret; 15437dde2ad3SShaohua Li /* don't allow write if journal disk is missing */ 1544f6b6ec5cSShaohua Li rcu_read_lock(); 1545f6b6ec5cSShaohua Li log = rcu_dereference(conf->log); 1546f6b6ec5cSShaohua Li 1547f6b6ec5cSShaohua Li if (!log) 1548f6b6ec5cSShaohua Li ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1549f6b6ec5cSShaohua Li else 1550f6b6ec5cSShaohua Li ret = test_bit(Faulty, &log->rdev->flags); 1551f6b6ec5cSShaohua Li rcu_read_unlock(); 1552f6b6ec5cSShaohua Li return ret; 15536e74a9cfSShaohua Li } 15546e74a9cfSShaohua Li 1555355810d1SShaohua Li struct r5l_recovery_ctx { 1556355810d1SShaohua Li struct page *meta_page; /* current meta */ 1557355810d1SShaohua Li sector_t meta_total_blocks; /* total size of current meta and data */ 1558355810d1SShaohua Li sector_t pos; /* recovery position */ 1559355810d1SShaohua Li u64 seq; /* recovery position seq */ 1560b4c625c6SSong Liu int data_parity_stripes; /* number of data_parity stripes */ 1561b4c625c6SSong Liu int data_only_stripes; /* number of data_only stripes */ 1562b4c625c6SSong Liu struct list_head cached_list; 1563355810d1SShaohua Li }; 1564355810d1SShaohua Li 15659ed988f5SSong Liu static int 
r5l_recovery_read_meta_block(struct r5l_log *log, 1566355810d1SShaohua Li struct r5l_recovery_ctx *ctx) 1567355810d1SShaohua Li { 1568355810d1SShaohua Li struct page *page = ctx->meta_page; 1569355810d1SShaohua Li struct r5l_meta_block *mb; 1570355810d1SShaohua Li u32 crc, stored_crc; 1571355810d1SShaohua Li 1572796a5cf0SMike Christie if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1573796a5cf0SMike Christie false)) 1574355810d1SShaohua Li return -EIO; 1575355810d1SShaohua Li 1576355810d1SShaohua Li mb = page_address(page); 1577355810d1SShaohua Li stored_crc = le32_to_cpu(mb->checksum); 1578355810d1SShaohua Li mb->checksum = 0; 1579355810d1SShaohua Li 1580355810d1SShaohua Li if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1581355810d1SShaohua Li le64_to_cpu(mb->seq) != ctx->seq || 1582355810d1SShaohua Li mb->version != R5LOG_VERSION || 1583355810d1SShaohua Li le64_to_cpu(mb->position) != ctx->pos) 1584355810d1SShaohua Li return -EINVAL; 1585355810d1SShaohua Li 15865cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1587355810d1SShaohua Li if (stored_crc != crc) 1588355810d1SShaohua Li return -EINVAL; 1589355810d1SShaohua Li 1590355810d1SShaohua Li if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1591355810d1SShaohua Li return -EINVAL; 1592355810d1SShaohua Li 1593355810d1SShaohua Li ctx->meta_total_blocks = BLOCK_SECTORS; 1594355810d1SShaohua Li 1595355810d1SShaohua Li return 0; 1596355810d1SShaohua Li } 1597355810d1SShaohua Li 15989ed988f5SSong Liu static void 15999ed988f5SSong Liu r5l_recovery_create_empty_meta_block(struct r5l_log *log, 16009ed988f5SSong Liu struct page *page, 16019ed988f5SSong Liu sector_t pos, u64 seq) 1602355810d1SShaohua Li { 1603355810d1SShaohua Li struct r5l_meta_block *mb; 1604355810d1SShaohua Li 1605355810d1SShaohua Li mb = page_address(page); 16069ed988f5SSong Liu clear_page(mb); 1607355810d1SShaohua Li mb->magic = cpu_to_le32(R5LOG_MAGIC); 1608355810d1SShaohua Li mb->version = R5LOG_VERSION; 
1609355810d1SShaohua Li mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1610355810d1SShaohua Li mb->seq = cpu_to_le64(seq); 1611355810d1SShaohua Li mb->position = cpu_to_le64(pos); 1612355810d1SShaohua Li } 1613355810d1SShaohua Li 1614355810d1SShaohua Li static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1615355810d1SShaohua Li u64 seq) 1616355810d1SShaohua Li { 1617355810d1SShaohua Li struct page *page; 1618355810d1SShaohua Li struct r5l_meta_block *mb; 1619355810d1SShaohua Li 16209ed988f5SSong Liu page = alloc_page(GFP_KERNEL); 1621355810d1SShaohua Li if (!page) 1622355810d1SShaohua Li return -ENOMEM; 16239ed988f5SSong Liu r5l_recovery_create_empty_meta_block(log, page, pos, seq); 1624355810d1SShaohua Li mb = page_address(page); 16255c88f403SSong Liu mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 16265c88f403SSong Liu mb, PAGE_SIZE)); 1627796a5cf0SMike Christie if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 162870fd7614SChristoph Hellwig REQ_FUA, false)) { 1629355810d1SShaohua Li __free_page(page); 1630355810d1SShaohua Li return -EIO; 1631355810d1SShaohua Li } 1632355810d1SShaohua Li __free_page(page); 1633355810d1SShaohua Li return 0; 1634355810d1SShaohua Li } 1635355810d1SShaohua Li 1636b4c625c6SSong Liu /* 1637b4c625c6SSong Liu * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1638b4c625c6SSong Liu * to mark valid (potentially not flushed) data in the journal. 1639b4c625c6SSong Liu * 1640b4c625c6SSong Liu * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1641b4c625c6SSong Liu * so there should not be any mismatch here. 
1642b4c625c6SSong Liu */ 1643b4c625c6SSong Liu static void r5l_recovery_load_data(struct r5l_log *log, 1644b4c625c6SSong Liu struct stripe_head *sh, 1645b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1646b4c625c6SSong Liu struct r5l_payload_data_parity *payload, 1647b4c625c6SSong Liu sector_t log_offset) 1648f6bed0efSShaohua Li { 1649b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1650b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1651b4c625c6SSong Liu int dd_idx; 1652355810d1SShaohua Li 1653b4c625c6SSong Liu raid5_compute_sector(conf, 1654b4c625c6SSong Liu le64_to_cpu(payload->location), 0, 1655b4c625c6SSong Liu &dd_idx, sh); 1656b4c625c6SSong Liu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1657b4c625c6SSong Liu sh->dev[dd_idx].page, REQ_OP_READ, 0, false); 1658b4c625c6SSong Liu sh->dev[dd_idx].log_checksum = 1659b4c625c6SSong Liu le32_to_cpu(payload->checksum[0]); 1660b4c625c6SSong Liu ctx->meta_total_blocks += BLOCK_SECTORS; 1661b4c625c6SSong Liu 1662b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 1663b4c625c6SSong Liu set_bit(STRIPE_R5C_CACHING, &sh->state); 1664b4c625c6SSong Liu } 1665b4c625c6SSong Liu 1666b4c625c6SSong Liu static void r5l_recovery_load_parity(struct r5l_log *log, 1667b4c625c6SSong Liu struct stripe_head *sh, 1668b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1669b4c625c6SSong Liu struct r5l_payload_data_parity *payload, 1670b4c625c6SSong Liu sector_t log_offset) 1671b4c625c6SSong Liu { 1672b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1673b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1674b4c625c6SSong Liu 1675b4c625c6SSong Liu ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 1676b4c625c6SSong Liu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1677b4c625c6SSong Liu sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); 1678b4c625c6SSong Liu sh->dev[sh->pd_idx].log_checksum = 1679b4c625c6SSong Liu le32_to_cpu(payload->checksum[0]); 1680b4c625c6SSong Liu set_bit(R5_Wantwrite, 
&sh->dev[sh->pd_idx].flags); 1681b4c625c6SSong Liu 1682b4c625c6SSong Liu if (sh->qd_idx >= 0) { 1683b4c625c6SSong Liu sync_page_io(log->rdev, 1684b4c625c6SSong Liu r5l_ring_add(log, log_offset, BLOCK_SECTORS), 1685b4c625c6SSong Liu PAGE_SIZE, sh->dev[sh->qd_idx].page, 1686b4c625c6SSong Liu REQ_OP_READ, 0, false); 1687b4c625c6SSong Liu sh->dev[sh->qd_idx].log_checksum = 1688b4c625c6SSong Liu le32_to_cpu(payload->checksum[1]); 1689b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 1690b4c625c6SSong Liu } 1691b4c625c6SSong Liu clear_bit(STRIPE_R5C_CACHING, &sh->state); 1692b4c625c6SSong Liu } 1693b4c625c6SSong Liu 1694b4c625c6SSong Liu static void r5l_recovery_reset_stripe(struct stripe_head *sh) 1695b4c625c6SSong Liu { 1696b4c625c6SSong Liu int i; 1697b4c625c6SSong Liu 1698b4c625c6SSong Liu sh->state = 0; 1699b4c625c6SSong Liu sh->log_start = MaxSector; 1700b4c625c6SSong Liu for (i = sh->disks; i--; ) 1701b4c625c6SSong Liu sh->dev[i].flags = 0; 1702b4c625c6SSong Liu } 1703b4c625c6SSong Liu 1704b4c625c6SSong Liu static void 1705b4c625c6SSong Liu r5l_recovery_replay_one_stripe(struct r5conf *conf, 1706b4c625c6SSong Liu struct stripe_head *sh, 1707b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1708b4c625c6SSong Liu { 1709b4c625c6SSong Liu struct md_rdev *rdev, *rrdev; 1710b4c625c6SSong Liu int disk_index; 1711b4c625c6SSong Liu int data_count = 0; 1712b4c625c6SSong Liu 1713b4c625c6SSong Liu for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1714b4c625c6SSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1715b4c625c6SSong Liu continue; 1716b4c625c6SSong Liu if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) 1717b4c625c6SSong Liu continue; 1718b4c625c6SSong Liu data_count++; 1719b4c625c6SSong Liu } 1720b4c625c6SSong Liu 1721b4c625c6SSong Liu /* 1722b4c625c6SSong Liu * stripes that only have parity must have been flushed 1723b4c625c6SSong Liu * before the crash that we are now recovering from, so 1724b4c625c6SSong Liu * 
there is nothing more to recovery. 1725b4c625c6SSong Liu */ 1726b4c625c6SSong Liu if (data_count == 0) 1727b4c625c6SSong Liu goto out; 1728b4c625c6SSong Liu 1729b4c625c6SSong Liu for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1730b4c625c6SSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1731b4c625c6SSong Liu continue; 1732b4c625c6SSong Liu 1733b4c625c6SSong Liu /* in case device is broken */ 1734b4c625c6SSong Liu rcu_read_lock(); 1735b4c625c6SSong Liu rdev = rcu_dereference(conf->disks[disk_index].rdev); 1736b4c625c6SSong Liu if (rdev) { 1737b4c625c6SSong Liu atomic_inc(&rdev->nr_pending); 1738b4c625c6SSong Liu rcu_read_unlock(); 1739b4c625c6SSong Liu sync_page_io(rdev, sh->sector, PAGE_SIZE, 1740b4c625c6SSong Liu sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1741b4c625c6SSong Liu false); 1742b4c625c6SSong Liu rdev_dec_pending(rdev, rdev->mddev); 1743b4c625c6SSong Liu rcu_read_lock(); 1744b4c625c6SSong Liu } 1745b4c625c6SSong Liu rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1746b4c625c6SSong Liu if (rrdev) { 1747b4c625c6SSong Liu atomic_inc(&rrdev->nr_pending); 1748b4c625c6SSong Liu rcu_read_unlock(); 1749b4c625c6SSong Liu sync_page_io(rrdev, sh->sector, PAGE_SIZE, 1750b4c625c6SSong Liu sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1751b4c625c6SSong Liu false); 1752b4c625c6SSong Liu rdev_dec_pending(rrdev, rrdev->mddev); 1753b4c625c6SSong Liu rcu_read_lock(); 1754b4c625c6SSong Liu } 1755b4c625c6SSong Liu rcu_read_unlock(); 1756b4c625c6SSong Liu } 1757b4c625c6SSong Liu ctx->data_parity_stripes++; 1758b4c625c6SSong Liu out: 1759b4c625c6SSong Liu r5l_recovery_reset_stripe(sh); 1760b4c625c6SSong Liu } 1761b4c625c6SSong Liu 1762b4c625c6SSong Liu static struct stripe_head * 1763b4c625c6SSong Liu r5c_recovery_alloc_stripe(struct r5conf *conf, 17643c66abbaSSong Liu sector_t stripe_sect) 1765b4c625c6SSong Liu { 1766b4c625c6SSong Liu struct stripe_head *sh; 1767b4c625c6SSong Liu 1768b4c625c6SSong Liu sh = 
raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); 1769b4c625c6SSong Liu if (!sh) 1770b4c625c6SSong Liu return NULL; /* no more stripe available */ 1771b4c625c6SSong Liu 1772b4c625c6SSong Liu r5l_recovery_reset_stripe(sh); 1773b4c625c6SSong Liu 1774b4c625c6SSong Liu return sh; 1775b4c625c6SSong Liu } 1776b4c625c6SSong Liu 1777b4c625c6SSong Liu static struct stripe_head * 1778b4c625c6SSong Liu r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 1779b4c625c6SSong Liu { 1780b4c625c6SSong Liu struct stripe_head *sh; 1781b4c625c6SSong Liu 1782b4c625c6SSong Liu list_for_each_entry(sh, list, lru) 1783b4c625c6SSong Liu if (sh->sector == sect) 1784b4c625c6SSong Liu return sh; 1785b4c625c6SSong Liu return NULL; 1786b4c625c6SSong Liu } 1787b4c625c6SSong Liu 1788b4c625c6SSong Liu static void 1789b4c625c6SSong Liu r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1790b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1791b4c625c6SSong Liu { 1792b4c625c6SSong Liu struct stripe_head *sh, *next; 1793b4c625c6SSong Liu 1794b4c625c6SSong Liu list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1795b4c625c6SSong Liu r5l_recovery_reset_stripe(sh); 1796b4c625c6SSong Liu list_del_init(&sh->lru); 1797b4c625c6SSong Liu raid5_release_stripe(sh); 1798b4c625c6SSong Liu } 1799b4c625c6SSong Liu } 1800b4c625c6SSong Liu 1801b4c625c6SSong Liu static void 1802b4c625c6SSong Liu r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1803b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1804b4c625c6SSong Liu { 1805b4c625c6SSong Liu struct stripe_head *sh, *next; 1806b4c625c6SSong Liu 1807b4c625c6SSong Liu list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1808b4c625c6SSong Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1809b4c625c6SSong Liu r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1810b4c625c6SSong Liu list_del_init(&sh->lru); 1811b4c625c6SSong Liu raid5_release_stripe(sh); 1812b4c625c6SSong Liu } 1813b4c625c6SSong Liu } 
1814b4c625c6SSong Liu 1815b4c625c6SSong Liu /* if matches return 0; otherwise return -EINVAL */ 1816b4c625c6SSong Liu static int 1817b4c625c6SSong Liu r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1818b4c625c6SSong Liu sector_t log_offset, __le32 log_checksum) 1819b4c625c6SSong Liu { 1820b4c625c6SSong Liu void *addr; 1821b4c625c6SSong Liu u32 checksum; 1822b4c625c6SSong Liu 1823b4c625c6SSong Liu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1824b4c625c6SSong Liu page, REQ_OP_READ, 0, false); 1825b4c625c6SSong Liu addr = kmap_atomic(page); 1826b4c625c6SSong Liu checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1827b4c625c6SSong Liu kunmap_atomic(addr); 1828b4c625c6SSong Liu return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; 1829b4c625c6SSong Liu } 1830b4c625c6SSong Liu 1831b4c625c6SSong Liu /* 1832b4c625c6SSong Liu * before loading data to stripe cache, we need verify checksum for all data, 1833b4c625c6SSong Liu * if there is mismatch for any data page, we drop all data in the mata block 1834b4c625c6SSong Liu */ 1835b4c625c6SSong Liu static int 1836b4c625c6SSong Liu r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, 1837b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1838b4c625c6SSong Liu { 1839b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1840b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1841b4c625c6SSong Liu struct r5l_meta_block *mb = page_address(ctx->meta_page); 1842b4c625c6SSong Liu sector_t mb_offset = sizeof(struct r5l_meta_block); 1843b4c625c6SSong Liu sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1844b4c625c6SSong Liu struct page *page; 1845b4c625c6SSong Liu struct r5l_payload_data_parity *payload; 1846b4c625c6SSong Liu 1847b4c625c6SSong Liu page = alloc_page(GFP_KERNEL); 1848b4c625c6SSong Liu if (!page) 1849355810d1SShaohua Li return -ENOMEM; 1850355810d1SShaohua Li 1851b4c625c6SSong Liu while (mb_offset < le32_to_cpu(mb->meta_size)) { 1852b4c625c6SSong 
Liu payload = (void *)mb + mb_offset; 1853b4c625c6SSong Liu 1854b4c625c6SSong Liu if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1855b4c625c6SSong Liu if (r5l_recovery_verify_data_checksum( 1856b4c625c6SSong Liu log, page, log_offset, 1857b4c625c6SSong Liu payload->checksum[0]) < 0) 1858b4c625c6SSong Liu goto mismatch; 1859b4c625c6SSong Liu } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { 1860b4c625c6SSong Liu if (r5l_recovery_verify_data_checksum( 1861b4c625c6SSong Liu log, page, log_offset, 1862b4c625c6SSong Liu payload->checksum[0]) < 0) 1863b4c625c6SSong Liu goto mismatch; 1864b4c625c6SSong Liu if (conf->max_degraded == 2 && /* q for RAID 6 */ 1865b4c625c6SSong Liu r5l_recovery_verify_data_checksum( 1866b4c625c6SSong Liu log, page, 1867b4c625c6SSong Liu r5l_ring_add(log, log_offset, 1868b4c625c6SSong Liu BLOCK_SECTORS), 1869b4c625c6SSong Liu payload->checksum[1]) < 0) 1870b4c625c6SSong Liu goto mismatch; 1871b4c625c6SSong Liu } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ 1872b4c625c6SSong Liu goto mismatch; 1873b4c625c6SSong Liu 1874b4c625c6SSong Liu log_offset = r5l_ring_add(log, log_offset, 1875b4c625c6SSong Liu le32_to_cpu(payload->size)); 1876b4c625c6SSong Liu 1877b4c625c6SSong Liu mb_offset += sizeof(struct r5l_payload_data_parity) + 1878b4c625c6SSong Liu sizeof(__le32) * 1879b4c625c6SSong Liu (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1880b4c625c6SSong Liu } 1881b4c625c6SSong Liu 1882b4c625c6SSong Liu put_page(page); 1883b4c625c6SSong Liu return 0; 1884b4c625c6SSong Liu 1885b4c625c6SSong Liu mismatch: 1886b4c625c6SSong Liu put_page(page); 1887b4c625c6SSong Liu return -EINVAL; 1888b4c625c6SSong Liu } 1889b4c625c6SSong Liu 1890b4c625c6SSong Liu /* 1891b4c625c6SSong Liu * Analyze all data/parity pages in one meta block 1892b4c625c6SSong Liu * Returns: 1893b4c625c6SSong Liu * 0 for success 1894b4c625c6SSong Liu * -EINVAL for unknown playload type 1895b4c625c6SSong Liu * -EAGAIN for checksum mismatch of data page 
1896b4c625c6SSong Liu * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) 1897b4c625c6SSong Liu */ 1898b4c625c6SSong Liu static int 1899b4c625c6SSong Liu r5c_recovery_analyze_meta_block(struct r5l_log *log, 1900b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1901b4c625c6SSong Liu struct list_head *cached_stripe_list) 1902b4c625c6SSong Liu { 1903b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1904b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1905b4c625c6SSong Liu struct r5l_meta_block *mb; 1906b4c625c6SSong Liu struct r5l_payload_data_parity *payload; 1907b4c625c6SSong Liu int mb_offset; 1908b4c625c6SSong Liu sector_t log_offset; 1909b4c625c6SSong Liu sector_t stripe_sect; 1910b4c625c6SSong Liu struct stripe_head *sh; 1911b4c625c6SSong Liu int ret; 1912b4c625c6SSong Liu 1913b4c625c6SSong Liu /* 1914b4c625c6SSong Liu * for mismatch in data blocks, we will drop all data in this mb, but 1915b4c625c6SSong Liu * we will still read next mb for other data with FLUSH flag, as 1916b4c625c6SSong Liu * io_unit could finish out of order. 1917b4c625c6SSong Liu */ 1918b4c625c6SSong Liu ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); 1919b4c625c6SSong Liu if (ret == -EINVAL) 1920b4c625c6SSong Liu return -EAGAIN; 1921b4c625c6SSong Liu else if (ret) 1922b4c625c6SSong Liu return ret; /* -ENOMEM duo to alloc_page() failed */ 1923b4c625c6SSong Liu 1924b4c625c6SSong Liu mb = page_address(ctx->meta_page); 1925b4c625c6SSong Liu mb_offset = sizeof(struct r5l_meta_block); 1926b4c625c6SSong Liu log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1927b4c625c6SSong Liu 1928b4c625c6SSong Liu while (mb_offset < le32_to_cpu(mb->meta_size)) { 1929b4c625c6SSong Liu int dd; 1930b4c625c6SSong Liu 1931b4c625c6SSong Liu payload = (void *)mb + mb_offset; 1932b4c625c6SSong Liu stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? 
1933b4c625c6SSong Liu raid5_compute_sector( 1934b4c625c6SSong Liu conf, le64_to_cpu(payload->location), 0, &dd, 1935b4c625c6SSong Liu NULL) 1936b4c625c6SSong Liu : le64_to_cpu(payload->location); 1937b4c625c6SSong Liu 1938b4c625c6SSong Liu sh = r5c_recovery_lookup_stripe(cached_stripe_list, 1939b4c625c6SSong Liu stripe_sect); 1940b4c625c6SSong Liu 1941b4c625c6SSong Liu if (!sh) { 19423c66abbaSSong Liu sh = r5c_recovery_alloc_stripe(conf, stripe_sect); 1943b4c625c6SSong Liu /* 1944b4c625c6SSong Liu * cannot get stripe from raid5_get_active_stripe 1945b4c625c6SSong Liu * try replay some stripes 1946b4c625c6SSong Liu */ 1947b4c625c6SSong Liu if (!sh) { 1948b4c625c6SSong Liu r5c_recovery_replay_stripes( 1949b4c625c6SSong Liu cached_stripe_list, ctx); 1950b4c625c6SSong Liu sh = r5c_recovery_alloc_stripe( 19513c66abbaSSong Liu conf, stripe_sect); 1952b4c625c6SSong Liu } 1953b4c625c6SSong Liu if (!sh) { 1954b4c625c6SSong Liu pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", 1955b4c625c6SSong Liu mdname(mddev), 1956b4c625c6SSong Liu conf->min_nr_stripes * 2); 1957b4c625c6SSong Liu raid5_set_cache_size(mddev, 1958b4c625c6SSong Liu conf->min_nr_stripes * 2); 19593c66abbaSSong Liu sh = r5c_recovery_alloc_stripe(conf, 19603c66abbaSSong Liu stripe_sect); 1961b4c625c6SSong Liu } 1962b4c625c6SSong Liu if (!sh) { 1963b4c625c6SSong Liu pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. 
Recovery failed.\n", 1964b4c625c6SSong Liu mdname(mddev)); 1965b4c625c6SSong Liu return -ENOMEM; 1966b4c625c6SSong Liu } 1967b4c625c6SSong Liu list_add_tail(&sh->lru, cached_stripe_list); 1968b4c625c6SSong Liu } 1969b4c625c6SSong Liu 1970b4c625c6SSong Liu if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1971f7b7bee7SZhengyuan Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 1972f7b7bee7SZhengyuan Liu test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { 1973b4c625c6SSong Liu r5l_recovery_replay_one_stripe(conf, sh, ctx); 1974b4c625c6SSong Liu list_move_tail(&sh->lru, cached_stripe_list); 1975b4c625c6SSong Liu } 1976b4c625c6SSong Liu r5l_recovery_load_data(log, sh, ctx, payload, 1977b4c625c6SSong Liu log_offset); 1978b4c625c6SSong Liu } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) 1979b4c625c6SSong Liu r5l_recovery_load_parity(log, sh, ctx, payload, 1980b4c625c6SSong Liu log_offset); 1981b4c625c6SSong Liu else 1982b4c625c6SSong Liu return -EINVAL; 1983b4c625c6SSong Liu 1984b4c625c6SSong Liu log_offset = r5l_ring_add(log, log_offset, 1985b4c625c6SSong Liu le32_to_cpu(payload->size)); 1986b4c625c6SSong Liu 1987b4c625c6SSong Liu mb_offset += sizeof(struct r5l_payload_data_parity) + 1988b4c625c6SSong Liu sizeof(__le32) * 1989b4c625c6SSong Liu (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1990b4c625c6SSong Liu } 1991b4c625c6SSong Liu 1992b4c625c6SSong Liu return 0; 1993b4c625c6SSong Liu } 1994b4c625c6SSong Liu 1995b4c625c6SSong Liu /* 1996b4c625c6SSong Liu * Load the stripe into cache. The stripe will be written out later by 1997b4c625c6SSong Liu * the stripe cache state machine. 
1998b4c625c6SSong Liu */ 1999b4c625c6SSong Liu static void r5c_recovery_load_one_stripe(struct r5l_log *log, 2000b4c625c6SSong Liu struct stripe_head *sh) 2001b4c625c6SSong Liu { 2002b4c625c6SSong Liu struct r5dev *dev; 2003b4c625c6SSong Liu int i; 2004b4c625c6SSong Liu 2005b4c625c6SSong Liu for (i = sh->disks; i--; ) { 2006b4c625c6SSong Liu dev = sh->dev + i; 2007b4c625c6SSong Liu if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { 2008b4c625c6SSong Liu set_bit(R5_InJournal, &dev->flags); 2009b4c625c6SSong Liu set_bit(R5_UPTODATE, &dev->flags); 2010b4c625c6SSong Liu } 2011b4c625c6SSong Liu } 2012b4c625c6SSong Liu } 2013b4c625c6SSong Liu 2014b4c625c6SSong Liu /* 2015b4c625c6SSong Liu * Scan through the log for all to-be-flushed data 2016b4c625c6SSong Liu * 2017b4c625c6SSong Liu * For stripes with data and parity, namely Data-Parity stripe 2018b4c625c6SSong Liu * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. 2019b4c625c6SSong Liu * 2020b4c625c6SSong Liu * For stripes with only data, namely Data-Only stripe 2021b4c625c6SSong Liu * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. 2022b4c625c6SSong Liu * 2023b4c625c6SSong Liu * For a stripe, if we see data after parity, we should discard all previous 2024b4c625c6SSong Liu * data and parity for this stripe, as these data are already flushed to 2025b4c625c6SSong Liu * the array. 2026b4c625c6SSong Liu * 2027b4c625c6SSong Liu * At the end of the scan, we return the new journal_tail, which points to 2028b4c625c6SSong Liu * first data-only stripe on the journal device, or next invalid meta block. 
2029b4c625c6SSong Liu */ 2030b4c625c6SSong Liu static int r5c_recovery_flush_log(struct r5l_log *log, 2031b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 2032b4c625c6SSong Liu { 2033bc8f167fSJackieLiu struct stripe_head *sh; 2034b4c625c6SSong Liu int ret = 0; 2035b4c625c6SSong Liu 2036b4c625c6SSong Liu /* scan through the log */ 2037b4c625c6SSong Liu while (1) { 2038b4c625c6SSong Liu if (r5l_recovery_read_meta_block(log, ctx)) 2039b4c625c6SSong Liu break; 2040b4c625c6SSong Liu 2041b4c625c6SSong Liu ret = r5c_recovery_analyze_meta_block(log, ctx, 2042b4c625c6SSong Liu &ctx->cached_list); 2043b4c625c6SSong Liu /* 2044b4c625c6SSong Liu * -EAGAIN means mismatch in data block, in this case, we still 2045b4c625c6SSong Liu * try scan the next metablock 2046b4c625c6SSong Liu */ 2047b4c625c6SSong Liu if (ret && ret != -EAGAIN) 2048b4c625c6SSong Liu break; /* ret == -EINVAL or -ENOMEM */ 2049b4c625c6SSong Liu ctx->seq++; 2050b4c625c6SSong Liu ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); 2051b4c625c6SSong Liu } 2052b4c625c6SSong Liu 2053b4c625c6SSong Liu if (ret == -ENOMEM) { 2054b4c625c6SSong Liu r5c_recovery_drop_stripes(&ctx->cached_list, ctx); 2055b4c625c6SSong Liu return ret; 2056b4c625c6SSong Liu } 2057b4c625c6SSong Liu 2058b4c625c6SSong Liu /* replay data-parity stripes */ 2059b4c625c6SSong Liu r5c_recovery_replay_stripes(&ctx->cached_list, ctx); 2060b4c625c6SSong Liu 2061b4c625c6SSong Liu /* load data-only stripes to stripe cache */ 2062bc8f167fSJackieLiu list_for_each_entry(sh, &ctx->cached_list, lru) { 2063b4c625c6SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 2064b4c625c6SSong Liu r5c_recovery_load_one_stripe(log, sh); 2065b4c625c6SSong Liu ctx->data_only_stripes++; 2066b4c625c6SSong Liu } 2067b4c625c6SSong Liu 2068b4c625c6SSong Liu return 0; 2069b4c625c6SSong Liu } 2070355810d1SShaohua Li 2071355810d1SShaohua Li /* 2072355810d1SShaohua Li * we did a recovery. Now ctx.pos points to an invalid meta block. 
 * New log will start here. But we can't let the superblock point to the
 * last valid meta block. The log might look like:
 * | meta 1| meta 2| meta 3|
 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
 * superblock points to meta 1 and we write a new valid meta 2n, then if a
 * crash happens again, the new recovery will start from meta 1. Since
 * meta 2n is valid now, recovery will think meta 3 is valid, which is wrong.
 * The solution is to create a new meta in meta2 with its seq == meta
 * 1's seq + 10000 and let the superblock point to meta2. The same recovery
 * will not think meta 3 is a valid meta, because its seq doesn't match.
 */

/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see an invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos
 *   |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase the seq number by 10000 to avoid
 * confusing the next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                              ^
 *   |- log->last_checkpoint        |- ctx->pos+1
 *   |- log->last_cp_seq            |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because data-only
 * stripes are not yet secured in RAID. To save these data-only stripes, we
 * rewrite them from seq+10000.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *   ^                                                ^
 *   |- log->last_checkpoint                          |- ctx->pos+n
 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
 *
 * If failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
 *
 * Once data only stripes are rewritten to journal, we move log_tail
 *
 *   -----------------------------------------------------------------
 *   |           old log          | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *                                ^                   ^
 *                                |- log->last_checkpoint
 *                                |- log->last_cp_seq |- ctx->pos+n
 *                                                    |- ctx->seq+10000+n
 *
 * Then we can safely start the state machine. If failure happens from this
 * point on, the recovery will start from new log->last_checkpoint.
2139b4c625c6SSong Liu */ 2140b4c625c6SSong Liu static int 2141b4c625c6SSong Liu r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 2142b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 2143b4c625c6SSong Liu { 2144a85dd7b8SSong Liu struct stripe_head *sh; 2145b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 2146b4c625c6SSong Liu struct page *page; 21473c66abbaSSong Liu sector_t next_checkpoint = MaxSector; 2148b4c625c6SSong Liu 2149b4c625c6SSong Liu page = alloc_page(GFP_KERNEL); 2150b4c625c6SSong Liu if (!page) { 2151b4c625c6SSong Liu pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", 2152b4c625c6SSong Liu mdname(mddev)); 2153b4c625c6SSong Liu return -ENOMEM; 2154b4c625c6SSong Liu } 2155b4c625c6SSong Liu 21563c66abbaSSong Liu WARN_ON(list_empty(&ctx->cached_list)); 21573c66abbaSSong Liu 2158a85dd7b8SSong Liu list_for_each_entry(sh, &ctx->cached_list, lru) { 2159b4c625c6SSong Liu struct r5l_meta_block *mb; 2160b4c625c6SSong Liu int i; 2161b4c625c6SSong Liu int offset; 2162b4c625c6SSong Liu sector_t write_pos; 2163b4c625c6SSong Liu 2164b4c625c6SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 2165b4c625c6SSong Liu r5l_recovery_create_empty_meta_block(log, page, 2166b4c625c6SSong Liu ctx->pos, ctx->seq); 2167b4c625c6SSong Liu mb = page_address(page); 2168b4c625c6SSong Liu offset = le32_to_cpu(mb->meta_size); 2169fc833c2aSJackieLiu write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 2170b4c625c6SSong Liu 2171b4c625c6SSong Liu for (i = sh->disks; i--; ) { 2172b4c625c6SSong Liu struct r5dev *dev = &sh->dev[i]; 2173b4c625c6SSong Liu struct r5l_payload_data_parity *payload; 2174b4c625c6SSong Liu void *addr; 2175b4c625c6SSong Liu 2176b4c625c6SSong Liu if (test_bit(R5_InJournal, &dev->flags)) { 2177b4c625c6SSong Liu payload = (void *)mb + offset; 2178b4c625c6SSong Liu payload->header.type = cpu_to_le16( 2179b4c625c6SSong Liu R5LOG_PAYLOAD_DATA); 2180b4c625c6SSong Liu payload->size = BLOCK_SECTORS; 2181b4c625c6SSong Liu 
payload->location = cpu_to_le64( 2182b4c625c6SSong Liu raid5_compute_blocknr(sh, i, 0)); 2183b4c625c6SSong Liu addr = kmap_atomic(dev->page); 2184b4c625c6SSong Liu payload->checksum[0] = cpu_to_le32( 2185b4c625c6SSong Liu crc32c_le(log->uuid_checksum, addr, 2186b4c625c6SSong Liu PAGE_SIZE)); 2187b4c625c6SSong Liu kunmap_atomic(addr); 2188b4c625c6SSong Liu sync_page_io(log->rdev, write_pos, PAGE_SIZE, 2189b4c625c6SSong Liu dev->page, REQ_OP_WRITE, 0, false); 2190b4c625c6SSong Liu write_pos = r5l_ring_add(log, write_pos, 2191b4c625c6SSong Liu BLOCK_SECTORS); 2192b4c625c6SSong Liu offset += sizeof(__le32) + 2193b4c625c6SSong Liu sizeof(struct r5l_payload_data_parity); 2194b4c625c6SSong Liu 2195b4c625c6SSong Liu } 2196b4c625c6SSong Liu } 2197b4c625c6SSong Liu mb->meta_size = cpu_to_le32(offset); 21985c88f403SSong Liu mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 21995c88f403SSong Liu mb, PAGE_SIZE)); 2200b4c625c6SSong Liu sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, 220120737738SShaohua Li REQ_OP_WRITE, REQ_FUA, false); 2202b4c625c6SSong Liu sh->log_start = ctx->pos; 22033c66abbaSSong Liu list_add_tail(&sh->r5c, &log->stripe_in_journal_list); 22043c66abbaSSong Liu atomic_inc(&log->stripe_in_journal_count); 2205b4c625c6SSong Liu ctx->pos = write_pos; 2206b4c625c6SSong Liu ctx->seq += 1; 22073c66abbaSSong Liu next_checkpoint = sh->log_start; 2208b4c625c6SSong Liu } 22093c66abbaSSong Liu log->next_checkpoint = next_checkpoint; 2210b4c625c6SSong Liu __free_page(page); 2211b4c625c6SSong Liu return 0; 2212b4c625c6SSong Liu } 2213b4c625c6SSong Liu 2214a85dd7b8SSong Liu static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, 2215a85dd7b8SSong Liu struct r5l_recovery_ctx *ctx) 2216a85dd7b8SSong Liu { 2217a85dd7b8SSong Liu struct mddev *mddev = log->rdev->mddev; 2218a85dd7b8SSong Liu struct r5conf *conf = mddev->private; 2219a85dd7b8SSong Liu struct stripe_head *sh, *next; 2220a85dd7b8SSong Liu 2221a85dd7b8SSong Liu if (ctx->data_only_stripes == 0) 
2222a85dd7b8SSong Liu return; 2223a85dd7b8SSong Liu 2224a85dd7b8SSong Liu log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; 2225a85dd7b8SSong Liu 2226a85dd7b8SSong Liu list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 2227a85dd7b8SSong Liu r5c_make_stripe_write_out(sh); 2228a85dd7b8SSong Liu set_bit(STRIPE_HANDLE, &sh->state); 2229a85dd7b8SSong Liu list_del_init(&sh->lru); 2230a85dd7b8SSong Liu raid5_release_stripe(sh); 2231a85dd7b8SSong Liu } 2232a85dd7b8SSong Liu 2233a85dd7b8SSong Liu md_wakeup_thread(conf->mddev->thread); 2234a85dd7b8SSong Liu /* reuse conf->wait_for_quiescent in recovery */ 2235a85dd7b8SSong Liu wait_event(conf->wait_for_quiescent, 2236a85dd7b8SSong Liu atomic_read(&conf->active_stripes) == 0); 2237a85dd7b8SSong Liu 2238a85dd7b8SSong Liu log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2239a85dd7b8SSong Liu } 2240a85dd7b8SSong Liu 2241f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log) 2242f6bed0efSShaohua Li { 22435aabf7c4SSong Liu struct mddev *mddev = log->rdev->mddev; 2244355810d1SShaohua Li struct r5l_recovery_ctx ctx; 22455aabf7c4SSong Liu int ret; 224643b96748SJackieLiu sector_t pos; 2247355810d1SShaohua Li 2248355810d1SShaohua Li ctx.pos = log->last_checkpoint; 2249355810d1SShaohua Li ctx.seq = log->last_cp_seq; 2250355810d1SShaohua Li ctx.meta_page = alloc_page(GFP_KERNEL); 2251b4c625c6SSong Liu ctx.data_only_stripes = 0; 2252b4c625c6SSong Liu ctx.data_parity_stripes = 0; 2253b4c625c6SSong Liu INIT_LIST_HEAD(&ctx.cached_list); 2254b4c625c6SSong Liu 2255355810d1SShaohua Li if (!ctx.meta_page) 2256355810d1SShaohua Li return -ENOMEM; 2257355810d1SShaohua Li 22585aabf7c4SSong Liu ret = r5c_recovery_flush_log(log, &ctx); 2259355810d1SShaohua Li __free_page(ctx.meta_page); 2260355810d1SShaohua Li 2261355810d1SShaohua Li if (ret) 2262355810d1SShaohua Li return ret; 22635aabf7c4SSong Liu 226443b96748SJackieLiu pos = ctx.pos; 22653c6edc66SSong Liu ctx.seq += 10000; 226643b96748SJackieLiu 
226743b96748SJackieLiu 22685aabf7c4SSong Liu if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 22695aabf7c4SSong Liu pr_debug("md/raid:%s: starting from clean shutdown\n", 22705aabf7c4SSong Liu mdname(mddev)); 2271a85dd7b8SSong Liu else 227299f17890SColin Ian King pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", 22735aabf7c4SSong Liu mdname(mddev), ctx.data_only_stripes, 22745aabf7c4SSong Liu ctx.data_parity_stripes); 22755aabf7c4SSong Liu 2276a85dd7b8SSong Liu if (ctx.data_only_stripes == 0) { 2277a85dd7b8SSong Liu log->next_checkpoint = ctx.pos; 2278a85dd7b8SSong Liu r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2279a85dd7b8SSong Liu ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2280a85dd7b8SSong Liu } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 22815aabf7c4SSong Liu pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 22825aabf7c4SSong Liu mdname(mddev)); 22835aabf7c4SSong Liu return -EIO; 22845aabf7c4SSong Liu } 22855aabf7c4SSong Liu 2286355810d1SShaohua Li log->log_start = ctx.pos; 2287355810d1SShaohua Li log->seq = ctx.seq; 228843b96748SJackieLiu log->last_checkpoint = pos; 228943b96748SJackieLiu r5l_write_super(log, pos); 2290a85dd7b8SSong Liu 2291a85dd7b8SSong Liu r5c_recovery_flush_data_only_stripes(log, &ctx); 2292f6bed0efSShaohua Li return 0; 2293f6bed0efSShaohua Li } 2294f6bed0efSShaohua Li 2295f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp) 2296f6bed0efSShaohua Li { 2297f6bed0efSShaohua Li struct mddev *mddev = log->rdev->mddev; 2298f6bed0efSShaohua Li 2299f6bed0efSShaohua Li log->rdev->journal_tail = cp; 23002953079cSShaohua Li set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2301f6bed0efSShaohua Li } 2302f6bed0efSShaohua Li 23032c7da14bSSong Liu static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) 23042c7da14bSSong Liu { 23052c7da14bSSong Liu struct r5conf *conf = mddev->private; 23062c7da14bSSong Liu int 
ret; 23072c7da14bSSong Liu 23082c7da14bSSong Liu if (!conf->log) 23092c7da14bSSong Liu return 0; 23102c7da14bSSong Liu 23112c7da14bSSong Liu switch (conf->log->r5c_journal_mode) { 23122c7da14bSSong Liu case R5C_JOURNAL_MODE_WRITE_THROUGH: 23132c7da14bSSong Liu ret = snprintf( 23142c7da14bSSong Liu page, PAGE_SIZE, "[%s] %s\n", 23152c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 23162c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 23172c7da14bSSong Liu break; 23182c7da14bSSong Liu case R5C_JOURNAL_MODE_WRITE_BACK: 23192c7da14bSSong Liu ret = snprintf( 23202c7da14bSSong Liu page, PAGE_SIZE, "%s [%s]\n", 23212c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 23222c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 23232c7da14bSSong Liu break; 23242c7da14bSSong Liu default: 23252c7da14bSSong Liu ret = 0; 23262c7da14bSSong Liu } 23272c7da14bSSong Liu return ret; 23282c7da14bSSong Liu } 23292c7da14bSSong Liu 23302c7da14bSSong Liu static ssize_t r5c_journal_mode_store(struct mddev *mddev, 23312c7da14bSSong Liu const char *page, size_t length) 23322c7da14bSSong Liu { 23332c7da14bSSong Liu struct r5conf *conf = mddev->private; 23342c7da14bSSong Liu struct r5l_log *log = conf->log; 23352c7da14bSSong Liu int val = -1, i; 23362c7da14bSSong Liu int len = length; 23372c7da14bSSong Liu 23382c7da14bSSong Liu if (!log) 23392c7da14bSSong Liu return -ENODEV; 23402c7da14bSSong Liu 23412c7da14bSSong Liu if (len && page[len - 1] == '\n') 23422c7da14bSSong Liu len -= 1; 23432c7da14bSSong Liu for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) 23442c7da14bSSong Liu if (strlen(r5c_journal_mode_str[i]) == len && 23452c7da14bSSong Liu strncmp(page, r5c_journal_mode_str[i], len) == 0) { 23462c7da14bSSong Liu val = i; 23472c7da14bSSong Liu break; 23482c7da14bSSong Liu } 23492c7da14bSSong Liu if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || 23502c7da14bSSong Liu val > R5C_JOURNAL_MODE_WRITE_BACK) 23512c7da14bSSong 
Liu return -EINVAL; 23522c7da14bSSong Liu 23532e38a37fSSong Liu if (raid5_calc_degraded(conf) > 0 && 23542e38a37fSSong Liu val == R5C_JOURNAL_MODE_WRITE_BACK) 23552e38a37fSSong Liu return -EINVAL; 23562e38a37fSSong Liu 23572c7da14bSSong Liu mddev_suspend(mddev); 23582c7da14bSSong Liu conf->log->r5c_journal_mode = val; 23592c7da14bSSong Liu mddev_resume(mddev); 23602c7da14bSSong Liu 23612c7da14bSSong Liu pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 23622c7da14bSSong Liu mdname(mddev), val, r5c_journal_mode_str[val]); 23632c7da14bSSong Liu return length; 23642c7da14bSSong Liu } 23652c7da14bSSong Liu 23662c7da14bSSong Liu struct md_sysfs_entry 23672c7da14bSSong Liu r5c_journal_mode = __ATTR(journal_mode, 0644, 23682c7da14bSSong Liu r5c_journal_mode_show, r5c_journal_mode_store); 23692c7da14bSSong Liu 23702ded3703SSong Liu /* 23712ded3703SSong Liu * Try handle write operation in caching phase. This function should only 23722ded3703SSong Liu * be called in write-back mode. 23732ded3703SSong Liu * 23742ded3703SSong Liu * If all outstanding writes can be handled in caching phase, returns 0 23752ded3703SSong Liu * If writes requires write-out phase, call r5c_make_stripe_write_out() 23762ded3703SSong Liu * and returns -EAGAIN 23772ded3703SSong Liu */ 23782ded3703SSong Liu int r5c_try_caching_write(struct r5conf *conf, 23792ded3703SSong Liu struct stripe_head *sh, 23802ded3703SSong Liu struct stripe_head_state *s, 23812ded3703SSong Liu int disks) 23822ded3703SSong Liu { 23832ded3703SSong Liu struct r5l_log *log = conf->log; 23841e6d690bSSong Liu int i; 23851e6d690bSSong Liu struct r5dev *dev; 23861e6d690bSSong Liu int to_cache = 0; 238703b047f4SSong Liu void **pslot; 238803b047f4SSong Liu sector_t tree_index; 238903b047f4SSong Liu int ret; 239003b047f4SSong Liu uintptr_t refcount; 23912ded3703SSong Liu 23922ded3703SSong Liu BUG_ON(!r5c_is_writeback(log)); 23932ded3703SSong Liu 23941e6d690bSSong Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 
23951e6d690bSSong Liu /* 23961e6d690bSSong Liu * There are two different scenarios here: 23971e6d690bSSong Liu * 1. The stripe has some data cached, and it is sent to 23981e6d690bSSong Liu * write-out phase for reclaim 23991e6d690bSSong Liu * 2. The stripe is clean, and this is the first write 24001e6d690bSSong Liu * 24011e6d690bSSong Liu * For 1, return -EAGAIN, so we continue with 24021e6d690bSSong Liu * handle_stripe_dirtying(). 24031e6d690bSSong Liu * 24041e6d690bSSong Liu * For 2, set STRIPE_R5C_CACHING and continue with caching 24051e6d690bSSong Liu * write. 24061e6d690bSSong Liu */ 24071e6d690bSSong Liu 24081e6d690bSSong Liu /* case 1: anything injournal or anything in written */ 24091e6d690bSSong Liu if (s->injournal > 0 || s->written > 0) 24101e6d690bSSong Liu return -EAGAIN; 24111e6d690bSSong Liu /* case 2 */ 24121e6d690bSSong Liu set_bit(STRIPE_R5C_CACHING, &sh->state); 24131e6d690bSSong Liu } 24141e6d690bSSong Liu 24152e38a37fSSong Liu /* 24162e38a37fSSong Liu * When run in degraded mode, array is set to write-through mode. 24172e38a37fSSong Liu * This check helps drain pending write safely in the transition to 24182e38a37fSSong Liu * write-through mode. 
24192e38a37fSSong Liu */ 24202e38a37fSSong Liu if (s->failed) { 24212e38a37fSSong Liu r5c_make_stripe_write_out(sh); 24222e38a37fSSong Liu return -EAGAIN; 24232e38a37fSSong Liu } 24242e38a37fSSong Liu 24251e6d690bSSong Liu for (i = disks; i--; ) { 24261e6d690bSSong Liu dev = &sh->dev[i]; 24271e6d690bSSong Liu /* if non-overwrite, use writing-out phase */ 24281e6d690bSSong Liu if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 24291e6d690bSSong Liu !test_bit(R5_InJournal, &dev->flags)) { 24302ded3703SSong Liu r5c_make_stripe_write_out(sh); 24312ded3703SSong Liu return -EAGAIN; 24322ded3703SSong Liu } 24331e6d690bSSong Liu } 24341e6d690bSSong Liu 243503b047f4SSong Liu /* if the stripe is not counted in big_stripe_tree, add it now */ 243603b047f4SSong Liu if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 243703b047f4SSong Liu !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 243803b047f4SSong Liu tree_index = r5c_tree_index(conf, sh->sector); 243903b047f4SSong Liu spin_lock(&log->tree_lock); 244003b047f4SSong Liu pslot = radix_tree_lookup_slot(&log->big_stripe_tree, 244103b047f4SSong Liu tree_index); 244203b047f4SSong Liu if (pslot) { 244303b047f4SSong Liu refcount = (uintptr_t)radix_tree_deref_slot_protected( 244403b047f4SSong Liu pslot, &log->tree_lock) >> 244503b047f4SSong Liu R5C_RADIX_COUNT_SHIFT; 244603b047f4SSong Liu radix_tree_replace_slot( 244703b047f4SSong Liu &log->big_stripe_tree, pslot, 244803b047f4SSong Liu (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); 244903b047f4SSong Liu } else { 245003b047f4SSong Liu /* 245103b047f4SSong Liu * this radix_tree_insert can fail safely, so no 245203b047f4SSong Liu * need to call radix_tree_preload() 245303b047f4SSong Liu */ 245403b047f4SSong Liu ret = radix_tree_insert( 245503b047f4SSong Liu &log->big_stripe_tree, tree_index, 245603b047f4SSong Liu (void *)(1 << R5C_RADIX_COUNT_SHIFT)); 245703b047f4SSong Liu if (ret) { 245803b047f4SSong Liu spin_unlock(&log->tree_lock); 245903b047f4SSong Liu 
r5c_make_stripe_write_out(sh); 246003b047f4SSong Liu return -EAGAIN; 246103b047f4SSong Liu } 246203b047f4SSong Liu } 246303b047f4SSong Liu spin_unlock(&log->tree_lock); 246403b047f4SSong Liu 246503b047f4SSong Liu /* 246603b047f4SSong Liu * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is 246703b047f4SSong Liu * counted in the radix tree 246803b047f4SSong Liu */ 246903b047f4SSong Liu set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); 247003b047f4SSong Liu atomic_inc(&conf->r5c_cached_partial_stripes); 247103b047f4SSong Liu } 247203b047f4SSong Liu 24731e6d690bSSong Liu for (i = disks; i--; ) { 24741e6d690bSSong Liu dev = &sh->dev[i]; 24751e6d690bSSong Liu if (dev->towrite) { 24761e6d690bSSong Liu set_bit(R5_Wantwrite, &dev->flags); 24771e6d690bSSong Liu set_bit(R5_Wantdrain, &dev->flags); 24781e6d690bSSong Liu set_bit(R5_LOCKED, &dev->flags); 24791e6d690bSSong Liu to_cache++; 24801e6d690bSSong Liu } 24811e6d690bSSong Liu } 24821e6d690bSSong Liu 24831e6d690bSSong Liu if (to_cache) { 24841e6d690bSSong Liu set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 24851e6d690bSSong Liu /* 24861e6d690bSSong Liu * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 24871e6d690bSSong Liu * in ops_run_io(). 
STRIPE_LOG_TRAPPED will be cleared in 24881e6d690bSSong Liu * r5c_handle_data_cached() 24891e6d690bSSong Liu */ 24901e6d690bSSong Liu set_bit(STRIPE_LOG_TRAPPED, &sh->state); 24911e6d690bSSong Liu } 24921e6d690bSSong Liu 24931e6d690bSSong Liu return 0; 24941e6d690bSSong Liu } 24951e6d690bSSong Liu 24961e6d690bSSong Liu /* 24971e6d690bSSong Liu * free extra pages (orig_page) we allocated for prexor 24981e6d690bSSong Liu */ 24991e6d690bSSong Liu void r5c_release_extra_page(struct stripe_head *sh) 25001e6d690bSSong Liu { 2501d7bd398eSSong Liu struct r5conf *conf = sh->raid_conf; 25021e6d690bSSong Liu int i; 2503d7bd398eSSong Liu bool using_disk_info_extra_page; 2504d7bd398eSSong Liu 2505d7bd398eSSong Liu using_disk_info_extra_page = 2506d7bd398eSSong Liu sh->dev[0].orig_page == conf->disks[0].extra_page; 25071e6d690bSSong Liu 25081e6d690bSSong Liu for (i = sh->disks; i--; ) 25091e6d690bSSong Liu if (sh->dev[i].page != sh->dev[i].orig_page) { 25101e6d690bSSong Liu struct page *p = sh->dev[i].orig_page; 25111e6d690bSSong Liu 25121e6d690bSSong Liu sh->dev[i].orig_page = sh->dev[i].page; 251386aa1397SSong Liu clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 251486aa1397SSong Liu 2515d7bd398eSSong Liu if (!using_disk_info_extra_page) 25161e6d690bSSong Liu put_page(p); 25171e6d690bSSong Liu } 2518d7bd398eSSong Liu 2519d7bd398eSSong Liu if (using_disk_info_extra_page) { 2520d7bd398eSSong Liu clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); 2521d7bd398eSSong Liu md_wakeup_thread(conf->mddev->thread); 2522d7bd398eSSong Liu } 2523d7bd398eSSong Liu } 2524d7bd398eSSong Liu 2525d7bd398eSSong Liu void r5c_use_extra_page(struct stripe_head *sh) 2526d7bd398eSSong Liu { 2527d7bd398eSSong Liu struct r5conf *conf = sh->raid_conf; 2528d7bd398eSSong Liu int i; 2529d7bd398eSSong Liu struct r5dev *dev; 2530d7bd398eSSong Liu 2531d7bd398eSSong Liu for (i = sh->disks; i--; ) { 2532d7bd398eSSong Liu dev = &sh->dev[i]; 2533d7bd398eSSong Liu if (dev->orig_page != dev->page) 
2534d7bd398eSSong Liu put_page(dev->orig_page); 2535d7bd398eSSong Liu dev->orig_page = conf->disks[i].extra_page; 2536d7bd398eSSong Liu } 25371e6d690bSSong Liu } 25382ded3703SSong Liu 25392ded3703SSong Liu /* 25402ded3703SSong Liu * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 25412ded3703SSong Liu * stripe is committed to RAID disks. 25422ded3703SSong Liu */ 25432ded3703SSong Liu void r5c_finish_stripe_write_out(struct r5conf *conf, 25442ded3703SSong Liu struct stripe_head *sh, 25452ded3703SSong Liu struct stripe_head_state *s) 25462ded3703SSong Liu { 254703b047f4SSong Liu struct r5l_log *log = conf->log; 25481e6d690bSSong Liu int i; 25491e6d690bSSong Liu int do_wakeup = 0; 255003b047f4SSong Liu sector_t tree_index; 255103b047f4SSong Liu void **pslot; 255203b047f4SSong Liu uintptr_t refcount; 25531e6d690bSSong Liu 255403b047f4SSong Liu if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 25552ded3703SSong Liu return; 25562ded3703SSong Liu 25572ded3703SSong Liu WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 25582ded3703SSong Liu clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 25592ded3703SSong Liu 256003b047f4SSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 25612ded3703SSong Liu return; 25621e6d690bSSong Liu 25631e6d690bSSong Liu for (i = sh->disks; i--; ) { 25641e6d690bSSong Liu clear_bit(R5_InJournal, &sh->dev[i].flags); 25651e6d690bSSong Liu if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 25661e6d690bSSong Liu do_wakeup = 1; 25671e6d690bSSong Liu } 25681e6d690bSSong Liu 25691e6d690bSSong Liu /* 25701e6d690bSSong Liu * analyse_stripe() runs before r5c_finish_stripe_write_out(), 25711e6d690bSSong Liu * We updated R5_InJournal, so we also update s->injournal. 
25721e6d690bSSong Liu */ 25731e6d690bSSong Liu s->injournal = 0; 25741e6d690bSSong Liu 25751e6d690bSSong Liu if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 25761e6d690bSSong Liu if (atomic_dec_and_test(&conf->pending_full_writes)) 25771e6d690bSSong Liu md_wakeup_thread(conf->mddev->thread); 25781e6d690bSSong Liu 25791e6d690bSSong Liu if (do_wakeup) 25801e6d690bSSong Liu wake_up(&conf->wait_for_overlap); 2581a39f7afdSSong Liu 258203b047f4SSong Liu spin_lock_irq(&log->stripe_in_journal_lock); 2583a39f7afdSSong Liu list_del_init(&sh->r5c); 258403b047f4SSong Liu spin_unlock_irq(&log->stripe_in_journal_lock); 2585a39f7afdSSong Liu sh->log_start = MaxSector; 258603b047f4SSong Liu 258703b047f4SSong Liu atomic_dec(&log->stripe_in_journal_count); 258803b047f4SSong Liu r5c_update_log_state(log); 258903b047f4SSong Liu 259003b047f4SSong Liu /* stop counting this stripe in big_stripe_tree */ 259103b047f4SSong Liu if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || 259203b047f4SSong Liu test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 259303b047f4SSong Liu tree_index = r5c_tree_index(conf, sh->sector); 259403b047f4SSong Liu spin_lock(&log->tree_lock); 259503b047f4SSong Liu pslot = radix_tree_lookup_slot(&log->big_stripe_tree, 259603b047f4SSong Liu tree_index); 259703b047f4SSong Liu BUG_ON(pslot == NULL); 259803b047f4SSong Liu refcount = (uintptr_t)radix_tree_deref_slot_protected( 259903b047f4SSong Liu pslot, &log->tree_lock) >> 260003b047f4SSong Liu R5C_RADIX_COUNT_SHIFT; 260103b047f4SSong Liu if (refcount == 1) 260203b047f4SSong Liu radix_tree_delete(&log->big_stripe_tree, tree_index); 260303b047f4SSong Liu else 260403b047f4SSong Liu radix_tree_replace_slot( 260503b047f4SSong Liu &log->big_stripe_tree, pslot, 260603b047f4SSong Liu (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); 260703b047f4SSong Liu spin_unlock(&log->tree_lock); 260803b047f4SSong Liu } 260903b047f4SSong Liu 261003b047f4SSong Liu if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 
261103b047f4SSong Liu BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 2612*e33fbb9cSShaohua Li atomic_dec(&conf->r5c_flushing_partial_stripes); 261303b047f4SSong Liu atomic_dec(&conf->r5c_cached_partial_stripes); 261403b047f4SSong Liu } 261503b047f4SSong Liu 261603b047f4SSong Liu if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 261703b047f4SSong Liu BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 2618*e33fbb9cSShaohua Li atomic_dec(&conf->r5c_flushing_full_stripes); 261903b047f4SSong Liu atomic_dec(&conf->r5c_cached_full_stripes); 262003b047f4SSong Liu } 26211e6d690bSSong Liu } 26221e6d690bSSong Liu 26231e6d690bSSong Liu int 26241e6d690bSSong Liu r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, 26251e6d690bSSong Liu struct stripe_head_state *s) 26261e6d690bSSong Liu { 2627a39f7afdSSong Liu struct r5conf *conf = sh->raid_conf; 26281e6d690bSSong Liu int pages = 0; 26291e6d690bSSong Liu int reserve; 26301e6d690bSSong Liu int i; 26311e6d690bSSong Liu int ret = 0; 26321e6d690bSSong Liu 26331e6d690bSSong Liu BUG_ON(!log); 26341e6d690bSSong Liu 26351e6d690bSSong Liu for (i = 0; i < sh->disks; i++) { 26361e6d690bSSong Liu void *addr; 26371e6d690bSSong Liu 26381e6d690bSSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 26391e6d690bSSong Liu continue; 26401e6d690bSSong Liu addr = kmap_atomic(sh->dev[i].page); 26411e6d690bSSong Liu sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 26421e6d690bSSong Liu addr, PAGE_SIZE); 26431e6d690bSSong Liu kunmap_atomic(addr); 26441e6d690bSSong Liu pages++; 26451e6d690bSSong Liu } 26461e6d690bSSong Liu WARN_ON(pages == 0); 26471e6d690bSSong Liu 26481e6d690bSSong Liu /* 26491e6d690bSSong Liu * The stripe must enter state machine again to call endio, so 26501e6d690bSSong Liu * don't delay. 
26511e6d690bSSong Liu */ 26521e6d690bSSong Liu clear_bit(STRIPE_DELAYED, &sh->state); 26531e6d690bSSong Liu atomic_inc(&sh->count); 26541e6d690bSSong Liu 26551e6d690bSSong Liu mutex_lock(&log->io_mutex); 26561e6d690bSSong Liu /* meta + data */ 26571e6d690bSSong Liu reserve = (1 + pages) << (PAGE_SHIFT - 9); 26581e6d690bSSong Liu 2659a39f7afdSSong Liu if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2660a39f7afdSSong Liu sh->log_start == MaxSector) 2661a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 2662a39f7afdSSong Liu else if (!r5l_has_free_space(log, reserve)) { 2663a39f7afdSSong Liu if (sh->log_start == log->last_checkpoint) 2664a39f7afdSSong Liu BUG(); 2665a39f7afdSSong Liu else 2666a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 26671e6d690bSSong Liu } else { 26681e6d690bSSong Liu ret = r5l_log_stripe(log, sh, pages, 0); 26691e6d690bSSong Liu if (ret) { 26701e6d690bSSong Liu spin_lock_irq(&log->io_list_lock); 26711e6d690bSSong Liu list_add_tail(&sh->log_list, &log->no_mem_stripes); 26721e6d690bSSong Liu spin_unlock_irq(&log->io_list_lock); 26731e6d690bSSong Liu } 26741e6d690bSSong Liu } 26751e6d690bSSong Liu 26761e6d690bSSong Liu mutex_unlock(&log->io_mutex); 26771e6d690bSSong Liu return 0; 2678f6bed0efSShaohua Li } 2679f6bed0efSShaohua Li 268003b047f4SSong Liu /* check whether this big stripe is in write back cache. 
*/ 268103b047f4SSong Liu bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) 268203b047f4SSong Liu { 268303b047f4SSong Liu struct r5l_log *log = conf->log; 268403b047f4SSong Liu sector_t tree_index; 268503b047f4SSong Liu void *slot; 268603b047f4SSong Liu 268703b047f4SSong Liu if (!log) 268803b047f4SSong Liu return false; 268903b047f4SSong Liu 269003b047f4SSong Liu WARN_ON_ONCE(!rcu_read_lock_held()); 269103b047f4SSong Liu tree_index = r5c_tree_index(conf, sect); 269203b047f4SSong Liu slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); 269303b047f4SSong Liu return slot != NULL; 269403b047f4SSong Liu } 269503b047f4SSong Liu 2696f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log) 2697f6bed0efSShaohua Li { 2698f6bed0efSShaohua Li struct md_rdev *rdev = log->rdev; 2699f6bed0efSShaohua Li struct page *page; 2700f6bed0efSShaohua Li struct r5l_meta_block *mb; 2701f6bed0efSShaohua Li sector_t cp = log->rdev->journal_tail; 2702f6bed0efSShaohua Li u32 stored_crc, expected_crc; 2703f6bed0efSShaohua Li bool create_super = false; 2704d30dfeb9SJackieLiu int ret = 0; 2705f6bed0efSShaohua Li 2706f6bed0efSShaohua Li /* Make sure it's valid */ 2707f6bed0efSShaohua Li if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 2708f6bed0efSShaohua Li cp = 0; 2709f6bed0efSShaohua Li page = alloc_page(GFP_KERNEL); 2710f6bed0efSShaohua Li if (!page) 2711f6bed0efSShaohua Li return -ENOMEM; 2712f6bed0efSShaohua Li 2713796a5cf0SMike Christie if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 2714f6bed0efSShaohua Li ret = -EIO; 2715f6bed0efSShaohua Li goto ioerr; 2716f6bed0efSShaohua Li } 2717f6bed0efSShaohua Li mb = page_address(page); 2718f6bed0efSShaohua Li 2719f6bed0efSShaohua Li if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 2720f6bed0efSShaohua Li mb->version != R5LOG_VERSION) { 2721f6bed0efSShaohua Li create_super = true; 2722f6bed0efSShaohua Li goto create; 2723f6bed0efSShaohua Li } 2724f6bed0efSShaohua Li stored_crc = 
le32_to_cpu(mb->checksum); 2725f6bed0efSShaohua Li mb->checksum = 0; 27265cb2fbd6SShaohua Li expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2727f6bed0efSShaohua Li if (stored_crc != expected_crc) { 2728f6bed0efSShaohua Li create_super = true; 2729f6bed0efSShaohua Li goto create; 2730f6bed0efSShaohua Li } 2731f6bed0efSShaohua Li if (le64_to_cpu(mb->position) != cp) { 2732f6bed0efSShaohua Li create_super = true; 2733f6bed0efSShaohua Li goto create; 2734f6bed0efSShaohua Li } 2735f6bed0efSShaohua Li create: 2736f6bed0efSShaohua Li if (create_super) { 2737f6bed0efSShaohua Li log->last_cp_seq = prandom_u32(); 2738f6bed0efSShaohua Li cp = 0; 273956056c2eSZhengyuan Liu r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 2740f6bed0efSShaohua Li /* 2741f6bed0efSShaohua Li * Make sure super points to correct address. Log might have 2742f6bed0efSShaohua Li * data very soon. If super hasn't correct log tail address, 2743f6bed0efSShaohua Li * recovery can't find the log 2744f6bed0efSShaohua Li */ 2745f6bed0efSShaohua Li r5l_write_super(log, cp); 2746f6bed0efSShaohua Li } else 2747f6bed0efSShaohua Li log->last_cp_seq = le64_to_cpu(mb->seq); 2748f6bed0efSShaohua Li 2749f6bed0efSShaohua Li log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 27500576b1c6SShaohua Li log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 27510576b1c6SShaohua Li if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 27520576b1c6SShaohua Li log->max_free_space = RECLAIM_MAX_FREE_SPACE; 2753f6bed0efSShaohua Li log->last_checkpoint = cp; 2754f6bed0efSShaohua Li 2755f6bed0efSShaohua Li __free_page(page); 2756f6bed0efSShaohua Li 2757d30dfeb9SJackieLiu if (create_super) { 2758d30dfeb9SJackieLiu log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); 2759d30dfeb9SJackieLiu log->seq = log->last_cp_seq + 1; 2760d30dfeb9SJackieLiu log->next_checkpoint = cp; 2761d30dfeb9SJackieLiu } else 27623d7e7e1dSZhengyuan Liu ret = r5l_recovery_log(log); 2763d30dfeb9SJackieLiu 
27643d7e7e1dSZhengyuan Liu r5c_update_log_state(log); 27653d7e7e1dSZhengyuan Liu return ret; 2766f6bed0efSShaohua Li ioerr: 2767f6bed0efSShaohua Li __free_page(page); 2768f6bed0efSShaohua Li return ret; 2769f6bed0efSShaohua Li } 2770f6bed0efSShaohua Li 27712e38a37fSSong Liu void r5c_update_on_rdev_error(struct mddev *mddev) 27722e38a37fSSong Liu { 27732e38a37fSSong Liu struct r5conf *conf = mddev->private; 27742e38a37fSSong Liu struct r5l_log *log = conf->log; 27752e38a37fSSong Liu 27762e38a37fSSong Liu if (!log) 27772e38a37fSSong Liu return; 27782e38a37fSSong Liu 27792e38a37fSSong Liu if (raid5_calc_degraded(conf) > 0 && 27802e38a37fSSong Liu conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) 27812e38a37fSSong Liu schedule_work(&log->disable_writeback_work); 27822e38a37fSSong Liu } 27832e38a37fSSong Liu 2784f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 2785f6bed0efSShaohua Li { 2786c888a8f9SJens Axboe struct request_queue *q = bdev_get_queue(rdev->bdev); 2787f6bed0efSShaohua Li struct r5l_log *log; 2788f6bed0efSShaohua Li 2789f6bed0efSShaohua Li if (PAGE_SIZE != 4096) 2790f6bed0efSShaohua Li return -EINVAL; 2791c757ec95SSong Liu 2792c757ec95SSong Liu /* 2793c757ec95SSong Liu * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 2794c757ec95SSong Liu * raid_disks r5l_payload_data_parity. 
2795c757ec95SSong Liu * 2796c757ec95SSong Liu * Write journal and cache does not work for very big array 2797c757ec95SSong Liu * (raid_disks > 203) 2798c757ec95SSong Liu */ 2799c757ec95SSong Liu if (sizeof(struct r5l_meta_block) + 2800c757ec95SSong Liu ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * 2801c757ec95SSong Liu conf->raid_disks) > PAGE_SIZE) { 2802c757ec95SSong Liu pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 2803c757ec95SSong Liu mdname(conf->mddev), conf->raid_disks); 2804c757ec95SSong Liu return -EINVAL; 2805c757ec95SSong Liu } 2806c757ec95SSong Liu 2807f6bed0efSShaohua Li log = kzalloc(sizeof(*log), GFP_KERNEL); 2808f6bed0efSShaohua Li if (!log) 2809f6bed0efSShaohua Li return -ENOMEM; 2810f6bed0efSShaohua Li log->rdev = rdev; 2811f6bed0efSShaohua Li 2812c888a8f9SJens Axboe log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 281356fef7c6SChristoph Hellwig 28145cb2fbd6SShaohua Li log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 2815f6bed0efSShaohua Li sizeof(rdev->mddev->uuid)); 2816f6bed0efSShaohua Li 2817f6bed0efSShaohua Li mutex_init(&log->io_mutex); 2818f6bed0efSShaohua Li 2819f6bed0efSShaohua Li spin_lock_init(&log->io_list_lock); 2820f6bed0efSShaohua Li INIT_LIST_HEAD(&log->running_ios); 28210576b1c6SShaohua Li INIT_LIST_HEAD(&log->io_end_ios); 2822a8c34f91SShaohua Li INIT_LIST_HEAD(&log->flushing_ios); 282304732f74SChristoph Hellwig INIT_LIST_HEAD(&log->finished_ios); 28243a83f467SMing Lei bio_init(&log->flush_bio, NULL, 0); 2825f6bed0efSShaohua Li 2826f6bed0efSShaohua Li log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 2827f6bed0efSShaohua Li if (!log->io_kc) 2828f6bed0efSShaohua Li goto io_kc; 2829f6bed0efSShaohua Li 28305036c390SChristoph Hellwig log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); 28315036c390SChristoph Hellwig if (!log->io_pool) 28325036c390SChristoph Hellwig goto io_pool; 28335036c390SChristoph Hellwig 2834c38d29b3SChristoph Hellwig log->bs = 
bioset_create(R5L_POOL_SIZE, 0); 2835c38d29b3SChristoph Hellwig if (!log->bs) 2836c38d29b3SChristoph Hellwig goto io_bs; 2837c38d29b3SChristoph Hellwig 2838e8deb638SChristoph Hellwig log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 2839e8deb638SChristoph Hellwig if (!log->meta_pool) 2840e8deb638SChristoph Hellwig goto out_mempool; 2841e8deb638SChristoph Hellwig 284203b047f4SSong Liu spin_lock_init(&log->tree_lock); 284303b047f4SSong Liu INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); 284403b047f4SSong Liu 28450576b1c6SShaohua Li log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 28460576b1c6SShaohua Li log->rdev->mddev, "reclaim"); 28470576b1c6SShaohua Li if (!log->reclaim_thread) 28480576b1c6SShaohua Li goto reclaim_thread; 2849a39f7afdSSong Liu log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 2850a39f7afdSSong Liu 28510fd22b45SShaohua Li init_waitqueue_head(&log->iounit_wait); 28520576b1c6SShaohua Li 28535036c390SChristoph Hellwig INIT_LIST_HEAD(&log->no_mem_stripes); 28545036c390SChristoph Hellwig 2855f6bed0efSShaohua Li INIT_LIST_HEAD(&log->no_space_stripes); 2856f6bed0efSShaohua Li spin_lock_init(&log->no_space_stripes_lock); 2857f6bed0efSShaohua Li 28583bddb7f8SSong Liu INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 28592e38a37fSSong Liu INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); 28603bddb7f8SSong Liu 28612ded3703SSong Liu log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2862a39f7afdSSong Liu INIT_LIST_HEAD(&log->stripe_in_journal_list); 2863a39f7afdSSong Liu spin_lock_init(&log->stripe_in_journal_lock); 2864a39f7afdSSong Liu atomic_set(&log->stripe_in_journal_count, 0); 28652ded3703SSong Liu 2866d2250f10SSong Liu rcu_assign_pointer(conf->log, log); 2867d2250f10SSong Liu 2868f6bed0efSShaohua Li if (r5l_load_log(log)) 2869f6bed0efSShaohua Li goto error; 2870f6bed0efSShaohua Li 2871a62ab49eSShaohua Li set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 2872f6bed0efSShaohua 
Li return 0; 2873e8deb638SChristoph Hellwig 2874f6bed0efSShaohua Li error: 2875d2250f10SSong Liu rcu_assign_pointer(conf->log, NULL); 28760576b1c6SShaohua Li md_unregister_thread(&log->reclaim_thread); 28770576b1c6SShaohua Li reclaim_thread: 2878e8deb638SChristoph Hellwig mempool_destroy(log->meta_pool); 2879e8deb638SChristoph Hellwig out_mempool: 2880c38d29b3SChristoph Hellwig bioset_free(log->bs); 2881c38d29b3SChristoph Hellwig io_bs: 28825036c390SChristoph Hellwig mempool_destroy(log->io_pool); 28835036c390SChristoph Hellwig io_pool: 2884f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 2885f6bed0efSShaohua Li io_kc: 2886f6bed0efSShaohua Li kfree(log); 2887f6bed0efSShaohua Li return -EINVAL; 2888f6bed0efSShaohua Li } 2889f6bed0efSShaohua Li 2890f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log) 2891f6bed0efSShaohua Li { 28922e38a37fSSong Liu flush_work(&log->disable_writeback_work); 28930576b1c6SShaohua Li md_unregister_thread(&log->reclaim_thread); 2894e8deb638SChristoph Hellwig mempool_destroy(log->meta_pool); 2895c38d29b3SChristoph Hellwig bioset_free(log->bs); 28965036c390SChristoph Hellwig mempool_destroy(log->io_pool); 2897f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 2898f6bed0efSShaohua Li kfree(log); 2899f6bed0efSShaohua Li } 2900