xref: /linux/drivers/md/raid5-cache.c (revision a39f7afde358ca89e9fc09a5525d3f8631a98a3a)
1f6bed0efSShaohua Li /*
2f6bed0efSShaohua Li  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3f6bed0efSShaohua Li  *
4f6bed0efSShaohua Li  * This program is free software; you can redistribute it and/or modify it
5f6bed0efSShaohua Li  * under the terms and conditions of the GNU General Public License,
6f6bed0efSShaohua Li  * version 2, as published by the Free Software Foundation.
7f6bed0efSShaohua Li  *
8f6bed0efSShaohua Li  * This program is distributed in the hope it will be useful, but WITHOUT
9f6bed0efSShaohua Li  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10f6bed0efSShaohua Li  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11f6bed0efSShaohua Li  * more details.
12f6bed0efSShaohua Li  *
13f6bed0efSShaohua Li  */
14f6bed0efSShaohua Li #include <linux/kernel.h>
15f6bed0efSShaohua Li #include <linux/wait.h>
16f6bed0efSShaohua Li #include <linux/blkdev.h>
17f6bed0efSShaohua Li #include <linux/slab.h>
18f6bed0efSShaohua Li #include <linux/raid/md_p.h>
195cb2fbd6SShaohua Li #include <linux/crc32c.h>
20f6bed0efSShaohua Li #include <linux/random.h>
21f6bed0efSShaohua Li #include "md.h"
22f6bed0efSShaohua Li #include "raid5.h"
231e6d690bSSong Liu #include "bitmap.h"
24f6bed0efSShaohua Li 
25f6bed0efSShaohua Li /*
26f6bed0efSShaohua Li  * metadata/data are stored on disk in 4k units (blocks) regardless of the
27f6bed0efSShaohua Li  * underlying hardware sector size. This only works with PAGE_SIZE == 4096
28f6bed0efSShaohua Li  */
29f6bed0efSShaohua Li #define BLOCK_SECTORS (8)
30f6bed0efSShaohua Li 
310576b1c6SShaohua Li /*
32*a39f7afdSSong Liu  * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
33*a39f7afdSSong Liu  *
34*a39f7afdSSong Liu  * In write-through mode, reclaim runs every log->max_free_space.
35*a39f7afdSSong Liu  * This keeps recovery from having to scan too much of the log.
360576b1c6SShaohua Li  */
370576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
380576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
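/*
 * Worked example (illustrative; max_free_space itself is set up in the log
 * init code, not shown here): with 512-byte sectors, RECLAIM_MAX_FREE_SPACE
 * is 10 * 1024 * 1024 * 2 sectors = 10GiB, and RECLAIM_MAX_FREE_SPACE_SHIFT
 * divides the device size by 4. Assuming
 * max_free_space = min(device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT,
 * RECLAIM_MAX_FREE_SPACE), a 64GiB journal device would end up with
 * max_free_space = min(16GiB, 10GiB) = 10GiB.
 */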
390576b1c6SShaohua Li 
40*a39f7afdSSong Liu /* wake up reclaim thread periodically */
41*a39f7afdSSong Liu #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
42*a39f7afdSSong Liu /* start flush with these full stripes */
43*a39f7afdSSong Liu #define R5C_FULL_STRIPE_FLUSH_BATCH 256
44*a39f7afdSSong Liu /* reclaim stripes in groups */
45*a39f7afdSSong Liu #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
46*a39f7afdSSong Liu 
47c38d29b3SChristoph Hellwig /*
48c38d29b3SChristoph Hellwig  * We only need 2 bios per I/O unit to make progress, but ensure we
49c38d29b3SChristoph Hellwig  * have a few more available to not get too tight.
50c38d29b3SChristoph Hellwig  */
51c38d29b3SChristoph Hellwig #define R5L_POOL_SIZE	4
52c38d29b3SChristoph Hellwig 
532ded3703SSong Liu /*
542ded3703SSong Liu  * r5c journal modes of the array: write-back or write-through.
552ded3703SSong Liu  * write-through mode has identical behavior to the existing log-only
562ded3703SSong Liu  * implementation.
572ded3703SSong Liu  */
582ded3703SSong Liu enum r5c_journal_mode {
592ded3703SSong Liu 	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
602ded3703SSong Liu 	R5C_JOURNAL_MODE_WRITE_BACK = 1,
612ded3703SSong Liu };
622ded3703SSong Liu 
632ded3703SSong Liu /*
642ded3703SSong Liu  * raid5 cache state machine
652ded3703SSong Liu  *
662ded3703SSong Liu  * With the RAID cache, each stripe works in two phases:
672ded3703SSong Liu  *	- caching phase
682ded3703SSong Liu  *	- writing-out phase
692ded3703SSong Liu  *
702ded3703SSong Liu  * These two phases are controlled by bit STRIPE_R5C_CACHING:
712ded3703SSong Liu  *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
722ded3703SSong Liu  *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
732ded3703SSong Liu  *
742ded3703SSong Liu  * When there is no journal, or the journal is in write-through mode,
752ded3703SSong Liu  * the stripe is always in writing-out phase.
762ded3703SSong Liu  *
772ded3703SSong Liu  * For write-back journal, the stripe is sent to caching phase on write
782ded3703SSong Liu  * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
792ded3703SSong Liu  * the writing-out phase by clearing STRIPE_R5C_CACHING.
802ded3703SSong Liu  *
812ded3703SSong Liu  * Stripes in caching phase do not write the raid disks. Instead, all
822ded3703SSong Liu  * writes are committed from the log device. Therefore, a stripe in
832ded3703SSong Liu  * caching phase handles writes as:
842ded3703SSong Liu  *	- write to log device
852ded3703SSong Liu  *	- return IO
862ded3703SSong Liu  *
872ded3703SSong Liu  * Stripes in writing-out phase handle writes as:
882ded3703SSong Liu  *	- calculate parity
892ded3703SSong Liu  *	- write pending data and parity to journal
902ded3703SSong Liu  *	- write data and parity to raid disks
912ded3703SSong Liu  *	- return IO for pending writes
922ded3703SSong Liu  */
932ded3703SSong Liu 
94f6bed0efSShaohua Li struct r5l_log {
95f6bed0efSShaohua Li 	struct md_rdev *rdev;
96f6bed0efSShaohua Li 
97f6bed0efSShaohua Li 	u32 uuid_checksum;
98f6bed0efSShaohua Li 
99f6bed0efSShaohua Li 	sector_t device_size;		/* log device size, rounded to
100f6bed0efSShaohua Li 					 * BLOCK_SECTORS */
1010576b1c6SShaohua Li 	sector_t max_free_space;	/* reclaim runs if free space is at
1020576b1c6SShaohua Li 					 * this size */
103f6bed0efSShaohua Li 
104f6bed0efSShaohua Li 	sector_t last_checkpoint;	/* log tail. where recovery scan
105f6bed0efSShaohua Li 					 * starts from */
106f6bed0efSShaohua Li 	u64 last_cp_seq;		/* log tail sequence */
107f6bed0efSShaohua Li 
108f6bed0efSShaohua Li 	sector_t log_start;		/* log head. where new data appends */
109f6bed0efSShaohua Li 	u64 seq;			/* log head sequence */
110f6bed0efSShaohua Li 
11117036461SChristoph Hellwig 	sector_t next_checkpoint;
11217036461SChristoph Hellwig 	u64 next_cp_seq;
11317036461SChristoph Hellwig 
114f6bed0efSShaohua Li 	struct mutex io_mutex;
115f6bed0efSShaohua Li 	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
116f6bed0efSShaohua Li 
117f6bed0efSShaohua Li 	spinlock_t io_list_lock;
118f6bed0efSShaohua Li 	struct list_head running_ios;	/* io_units which are still running,
119f6bed0efSShaohua Li 					 * and have not yet been completely
120f6bed0efSShaohua Li 					 * written to the log */
121f6bed0efSShaohua Li 	struct list_head io_end_ios;	/* io_units which have been completely
122f6bed0efSShaohua Li 					 * written to the log but not yet written
123f6bed0efSShaohua Li 					 * to the RAID */
124a8c34f91SShaohua Li 	struct list_head flushing_ios;	/* io_units which are waiting for log
125a8c34f91SShaohua Li 					 * cache flush */
12604732f74SChristoph Hellwig 	struct list_head finished_ios;	/* io_units which settle down in log disk */
127a8c34f91SShaohua Li 	struct bio flush_bio;
128f6bed0efSShaohua Li 
1295036c390SChristoph Hellwig 	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
1305036c390SChristoph Hellwig 
131f6bed0efSShaohua Li 	struct kmem_cache *io_kc;
1325036c390SChristoph Hellwig 	mempool_t *io_pool;
133c38d29b3SChristoph Hellwig 	struct bio_set *bs;
134e8deb638SChristoph Hellwig 	mempool_t *meta_pool;
135f6bed0efSShaohua Li 
1360576b1c6SShaohua Li 	struct md_thread *reclaim_thread;
1370576b1c6SShaohua Li 	unsigned long reclaim_target;	/* amount of space that needs to be
1380576b1c6SShaohua Li 					 * reclaimed.  if it's 0, reclaim spaces
1390576b1c6SShaohua Li 					 * used by io_units which are in
1400576b1c6SShaohua Li 					 * IO_UNIT_STRIPE_END state (i.e. reclaim
1410576b1c6SShaohua Li 					 * doesn't wait for a specific io_unit
1420576b1c6SShaohua Li 					 * to switch to IO_UNIT_STRIPE_END
1430576b1c6SShaohua Li 					 * state) */
1440fd22b45SShaohua Li 	wait_queue_head_t iounit_wait;
1450576b1c6SShaohua Li 
146f6bed0efSShaohua Li 	struct list_head no_space_stripes; /* pending stripes, log has no space */
147f6bed0efSShaohua Li 	spinlock_t no_space_stripes_lock;
14856fef7c6SChristoph Hellwig 
14956fef7c6SChristoph Hellwig 	bool need_cache_flush;
1502ded3703SSong Liu 
1512ded3703SSong Liu 	/* for r5c_cache */
1522ded3703SSong Liu 	enum r5c_journal_mode r5c_journal_mode;
153*a39f7afdSSong Liu 
154*a39f7afdSSong Liu 	/* all stripes in r5cache, in the order of seq at sh->log_start */
155*a39f7afdSSong Liu 	struct list_head stripe_in_journal_list;
156*a39f7afdSSong Liu 
157*a39f7afdSSong Liu 	spinlock_t stripe_in_journal_lock;
158*a39f7afdSSong Liu 	atomic_t stripe_in_journal_count;
159f6bed0efSShaohua Li };
160f6bed0efSShaohua Li 
161f6bed0efSShaohua Li /*
162f6bed0efSShaohua Li  * an IO range starts at a meta data block and ends at the next meta data
163f6bed0efSShaohua Li  * block. The io unit's meta data block tracks the data/parity that follows it.
164f6bed0efSShaohua Li  * The io unit is written to the log disk with a normal write; as we always
165f6bed0efSShaohua Li  * flush the log disk first and only then start moving data to the raid disks,
166f6bed0efSShaohua Li  * there is no requirement to write the io unit with FLUSH/FUA
167f6bed0efSShaohua Li  */
168f6bed0efSShaohua Li struct r5l_io_unit {
169f6bed0efSShaohua Li 	struct r5l_log *log;
170f6bed0efSShaohua Li 
171f6bed0efSShaohua Li 	struct page *meta_page;	/* store meta block */
172f6bed0efSShaohua Li 	int meta_offset;	/* current offset in meta_page */
173f6bed0efSShaohua Li 
174f6bed0efSShaohua Li 	struct bio *current_bio;/* current_bio accepting new data */
175f6bed0efSShaohua Li 
176f6bed0efSShaohua Li 	atomic_t pending_stripe;/* how many stripes not flushed to raid */
177f6bed0efSShaohua Li 	u64 seq;		/* seq number of the metablock */
178f6bed0efSShaohua Li 	sector_t log_start;	/* where the io_unit starts */
179f6bed0efSShaohua Li 	sector_t log_end;	/* where the io_unit ends */
180f6bed0efSShaohua Li 	struct list_head log_sibling; /* log->running_ios */
181f6bed0efSShaohua Li 	struct list_head stripe_list; /* stripes added to the io_unit */
182f6bed0efSShaohua Li 
183f6bed0efSShaohua Li 	int state;
1846143e2ceSChristoph Hellwig 	bool need_split_bio;
185f6bed0efSShaohua Li };
186f6bed0efSShaohua Li 
187f6bed0efSShaohua Li /* r5l_io_unit state */
188f6bed0efSShaohua Li enum r5l_io_unit_state {
189f6bed0efSShaohua Li 	IO_UNIT_RUNNING = 0,	/* accepting new IO */
190f6bed0efSShaohua Li 	IO_UNIT_IO_START = 1,	/* io_unit bio has started writing to the log,
191f6bed0efSShaohua Li 				 * don't accept new bios */
192f6bed0efSShaohua Li 	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to the log */
193a8c34f91SShaohua Li 	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
194f6bed0efSShaohua Li };
195f6bed0efSShaohua Li 
1962ded3703SSong Liu bool r5c_is_writeback(struct r5l_log *log)
1972ded3703SSong Liu {
1982ded3703SSong Liu 	return (log != NULL &&
1992ded3703SSong Liu 		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
2002ded3703SSong Liu }
2012ded3703SSong Liu 
202f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
203f6bed0efSShaohua Li {
204f6bed0efSShaohua Li 	start += inc;
205f6bed0efSShaohua Li 	if (start >= log->device_size)
206f6bed0efSShaohua Li 		start = start - log->device_size;
207f6bed0efSShaohua Li 	return start;
208f6bed0efSShaohua Li }
209f6bed0efSShaohua Li 
210f6bed0efSShaohua Li static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
211f6bed0efSShaohua Li 				  sector_t end)
212f6bed0efSShaohua Li {
213f6bed0efSShaohua Li 	if (end >= start)
214f6bed0efSShaohua Li 		return end - start;
215f6bed0efSShaohua Li 	else
216f6bed0efSShaohua Li 		return end + log->device_size - start;
217f6bed0efSShaohua Li }
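/*
 * Example of the ring arithmetic above (illustrative values): with
 * log->device_size == 1024 sectors, r5l_ring_add(log, 1020, 8) wraps to 4,
 * and r5l_ring_distance(log, 1020, 4) == 4 + 1024 - 1020 == 8 sectors.
 */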
218f6bed0efSShaohua Li 
219f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
220f6bed0efSShaohua Li {
221f6bed0efSShaohua Li 	sector_t used_size;
222f6bed0efSShaohua Li 
223f6bed0efSShaohua Li 	used_size = r5l_ring_distance(log, log->last_checkpoint,
224f6bed0efSShaohua Li 					log->log_start);
225f6bed0efSShaohua Li 
226f6bed0efSShaohua Li 	return log->device_size > used_size + size;
227f6bed0efSShaohua Li }
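/*
 * Continuing the example: with last_checkpoint == 1020 and log_start == 4,
 * used_size == 8 sectors, so r5l_has_free_space() allows a 1000-sector
 * reservation (1024 > 1008) but not a 1020-sector one (1024 > 1028 is false).
 */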
228f6bed0efSShaohua Li 
229f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
230f6bed0efSShaohua Li 				    enum r5l_io_unit_state state)
231f6bed0efSShaohua Li {
232f6bed0efSShaohua Li 	if (WARN_ON(io->state >= state))
233f6bed0efSShaohua Li 		return;
234f6bed0efSShaohua Li 	io->state = state;
235f6bed0efSShaohua Li }
236f6bed0efSShaohua Li 
2371e6d690bSSong Liu static void
2381e6d690bSSong Liu r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
2391e6d690bSSong Liu 			      struct bio_list *return_bi)
2401e6d690bSSong Liu {
2411e6d690bSSong Liu 	struct bio *wbi, *wbi2;
2421e6d690bSSong Liu 
2431e6d690bSSong Liu 	wbi = dev->written;
2441e6d690bSSong Liu 	dev->written = NULL;
2451e6d690bSSong Liu 	while (wbi && wbi->bi_iter.bi_sector <
2461e6d690bSSong Liu 	       dev->sector + STRIPE_SECTORS) {
2471e6d690bSSong Liu 		wbi2 = r5_next_bio(wbi, dev->sector);
2481e6d690bSSong Liu 		if (!raid5_dec_bi_active_stripes(wbi)) {
2491e6d690bSSong Liu 			md_write_end(conf->mddev);
2501e6d690bSSong Liu 			bio_list_add(return_bi, wbi);
2511e6d690bSSong Liu 		}
2521e6d690bSSong Liu 		wbi = wbi2;
2531e6d690bSSong Liu 	}
2541e6d690bSSong Liu }
2551e6d690bSSong Liu 
2561e6d690bSSong Liu void r5c_handle_cached_data_endio(struct r5conf *conf,
2571e6d690bSSong Liu 	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
2581e6d690bSSong Liu {
2591e6d690bSSong Liu 	int i;
2601e6d690bSSong Liu 
2611e6d690bSSong Liu 	for (i = sh->disks; i--; ) {
2621e6d690bSSong Liu 		if (sh->dev[i].written) {
2631e6d690bSSong Liu 			set_bit(R5_UPTODATE, &sh->dev[i].flags);
2641e6d690bSSong Liu 			r5c_return_dev_pending_writes(conf, &sh->dev[i],
2651e6d690bSSong Liu 						      return_bi);
2661e6d690bSSong Liu 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2671e6d690bSSong Liu 					STRIPE_SECTORS,
2681e6d690bSSong Liu 					!test_bit(STRIPE_DEGRADED, &sh->state),
2691e6d690bSSong Liu 					0);
2701e6d690bSSong Liu 		}
2711e6d690bSSong Liu 	}
2721e6d690bSSong Liu }
2731e6d690bSSong Liu 
274*a39f7afdSSong Liu /* Check whether we should flush some stripes to free up stripe cache */
275*a39f7afdSSong Liu void r5c_check_stripe_cache_usage(struct r5conf *conf)
276*a39f7afdSSong Liu {
277*a39f7afdSSong Liu 	int total_cached;
278*a39f7afdSSong Liu 
279*a39f7afdSSong Liu 	if (!r5c_is_writeback(conf->log))
280*a39f7afdSSong Liu 		return;
281*a39f7afdSSong Liu 
282*a39f7afdSSong Liu 	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
283*a39f7afdSSong Liu 		atomic_read(&conf->r5c_cached_full_stripes);
284*a39f7afdSSong Liu 
285*a39f7afdSSong Liu 	/*
286*a39f7afdSSong Liu 	 * The following condition is true for either of the following:
287*a39f7afdSSong Liu 	 *   - stripe cache pressure high:
288*a39f7afdSSong Liu 	 *          total_cached > 3/4 min_nr_stripes ||
289*a39f7afdSSong Liu 	 *          empty_inactive_list_nr > 0
290*a39f7afdSSong Liu 	 *   - stripe cache pressure moderate:
291*a39f7afdSSong Liu 	 *          total_cached > 1/2 min_nr_stripes
292*a39f7afdSSong Liu 	 */
293*a39f7afdSSong Liu 	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
294*a39f7afdSSong Liu 	    atomic_read(&conf->empty_inactive_list_nr) > 0)
295*a39f7afdSSong Liu 		r5l_wake_reclaim(conf->log, 0);
296*a39f7afdSSong Liu }
297*a39f7afdSSong Liu 
298*a39f7afdSSong Liu /*
299*a39f7afdSSong Liu  * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
300*a39f7afdSSong Liu  * stripes in the cache
301*a39f7afdSSong Liu  */
302*a39f7afdSSong Liu void r5c_check_cached_full_stripe(struct r5conf *conf)
303*a39f7afdSSong Liu {
304*a39f7afdSSong Liu 	if (!r5c_is_writeback(conf->log))
305*a39f7afdSSong Liu 		return;
306*a39f7afdSSong Liu 
307*a39f7afdSSong Liu 	/*
308*a39f7afdSSong Liu 	 * wake up reclaim once there are R5C_FULL_STRIPE_FLUSH_BATCH cached full
309*a39f7afdSSong Liu 	 * stripes, or a full chunk's worth (chunk size / 4k), whichever is smaller.
310*a39f7afdSSong Liu 	 */
311*a39f7afdSSong Liu 	if (atomic_read(&conf->r5c_cached_full_stripes) >=
312*a39f7afdSSong Liu 	    min(R5C_FULL_STRIPE_FLUSH_BATCH,
313*a39f7afdSSong Liu 		conf->chunk_sectors >> STRIPE_SHIFT))
314*a39f7afdSSong Liu 		r5l_wake_reclaim(conf->log, 0);
315*a39f7afdSSong Liu }
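/*
 * Illustrative numbers (assuming STRIPE_SHIFT == PAGE_SHIFT - 9, i.e. each 4k
 * stripe covers 8 sectors): with a 512KiB chunk, chunk_sectors == 1024 and
 * chunk_sectors >> STRIPE_SHIFT == 128, so reclaim is woken once
 * min(R5C_FULL_STRIPE_FLUSH_BATCH, 128) == 128 full stripes are cached.
 */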
316*a39f7afdSSong Liu 
317*a39f7afdSSong Liu /*
318*a39f7afdSSong Liu  * Total log space (in sectors) needed to flush all data in cache
319*a39f7afdSSong Liu  *
320*a39f7afdSSong Liu  * Currently, writing-out phase automatically includes all pending writes
321*a39f7afdSSong Liu  * to the same sector. So the reclaim of each stripe takes up to
322*a39f7afdSSong Liu  * (conf->raid_disks + 1) pages of log space.
323*a39f7afdSSong Liu  *
324*a39f7afdSSong Liu  * To totally avoid deadlock due to log space, the code reserves
325*a39f7afdSSong Liu  * (conf->raid_disks + 1) pages for each stripe in cache, which is not
326*a39f7afdSSong Liu  * necessary in most cases.
327*a39f7afdSSong Liu  *
328*a39f7afdSSong Liu  * To improve this, we will need writing-out phase to be able to NOT include
329*a39f7afdSSong Liu  * pending writes, which will reduce the requirement to
330*a39f7afdSSong Liu  * (conf->max_degraded + 1) pages per stripe in cache.
331*a39f7afdSSong Liu  */
332*a39f7afdSSong Liu static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
333*a39f7afdSSong Liu {
334*a39f7afdSSong Liu 	struct r5l_log *log = conf->log;
335*a39f7afdSSong Liu 
336*a39f7afdSSong Liu 	if (!r5c_is_writeback(log))
337*a39f7afdSSong Liu 		return 0;
338*a39f7afdSSong Liu 
339*a39f7afdSSong Liu 	return BLOCK_SECTORS * (conf->raid_disks + 1) *
340*a39f7afdSSong Liu 		atomic_read(&log->stripe_in_journal_count);
341*a39f7afdSSong Liu }
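/*
 * Worked example of the reservation above (illustrative numbers): on a 6-disk
 * array (conf->raid_disks == 6) with 100 stripes in the journal, the space
 * required to flush the cache is 8 * (6 + 1) * 100 == 5600 sectors (~2.7MiB).
 */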
342*a39f7afdSSong Liu 
343*a39f7afdSSong Liu /*
344*a39f7afdSSong Liu  * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
345*a39f7afdSSong Liu  *
346*a39f7afdSSong Liu  * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
347*a39f7afdSSong Liu  * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
348*a39f7afdSSong Liu  * device is less than 2x of reclaim_required_space.
349*a39f7afdSSong Liu  */
350*a39f7afdSSong Liu static inline void r5c_update_log_state(struct r5l_log *log)
351*a39f7afdSSong Liu {
352*a39f7afdSSong Liu 	struct r5conf *conf = log->rdev->mddev->private;
353*a39f7afdSSong Liu 	sector_t free_space;
354*a39f7afdSSong Liu 	sector_t reclaim_space;
355*a39f7afdSSong Liu 
356*a39f7afdSSong Liu 	if (!r5c_is_writeback(log))
357*a39f7afdSSong Liu 		return;
358*a39f7afdSSong Liu 
359*a39f7afdSSong Liu 	free_space = r5l_ring_distance(log, log->log_start,
360*a39f7afdSSong Liu 				       log->last_checkpoint);
361*a39f7afdSSong Liu 	reclaim_space = r5c_log_required_to_flush_cache(conf);
362*a39f7afdSSong Liu 	if (free_space < 2 * reclaim_space)
363*a39f7afdSSong Liu 		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
364*a39f7afdSSong Liu 	else
365*a39f7afdSSong Liu 		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
366*a39f7afdSSong Liu 	if (free_space < 3 * reclaim_space)
367*a39f7afdSSong Liu 		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
368*a39f7afdSSong Liu 	else
369*a39f7afdSSong Liu 		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
370*a39f7afdSSong Liu }
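/*
 * Continuing the example above: with reclaim_space == 5600 sectors,
 * R5C_LOG_CRITICAL is set once free_space drops below 2 * 5600 == 11200
 * sectors, and R5C_LOG_TIGHT once it drops below 3 * 5600 == 16800 sectors.
 */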
371*a39f7afdSSong Liu 
3722ded3703SSong Liu /*
3732ded3703SSong Liu  * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
3742ded3703SSong Liu  * This function should only be called in write-back mode.
3752ded3703SSong Liu  */
376*a39f7afdSSong Liu void r5c_make_stripe_write_out(struct stripe_head *sh)
3772ded3703SSong Liu {
3782ded3703SSong Liu 	struct r5conf *conf = sh->raid_conf;
3792ded3703SSong Liu 	struct r5l_log *log = conf->log;
3802ded3703SSong Liu 
3812ded3703SSong Liu 	BUG_ON(!r5c_is_writeback(log));
3822ded3703SSong Liu 
3832ded3703SSong Liu 	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
3842ded3703SSong Liu 	clear_bit(STRIPE_R5C_CACHING, &sh->state);
3851e6d690bSSong Liu 
3861e6d690bSSong Liu 	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3871e6d690bSSong Liu 		atomic_inc(&conf->preread_active_stripes);
3881e6d690bSSong Liu 
3891e6d690bSSong Liu 	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
3901e6d690bSSong Liu 		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
3911e6d690bSSong Liu 		atomic_dec(&conf->r5c_cached_partial_stripes);
3921e6d690bSSong Liu 	}
3931e6d690bSSong Liu 
3941e6d690bSSong Liu 	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
3951e6d690bSSong Liu 		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
3961e6d690bSSong Liu 		atomic_dec(&conf->r5c_cached_full_stripes);
3971e6d690bSSong Liu 	}
3981e6d690bSSong Liu }
3991e6d690bSSong Liu 
4001e6d690bSSong Liu static void r5c_handle_data_cached(struct stripe_head *sh)
4011e6d690bSSong Liu {
4021e6d690bSSong Liu 	int i;
4031e6d690bSSong Liu 
4041e6d690bSSong Liu 	for (i = sh->disks; i--; )
4051e6d690bSSong Liu 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
4061e6d690bSSong Liu 			set_bit(R5_InJournal, &sh->dev[i].flags);
4071e6d690bSSong Liu 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
4081e6d690bSSong Liu 		}
4091e6d690bSSong Liu 	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
4101e6d690bSSong Liu }
4111e6d690bSSong Liu 
4121e6d690bSSong Liu /*
4131e6d690bSSong Liu  * this journal write must contain full parity,
4141e6d690bSSong Liu  * it may also contain some data pages
4151e6d690bSSong Liu  */
4161e6d690bSSong Liu static void r5c_handle_parity_cached(struct stripe_head *sh)
4171e6d690bSSong Liu {
4181e6d690bSSong Liu 	int i;
4191e6d690bSSong Liu 
4201e6d690bSSong Liu 	for (i = sh->disks; i--; )
4211e6d690bSSong Liu 		if (test_bit(R5_InJournal, &sh->dev[i].flags))
4221e6d690bSSong Liu 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
4232ded3703SSong Liu }
4242ded3703SSong Liu 
4252ded3703SSong Liu /*
4262ded3703SSong Liu  * Setting proper flags after writing (or flushing) data and/or parity to the
4272ded3703SSong Liu  * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
4282ded3703SSong Liu  */
4292ded3703SSong Liu static void r5c_finish_cache_stripe(struct stripe_head *sh)
4302ded3703SSong Liu {
4312ded3703SSong Liu 	struct r5l_log *log = sh->raid_conf->log;
4322ded3703SSong Liu 
4332ded3703SSong Liu 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
4342ded3703SSong Liu 		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
4352ded3703SSong Liu 		/*
4362ded3703SSong Liu 		 * Set R5_InJournal for parity dev[pd_idx]. This means
4372ded3703SSong Liu 		 * all data AND parity in the journal. For RAID 6, it is
4382ded3703SSong Liu 		 * NOT necessary to set the flag for dev[qd_idx], as the
4392ded3703SSong Liu 		 * two parities are written out together.
4402ded3703SSong Liu 		 */
4412ded3703SSong Liu 		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
4421e6d690bSSong Liu 	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
4431e6d690bSSong Liu 		r5c_handle_data_cached(sh);
4441e6d690bSSong Liu 	} else {
4451e6d690bSSong Liu 		r5c_handle_parity_cached(sh);
4461e6d690bSSong Liu 		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
4471e6d690bSSong Liu 	}
4482ded3703SSong Liu }
4492ded3703SSong Liu 
450d8858f43SChristoph Hellwig static void r5l_io_run_stripes(struct r5l_io_unit *io)
451d8858f43SChristoph Hellwig {
452d8858f43SChristoph Hellwig 	struct stripe_head *sh, *next;
453d8858f43SChristoph Hellwig 
454d8858f43SChristoph Hellwig 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
455d8858f43SChristoph Hellwig 		list_del_init(&sh->log_list);
4562ded3703SSong Liu 
4572ded3703SSong Liu 		r5c_finish_cache_stripe(sh);
4582ded3703SSong Liu 
459d8858f43SChristoph Hellwig 		set_bit(STRIPE_HANDLE, &sh->state);
460d8858f43SChristoph Hellwig 		raid5_release_stripe(sh);
461d8858f43SChristoph Hellwig 	}
462d8858f43SChristoph Hellwig }
463d8858f43SChristoph Hellwig 
46456fef7c6SChristoph Hellwig static void r5l_log_run_stripes(struct r5l_log *log)
46556fef7c6SChristoph Hellwig {
46656fef7c6SChristoph Hellwig 	struct r5l_io_unit *io, *next;
46756fef7c6SChristoph Hellwig 
46856fef7c6SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
46956fef7c6SChristoph Hellwig 
47056fef7c6SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
47156fef7c6SChristoph Hellwig 		/* don't change list order */
47256fef7c6SChristoph Hellwig 		if (io->state < IO_UNIT_IO_END)
47356fef7c6SChristoph Hellwig 			break;
47456fef7c6SChristoph Hellwig 
47556fef7c6SChristoph Hellwig 		list_move_tail(&io->log_sibling, &log->finished_ios);
47656fef7c6SChristoph Hellwig 		r5l_io_run_stripes(io);
47756fef7c6SChristoph Hellwig 	}
47856fef7c6SChristoph Hellwig }
47956fef7c6SChristoph Hellwig 
4803848c0bcSChristoph Hellwig static void r5l_move_to_end_ios(struct r5l_log *log)
4813848c0bcSChristoph Hellwig {
4823848c0bcSChristoph Hellwig 	struct r5l_io_unit *io, *next;
4833848c0bcSChristoph Hellwig 
4843848c0bcSChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
4853848c0bcSChristoph Hellwig 
4863848c0bcSChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
4873848c0bcSChristoph Hellwig 		/* don't change list order */
4883848c0bcSChristoph Hellwig 		if (io->state < IO_UNIT_IO_END)
4893848c0bcSChristoph Hellwig 			break;
4903848c0bcSChristoph Hellwig 		list_move_tail(&io->log_sibling, &log->io_end_ios);
4913848c0bcSChristoph Hellwig 	}
4923848c0bcSChristoph Hellwig }
4933848c0bcSChristoph Hellwig 
494f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio)
495f6bed0efSShaohua Li {
496f6bed0efSShaohua Li 	struct r5l_io_unit *io = bio->bi_private;
497f6bed0efSShaohua Li 	struct r5l_log *log = io->log;
498509ffec7SChristoph Hellwig 	unsigned long flags;
499f6bed0efSShaohua Li 
5006e74a9cfSShaohua Li 	if (bio->bi_error)
5016e74a9cfSShaohua Li 		md_error(log->rdev->mddev, log->rdev);
5026e74a9cfSShaohua Li 
503f6bed0efSShaohua Li 	bio_put(bio);
504e8deb638SChristoph Hellwig 	mempool_free(io->meta_page, log->meta_pool);
505f6bed0efSShaohua Li 
506509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
507509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
50856fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
5093848c0bcSChristoph Hellwig 		r5l_move_to_end_ios(log);
51056fef7c6SChristoph Hellwig 	else
51156fef7c6SChristoph Hellwig 		r5l_log_run_stripes(log);
512509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
513509ffec7SChristoph Hellwig 
51456fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
515f6bed0efSShaohua Li 		md_wakeup_thread(log->rdev->mddev->thread);
516f6bed0efSShaohua Li }
517f6bed0efSShaohua Li 
518f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log)
519f6bed0efSShaohua Li {
520f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
521f6bed0efSShaohua Li 	struct r5l_meta_block *block;
522509ffec7SChristoph Hellwig 	unsigned long flags;
523f6bed0efSShaohua Li 	u32 crc;
524f6bed0efSShaohua Li 
525f6bed0efSShaohua Li 	if (!io)
526f6bed0efSShaohua Li 		return;
527f6bed0efSShaohua Li 
528f6bed0efSShaohua Li 	block = page_address(io->meta_page);
529f6bed0efSShaohua Li 	block->meta_size = cpu_to_le32(io->meta_offset);
5305cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
531f6bed0efSShaohua Li 	block->checksum = cpu_to_le32(crc);
532f6bed0efSShaohua Li 
533f6bed0efSShaohua Li 	log->current_io = NULL;
534509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
535509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
536509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
537f6bed0efSShaohua Li 
5384e49ea4aSMike Christie 	submit_bio(io->current_bio);
539f6bed0efSShaohua Li }
540f6bed0efSShaohua Li 
5416143e2ceSChristoph Hellwig static struct bio *r5l_bio_alloc(struct r5l_log *log)
542b349feb3SChristoph Hellwig {
543c38d29b3SChristoph Hellwig 	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
544b349feb3SChristoph Hellwig 
545796a5cf0SMike Christie 	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
546b349feb3SChristoph Hellwig 	bio->bi_bdev = log->rdev->bdev;
5471e932a37SChristoph Hellwig 	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
548b349feb3SChristoph Hellwig 
549b349feb3SChristoph Hellwig 	return bio;
550b349feb3SChristoph Hellwig }
551b349feb3SChristoph Hellwig 
552c1b99198SChristoph Hellwig static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
553c1b99198SChristoph Hellwig {
554c1b99198SChristoph Hellwig 	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
555c1b99198SChristoph Hellwig 
556*a39f7afdSSong Liu 	r5c_update_log_state(log);
557c1b99198SChristoph Hellwig 	/*
558c1b99198SChristoph Hellwig 	 * If we filled up the log device, start from the beginning again,
559c1b99198SChristoph Hellwig 	 * which will require a new bio.
560c1b99198SChristoph Hellwig 	 *
561c1b99198SChristoph Hellwig 	 * Note: for this to work properly the log size needs to be a multiple
562c1b99198SChristoph Hellwig 	 * of BLOCK_SECTORS.
563c1b99198SChristoph Hellwig 	 */
564c1b99198SChristoph Hellwig 	if (log->log_start == 0)
5656143e2ceSChristoph Hellwig 		io->need_split_bio = true;
566c1b99198SChristoph Hellwig 
567c1b99198SChristoph Hellwig 	io->log_end = log->log_start;
568c1b99198SChristoph Hellwig }
569c1b99198SChristoph Hellwig 
570f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
571f6bed0efSShaohua Li {
572f6bed0efSShaohua Li 	struct r5l_io_unit *io;
573f6bed0efSShaohua Li 	struct r5l_meta_block *block;
574f6bed0efSShaohua Li 
5755036c390SChristoph Hellwig 	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
5765036c390SChristoph Hellwig 	if (!io)
5775036c390SChristoph Hellwig 		return NULL;
5785036c390SChristoph Hellwig 	memset(io, 0, sizeof(*io));
5795036c390SChristoph Hellwig 
58051039cd0SChristoph Hellwig 	io->log = log;
58151039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->log_sibling);
58251039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->stripe_list);
58351039cd0SChristoph Hellwig 	io->state = IO_UNIT_RUNNING;
584f6bed0efSShaohua Li 
585e8deb638SChristoph Hellwig 	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
586f6bed0efSShaohua Li 	block = page_address(io->meta_page);
587e8deb638SChristoph Hellwig 	clear_page(block);
588f6bed0efSShaohua Li 	block->magic = cpu_to_le32(R5LOG_MAGIC);
589f6bed0efSShaohua Li 	block->version = R5LOG_VERSION;
590f6bed0efSShaohua Li 	block->seq = cpu_to_le64(log->seq);
591f6bed0efSShaohua Li 	block->position = cpu_to_le64(log->log_start);
592f6bed0efSShaohua Li 
593f6bed0efSShaohua Li 	io->log_start = log->log_start;
594f6bed0efSShaohua Li 	io->meta_offset = sizeof(struct r5l_meta_block);
5952b8ef16eSChristoph Hellwig 	io->seq = log->seq++;
596f6bed0efSShaohua Li 
5976143e2ceSChristoph Hellwig 	io->current_bio = r5l_bio_alloc(log);
5986143e2ceSChristoph Hellwig 	io->current_bio->bi_end_io = r5l_log_endio;
5996143e2ceSChristoph Hellwig 	io->current_bio->bi_private = io;
600b349feb3SChristoph Hellwig 	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
601f6bed0efSShaohua Li 
602c1b99198SChristoph Hellwig 	r5_reserve_log_entry(log, io);
603f6bed0efSShaohua Li 
604f6bed0efSShaohua Li 	spin_lock_irq(&log->io_list_lock);
605f6bed0efSShaohua Li 	list_add_tail(&io->log_sibling, &log->running_ios);
606f6bed0efSShaohua Li 	spin_unlock_irq(&log->io_list_lock);
607f6bed0efSShaohua Li 
608f6bed0efSShaohua Li 	return io;
609f6bed0efSShaohua Li }
610f6bed0efSShaohua Li 
611f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
612f6bed0efSShaohua Li {
61322581f58SChristoph Hellwig 	if (log->current_io &&
61422581f58SChristoph Hellwig 	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
615f6bed0efSShaohua Li 		r5l_submit_current_io(log);
616f6bed0efSShaohua Li 
6175036c390SChristoph Hellwig 	if (!log->current_io) {
618f6bed0efSShaohua Li 		log->current_io = r5l_new_meta(log);
6195036c390SChristoph Hellwig 		if (!log->current_io)
6205036c390SChristoph Hellwig 			return -ENOMEM;
6215036c390SChristoph Hellwig 	}
6225036c390SChristoph Hellwig 
623f6bed0efSShaohua Li 	return 0;
624f6bed0efSShaohua Li }
625f6bed0efSShaohua Li 
626f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
627f6bed0efSShaohua Li 				    sector_t location,
628f6bed0efSShaohua Li 				    u32 checksum1, u32 checksum2,
629f6bed0efSShaohua Li 				    bool checksum2_valid)
630f6bed0efSShaohua Li {
631f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
632f6bed0efSShaohua Li 	struct r5l_payload_data_parity *payload;
633f6bed0efSShaohua Li 
634f6bed0efSShaohua Li 	payload = page_address(io->meta_page) + io->meta_offset;
635f6bed0efSShaohua Li 	payload->header.type = cpu_to_le16(type);
636f6bed0efSShaohua Li 	payload->header.flags = cpu_to_le16(0);
637f6bed0efSShaohua Li 	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
638f6bed0efSShaohua Li 				    (PAGE_SHIFT - 9));
639f6bed0efSShaohua Li 	payload->location = cpu_to_le64(location);
640f6bed0efSShaohua Li 	payload->checksum[0] = cpu_to_le32(checksum1);
641f6bed0efSShaohua Li 	if (checksum2_valid)
642f6bed0efSShaohua Li 		payload->checksum[1] = cpu_to_le32(checksum2);
643f6bed0efSShaohua Li 
644f6bed0efSShaohua Li 	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
645f6bed0efSShaohua Li 		sizeof(__le32) * (1 + !!checksum2_valid);
646f6bed0efSShaohua Li }
647f6bed0efSShaohua Li 
648f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
649f6bed0efSShaohua Li {
650f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
651f6bed0efSShaohua Li 
6526143e2ceSChristoph Hellwig 	if (io->need_split_bio) {
6536143e2ceSChristoph Hellwig 		struct bio *prev = io->current_bio;
654f6bed0efSShaohua Li 
6556143e2ceSChristoph Hellwig 		io->current_bio = r5l_bio_alloc(log);
6566143e2ceSChristoph Hellwig 		bio_chain(io->current_bio, prev);
6576143e2ceSChristoph Hellwig 
6584e49ea4aSMike Christie 		submit_bio(prev);
659f6bed0efSShaohua Li 	}
660f6bed0efSShaohua Li 
6616143e2ceSChristoph Hellwig 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
6626143e2ceSChristoph Hellwig 		BUG();
6636143e2ceSChristoph Hellwig 
664c1b99198SChristoph Hellwig 	r5_reserve_log_entry(log, io);
665f6bed0efSShaohua Li }
666f6bed0efSShaohua Li 
6675036c390SChristoph Hellwig static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
668f6bed0efSShaohua Li 			   int data_pages, int parity_pages)
669f6bed0efSShaohua Li {
670f6bed0efSShaohua Li 	int i;
671f6bed0efSShaohua Li 	int meta_size;
6725036c390SChristoph Hellwig 	int ret;
673f6bed0efSShaohua Li 	struct r5l_io_unit *io;
674f6bed0efSShaohua Li 
675f6bed0efSShaohua Li 	meta_size =
676f6bed0efSShaohua Li 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
677f6bed0efSShaohua Li 		 * data_pages) +
678f6bed0efSShaohua Li 		sizeof(struct r5l_payload_data_parity) +
679f6bed0efSShaohua Li 		sizeof(__le32) * parity_pages;
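	/*
	 * Illustrative example: for 3 data pages and 2 parity pages (RAID6),
	 * meta_size is 3 * (sizeof(struct r5l_payload_data_parity) + 4) +
	 * sizeof(struct r5l_payload_data_parity) + 2 * 4 bytes, i.e. one
	 * payload descriptor plus one checksum per data page, and one
	 * descriptor with two checksums for the parity pair.
	 */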
680f6bed0efSShaohua Li 
6815036c390SChristoph Hellwig 	ret = r5l_get_meta(log, meta_size);
6825036c390SChristoph Hellwig 	if (ret)
6835036c390SChristoph Hellwig 		return ret;
6845036c390SChristoph Hellwig 
685f6bed0efSShaohua Li 	io = log->current_io;
686f6bed0efSShaohua Li 
687f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
6881e6d690bSSong Liu 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
6891e6d690bSSong Liu 		    test_bit(R5_InJournal, &sh->dev[i].flags))
690f6bed0efSShaohua Li 			continue;
691f6bed0efSShaohua Li 		if (i == sh->pd_idx || i == sh->qd_idx)
692f6bed0efSShaohua Li 			continue;
693f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
694f6bed0efSShaohua Li 					raid5_compute_blocknr(sh, i, 0),
695f6bed0efSShaohua Li 					sh->dev[i].log_checksum, 0, false);
696f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[i].page);
697f6bed0efSShaohua Li 	}
698f6bed0efSShaohua Li 
6992ded3703SSong Liu 	if (parity_pages == 2) {
700f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
701f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
702f6bed0efSShaohua Li 					sh->dev[sh->qd_idx].log_checksum, true);
703f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
704f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
7052ded3703SSong Liu 	} else if (parity_pages == 1) {
706f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
707f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
708f6bed0efSShaohua Li 					0, false);
709f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
7102ded3703SSong Liu 	} else  /* Just writing data, not parity, in caching phase */
7112ded3703SSong Liu 		BUG_ON(parity_pages != 0);
712f6bed0efSShaohua Li 
713f6bed0efSShaohua Li 	list_add_tail(&sh->log_list, &io->stripe_list);
714f6bed0efSShaohua Li 	atomic_inc(&io->pending_stripe);
715f6bed0efSShaohua Li 	sh->log_io = io;
7165036c390SChristoph Hellwig 
717*a39f7afdSSong Liu 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
718*a39f7afdSSong Liu 		return 0;
719*a39f7afdSSong Liu 
720*a39f7afdSSong Liu 	if (sh->log_start == MaxSector) {
721*a39f7afdSSong Liu 		BUG_ON(!list_empty(&sh->r5c));
722*a39f7afdSSong Liu 		sh->log_start = io->log_start;
723*a39f7afdSSong Liu 		spin_lock_irq(&log->stripe_in_journal_lock);
724*a39f7afdSSong Liu 		list_add_tail(&sh->r5c,
725*a39f7afdSSong Liu 			      &log->stripe_in_journal_list);
726*a39f7afdSSong Liu 		spin_unlock_irq(&log->stripe_in_journal_lock);
727*a39f7afdSSong Liu 		atomic_inc(&log->stripe_in_journal_count);
728*a39f7afdSSong Liu 	}
7295036c390SChristoph Hellwig 	return 0;
730f6bed0efSShaohua Li }
731f6bed0efSShaohua Li 
732*a39f7afdSSong Liu /* add stripe to no_space_stripes, and then wake up reclaim */
733*a39f7afdSSong Liu static inline void r5l_add_no_space_stripe(struct r5l_log *log,
734*a39f7afdSSong Liu 					   struct stripe_head *sh)
735*a39f7afdSSong Liu {
736*a39f7afdSSong Liu 	spin_lock(&log->no_space_stripes_lock);
737*a39f7afdSSong Liu 	list_add_tail(&sh->log_list, &log->no_space_stripes);
738*a39f7afdSSong Liu 	spin_unlock(&log->no_space_stripes_lock);
739*a39f7afdSSong Liu }
740*a39f7afdSSong Liu 
741f6bed0efSShaohua Li /*
742f6bed0efSShaohua Li  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
743f6bed0efSShaohua Li  * data from log to raid disks), so we shouldn't wait for reclaim here
744f6bed0efSShaohua Li  */
745f6bed0efSShaohua Li int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
746f6bed0efSShaohua Li {
747*a39f7afdSSong Liu 	struct r5conf *conf = sh->raid_conf;
748f6bed0efSShaohua Li 	int write_disks = 0;
749f6bed0efSShaohua Li 	int data_pages, parity_pages;
750f6bed0efSShaohua Li 	int reserve;
751f6bed0efSShaohua Li 	int i;
7525036c390SChristoph Hellwig 	int ret = 0;
753*a39f7afdSSong Liu 	bool wake_reclaim = false;
754f6bed0efSShaohua Li 
755f6bed0efSShaohua Li 	if (!log)
756f6bed0efSShaohua Li 		return -EAGAIN;
757f6bed0efSShaohua Li 	/* Don't support stripe batch */
758f6bed0efSShaohua Li 	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
759f6bed0efSShaohua Li 	    test_bit(STRIPE_SYNCING, &sh->state)) {
760f6bed0efSShaohua Li 		/* the stripe is written to log, we start writing it to raid */
761f6bed0efSShaohua Li 		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
762f6bed0efSShaohua Li 		return -EAGAIN;
763f6bed0efSShaohua Li 	}
764f6bed0efSShaohua Li 
7652ded3703SSong Liu 	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
7662ded3703SSong Liu 
767f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
768f6bed0efSShaohua Li 		void *addr;
769f6bed0efSShaohua Li 
7701e6d690bSSong Liu 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
7711e6d690bSSong Liu 		    test_bit(R5_InJournal, &sh->dev[i].flags))
772f6bed0efSShaohua Li 			continue;
7731e6d690bSSong Liu 
774f6bed0efSShaohua Li 		write_disks++;
775f6bed0efSShaohua Li 		/* checksum is already calculated in last run */
776f6bed0efSShaohua Li 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
777f6bed0efSShaohua Li 			continue;
778f6bed0efSShaohua Li 		addr = kmap_atomic(sh->dev[i].page);
7795cb2fbd6SShaohua Li 		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
780f6bed0efSShaohua Li 						    addr, PAGE_SIZE);
781f6bed0efSShaohua Li 		kunmap_atomic(addr);
782f6bed0efSShaohua Li 	}
783f6bed0efSShaohua Li 	parity_pages = 1 + !!(sh->qd_idx >= 0);
784f6bed0efSShaohua Li 	data_pages = write_disks - parity_pages;
785f6bed0efSShaohua Li 
786f6bed0efSShaohua Li 	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
787253f9fd4SShaohua Li 	/*
788253f9fd4SShaohua Li 	 * The stripe must enter the state machine again to finish the write, so
789253f9fd4SShaohua Li 	 * don't delay.
790253f9fd4SShaohua Li 	 */
791253f9fd4SShaohua Li 	clear_bit(STRIPE_DELAYED, &sh->state);
792f6bed0efSShaohua Li 	atomic_inc(&sh->count);
793f6bed0efSShaohua Li 
794f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
795f6bed0efSShaohua Li 	/* meta + data */
796f6bed0efSShaohua Li 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
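	/*
	 * Illustrative example: with 4k pages each page is 8 sectors
	 * (PAGE_SHIFT - 9 == 3), so a stripe writing 3 data pages plus 1
	 * parity page (write_disks == 4) reserves (1 + 4) << 3 == 40 sectors
	 * for the meta block and the payload pages.
	 */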
797f6bed0efSShaohua Li 
798*a39f7afdSSong Liu 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
799*a39f7afdSSong Liu 		if (!r5l_has_free_space(log, reserve)) {
800*a39f7afdSSong Liu 			r5l_add_no_space_stripe(log, sh);
801*a39f7afdSSong Liu 			wake_reclaim = true;
8025036c390SChristoph Hellwig 		} else {
8035036c390SChristoph Hellwig 			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
8045036c390SChristoph Hellwig 			if (ret) {
8055036c390SChristoph Hellwig 				spin_lock_irq(&log->io_list_lock);
806*a39f7afdSSong Liu 				list_add_tail(&sh->log_list,
807*a39f7afdSSong Liu 					      &log->no_mem_stripes);
8085036c390SChristoph Hellwig 				spin_unlock_irq(&log->io_list_lock);
809f6bed0efSShaohua Li 			}
8105036c390SChristoph Hellwig 		}
811*a39f7afdSSong Liu 	} else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
812*a39f7afdSSong Liu 		/*
813*a39f7afdSSong Liu 		 * log space critical, do not process stripes that are
814*a39f7afdSSong Liu 		 * not in cache yet (sh->log_start == MaxSector).
815*a39f7afdSSong Liu 		 */
816*a39f7afdSSong Liu 		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
817*a39f7afdSSong Liu 		    sh->log_start == MaxSector) {
818*a39f7afdSSong Liu 			r5l_add_no_space_stripe(log, sh);
819*a39f7afdSSong Liu 			wake_reclaim = true;
820*a39f7afdSSong Liu 			reserve = 0;
821*a39f7afdSSong Liu 		} else if (!r5l_has_free_space(log, reserve)) {
822*a39f7afdSSong Liu 			if (sh->log_start == log->last_checkpoint)
823*a39f7afdSSong Liu 				BUG();
824*a39f7afdSSong Liu 			else
825*a39f7afdSSong Liu 				r5l_add_no_space_stripe(log, sh);
826*a39f7afdSSong Liu 		} else {
827*a39f7afdSSong Liu 			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
828*a39f7afdSSong Liu 			if (ret) {
829*a39f7afdSSong Liu 				spin_lock_irq(&log->io_list_lock);
830*a39f7afdSSong Liu 				list_add_tail(&sh->log_list,
831*a39f7afdSSong Liu 					      &log->no_mem_stripes);
832*a39f7afdSSong Liu 				spin_unlock_irq(&log->io_list_lock);
833*a39f7afdSSong Liu 			}
834*a39f7afdSSong Liu 		}
835*a39f7afdSSong Liu 	}
836f6bed0efSShaohua Li 
8375036c390SChristoph Hellwig 	mutex_unlock(&log->io_mutex);
838*a39f7afdSSong Liu 	if (wake_reclaim)
839*a39f7afdSSong Liu 		r5l_wake_reclaim(log, reserve);
840f6bed0efSShaohua Li 	return 0;
841f6bed0efSShaohua Li }
842f6bed0efSShaohua Li 
843f6bed0efSShaohua Li void r5l_write_stripe_run(struct r5l_log *log)
844f6bed0efSShaohua Li {
845f6bed0efSShaohua Li 	if (!log)
846f6bed0efSShaohua Li 		return;
847f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
848f6bed0efSShaohua Li 	r5l_submit_current_io(log);
849f6bed0efSShaohua Li 	mutex_unlock(&log->io_mutex);
850f6bed0efSShaohua Li }
851f6bed0efSShaohua Li 
852828cbe98SShaohua Li int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
853828cbe98SShaohua Li {
854828cbe98SShaohua Li 	if (!log)
855828cbe98SShaohua Li 		return -ENODEV;
856828cbe98SShaohua Li 	/*
857828cbe98SShaohua Li 	 * we flush the log disk cache first, then write stripe data to the raid
858828cbe98SShaohua Li 	 * disks. So if the bio has finished, the log disk cache is already
859828cbe98SShaohua Li 	 * flushed. Recovery guarantees we can recover the bio from the log
860828cbe98SShaohua Li 	 * disk, so we don't need to flush again
861828cbe98SShaohua Li 	 */
862828cbe98SShaohua Li 	if (bio->bi_iter.bi_size == 0) {
863828cbe98SShaohua Li 		bio_endio(bio);
864828cbe98SShaohua Li 		return 0;
865828cbe98SShaohua Li 	}
8661eff9d32SJens Axboe 	bio->bi_opf &= ~REQ_PREFLUSH;
867828cbe98SShaohua Li 	return -EAGAIN;
868828cbe98SShaohua Li }
869828cbe98SShaohua Li 
870f6bed0efSShaohua Li /* This will run after log space is reclaimed */
871f6bed0efSShaohua Li static void r5l_run_no_space_stripes(struct r5l_log *log)
872f6bed0efSShaohua Li {
873f6bed0efSShaohua Li 	struct stripe_head *sh;
874f6bed0efSShaohua Li 
875f6bed0efSShaohua Li 	spin_lock(&log->no_space_stripes_lock);
876f6bed0efSShaohua Li 	while (!list_empty(&log->no_space_stripes)) {
877f6bed0efSShaohua Li 		sh = list_first_entry(&log->no_space_stripes,
878f6bed0efSShaohua Li 				      struct stripe_head, log_list);
879f6bed0efSShaohua Li 		list_del_init(&sh->log_list);
880f6bed0efSShaohua Li 		set_bit(STRIPE_HANDLE, &sh->state);
881f6bed0efSShaohua Li 		raid5_release_stripe(sh);
882f6bed0efSShaohua Li 	}
883f6bed0efSShaohua Li 	spin_unlock(&log->no_space_stripes_lock);
884f6bed0efSShaohua Li }
885f6bed0efSShaohua Li 
886*a39f7afdSSong Liu /*
887*a39f7afdSSong Liu  * calculate new last_checkpoint
888*a39f7afdSSong Liu  * for write through mode, returns log->next_checkpoint
889*a39f7afdSSong Liu  * for write back, returns log_start of first sh in stripe_in_journal_list
890*a39f7afdSSong Liu  */
891*a39f7afdSSong Liu static sector_t r5c_calculate_new_cp(struct r5conf *conf)
892*a39f7afdSSong Liu {
893*a39f7afdSSong Liu 	struct stripe_head *sh;
894*a39f7afdSSong Liu 	struct r5l_log *log = conf->log;
895*a39f7afdSSong Liu 	sector_t new_cp;
896*a39f7afdSSong Liu 	unsigned long flags;
897*a39f7afdSSong Liu 
898*a39f7afdSSong Liu 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
899*a39f7afdSSong Liu 		return log->next_checkpoint;
900*a39f7afdSSong Liu 
901*a39f7afdSSong Liu 	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
902*a39f7afdSSong Liu 	if (list_empty(&conf->log->stripe_in_journal_list)) {
903*a39f7afdSSong Liu 		/* all stripes flushed */
904*a39f7afdSSong Liu 		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
905*a39f7afdSSong Liu 		return log->next_checkpoint;
906*a39f7afdSSong Liu 	}
907*a39f7afdSSong Liu 	sh = list_first_entry(&conf->log->stripe_in_journal_list,
908*a39f7afdSSong Liu 			      struct stripe_head, r5c);
909*a39f7afdSSong Liu 	new_cp = sh->log_start;
910*a39f7afdSSong Liu 	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
911*a39f7afdSSong Liu 	return new_cp;
912*a39f7afdSSong Liu }
913*a39f7afdSSong Liu 
91417036461SChristoph Hellwig static sector_t r5l_reclaimable_space(struct r5l_log *log)
91517036461SChristoph Hellwig {
916*a39f7afdSSong Liu 	struct r5conf *conf = log->rdev->mddev->private;
917*a39f7afdSSong Liu 
91817036461SChristoph Hellwig 	return r5l_ring_distance(log, log->last_checkpoint,
919*a39f7afdSSong Liu 				 r5c_calculate_new_cp(conf));
92017036461SChristoph Hellwig }
92117036461SChristoph Hellwig 
9225036c390SChristoph Hellwig static void r5l_run_no_mem_stripe(struct r5l_log *log)
9235036c390SChristoph Hellwig {
9245036c390SChristoph Hellwig 	struct stripe_head *sh;
9255036c390SChristoph Hellwig 
9265036c390SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
9275036c390SChristoph Hellwig 
9285036c390SChristoph Hellwig 	if (!list_empty(&log->no_mem_stripes)) {
9295036c390SChristoph Hellwig 		sh = list_first_entry(&log->no_mem_stripes,
9305036c390SChristoph Hellwig 				      struct stripe_head, log_list);
9315036c390SChristoph Hellwig 		list_del_init(&sh->log_list);
9325036c390SChristoph Hellwig 		set_bit(STRIPE_HANDLE, &sh->state);
9335036c390SChristoph Hellwig 		raid5_release_stripe(sh);
9345036c390SChristoph Hellwig 	}
9355036c390SChristoph Hellwig }
9365036c390SChristoph Hellwig 
93704732f74SChristoph Hellwig static bool r5l_complete_finished_ios(struct r5l_log *log)
93817036461SChristoph Hellwig {
93917036461SChristoph Hellwig 	struct r5l_io_unit *io, *next;
94017036461SChristoph Hellwig 	bool found = false;
94117036461SChristoph Hellwig 
94217036461SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
94317036461SChristoph Hellwig 
94404732f74SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
94517036461SChristoph Hellwig 		/* don't change list order */
94617036461SChristoph Hellwig 		if (io->state < IO_UNIT_STRIPE_END)
94717036461SChristoph Hellwig 			break;
94817036461SChristoph Hellwig 
94917036461SChristoph Hellwig 		log->next_checkpoint = io->log_start;
95017036461SChristoph Hellwig 		log->next_cp_seq = io->seq;
95117036461SChristoph Hellwig 
95217036461SChristoph Hellwig 		list_del(&io->log_sibling);
9535036c390SChristoph Hellwig 		mempool_free(io, log->io_pool);
9545036c390SChristoph Hellwig 		r5l_run_no_mem_stripe(log);
95517036461SChristoph Hellwig 
95617036461SChristoph Hellwig 		found = true;
95717036461SChristoph Hellwig 	}
95817036461SChristoph Hellwig 
95917036461SChristoph Hellwig 	return found;
96017036461SChristoph Hellwig }
96117036461SChristoph Hellwig 
962509ffec7SChristoph Hellwig static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
963509ffec7SChristoph Hellwig {
964509ffec7SChristoph Hellwig 	struct r5l_log *log = io->log;
965*a39f7afdSSong Liu 	struct r5conf *conf = log->rdev->mddev->private;
966509ffec7SChristoph Hellwig 	unsigned long flags;
967509ffec7SChristoph Hellwig 
968509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
969509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
97017036461SChristoph Hellwig 
97104732f74SChristoph Hellwig 	if (!r5l_complete_finished_ios(log)) {
97285f2f9a4SShaohua Li 		spin_unlock_irqrestore(&log->io_list_lock, flags);
97385f2f9a4SShaohua Li 		return;
97485f2f9a4SShaohua Li 	}
975509ffec7SChristoph Hellwig 
976*a39f7afdSSong Liu 	if (r5l_reclaimable_space(log) > log->max_free_space ||
977*a39f7afdSSong Liu 	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
978509ffec7SChristoph Hellwig 		r5l_wake_reclaim(log, 0);
979509ffec7SChristoph Hellwig 
980509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
981509ffec7SChristoph Hellwig 	wake_up(&log->iounit_wait);
982509ffec7SChristoph Hellwig }
983509ffec7SChristoph Hellwig 
9840576b1c6SShaohua Li void r5l_stripe_write_finished(struct stripe_head *sh)
9850576b1c6SShaohua Li {
9860576b1c6SShaohua Li 	struct r5l_io_unit *io;
9870576b1c6SShaohua Li 
9880576b1c6SShaohua Li 	io = sh->log_io;
9890576b1c6SShaohua Li 	sh->log_io = NULL;
9900576b1c6SShaohua Li 
991509ffec7SChristoph Hellwig 	if (io && atomic_dec_and_test(&io->pending_stripe))
992509ffec7SChristoph Hellwig 		__r5l_stripe_write_finished(io);
9930576b1c6SShaohua Li }
9940576b1c6SShaohua Li 
995a8c34f91SShaohua Li static void r5l_log_flush_endio(struct bio *bio)
996a8c34f91SShaohua Li {
997a8c34f91SShaohua Li 	struct r5l_log *log = container_of(bio, struct r5l_log,
998a8c34f91SShaohua Li 		flush_bio);
999a8c34f91SShaohua Li 	unsigned long flags;
1000a8c34f91SShaohua Li 	struct r5l_io_unit *io;
1001a8c34f91SShaohua Li 
10026e74a9cfSShaohua Li 	if (bio->bi_error)
10036e74a9cfSShaohua Li 		md_error(log->rdev->mddev, log->rdev);
10046e74a9cfSShaohua Li 
1005a8c34f91SShaohua Li 	spin_lock_irqsave(&log->io_list_lock, flags);
1006d8858f43SChristoph Hellwig 	list_for_each_entry(io, &log->flushing_ios, log_sibling)
1007d8858f43SChristoph Hellwig 		r5l_io_run_stripes(io);
100804732f74SChristoph Hellwig 	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1009a8c34f91SShaohua Li 	spin_unlock_irqrestore(&log->io_list_lock, flags);
1010a8c34f91SShaohua Li }
1011a8c34f91SShaohua Li 
10120576b1c6SShaohua Li /*
10130576b1c6SShaohua Li  * Starting to dispatch IO to raid.
10140576b1c6SShaohua Li  * An io_unit (meta) makes up a segment of the log. One situation we want to
10150576b1c6SShaohua Li  * avoid: a broken meta in the middle of the log means recovery cannot find
10160576b1c6SShaohua Li  * the meta at the head of the log. If an operation requires the meta at the
10170576b1c6SShaohua Li  * head to be persistent in the log, the meta before it must be too. A case is:
10180576b1c6SShaohua Li  *
10190576b1c6SShaohua Li  * stripe data/parity is in the log and we start writing the stripe to the
10200576b1c6SShaohua Li  * raid disks; the data/parity must be persistent in the log before that write.
10210576b1c6SShaohua Li  *
10220576b1c6SShaohua Li  * The solution is to strictly maintain io_unit list order. In this case, we
10230576b1c6SShaohua Li  * only write stripes of an io_unit to the raid disks once the io_unit is the
10240576b1c6SShaohua Li  * first one whose data/parity is in the log.
10250576b1c6SShaohua Li  */
10260576b1c6SShaohua Li void r5l_flush_stripe_to_raid(struct r5l_log *log)
10270576b1c6SShaohua Li {
1028a8c34f91SShaohua Li 	bool do_flush;
102956fef7c6SChristoph Hellwig 
103056fef7c6SChristoph Hellwig 	if (!log || !log->need_cache_flush)
10310576b1c6SShaohua Li 		return;
10320576b1c6SShaohua Li 
1033a8c34f91SShaohua Li 	spin_lock_irq(&log->io_list_lock);
1034a8c34f91SShaohua Li 	/* flush bio is running */
1035a8c34f91SShaohua Li 	if (!list_empty(&log->flushing_ios)) {
1036a8c34f91SShaohua Li 		spin_unlock_irq(&log->io_list_lock);
10370576b1c6SShaohua Li 		return;
10380576b1c6SShaohua Li 	}
1039a8c34f91SShaohua Li 	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1040a8c34f91SShaohua Li 	do_flush = !list_empty(&log->flushing_ios);
10410576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
1042a8c34f91SShaohua Li 
1043a8c34f91SShaohua Li 	if (!do_flush)
1044a8c34f91SShaohua Li 		return;
1045a8c34f91SShaohua Li 	bio_reset(&log->flush_bio);
1046a8c34f91SShaohua Li 	log->flush_bio.bi_bdev = log->rdev->bdev;
1047a8c34f91SShaohua Li 	log->flush_bio.bi_end_io = r5l_log_flush_endio;
1048796a5cf0SMike Christie 	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
10494e49ea4aSMike Christie 	submit_bio(&log->flush_bio);
10500576b1c6SShaohua Li }
10510576b1c6SShaohua Li 
10520576b1c6SShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp);
10534b482044SShaohua Li static void r5l_write_super_and_discard_space(struct r5l_log *log,
10544b482044SShaohua Li 	sector_t end)
10554b482044SShaohua Li {
10564b482044SShaohua Li 	struct block_device *bdev = log->rdev->bdev;
10574b482044SShaohua Li 	struct mddev *mddev;
10584b482044SShaohua Li 
10594b482044SShaohua Li 	r5l_write_super(log, end);
10604b482044SShaohua Li 
10614b482044SShaohua Li 	if (!blk_queue_discard(bdev_get_queue(bdev)))
10624b482044SShaohua Li 		return;
10634b482044SShaohua Li 
10644b482044SShaohua Li 	mddev = log->rdev->mddev;
10654b482044SShaohua Li 	/*
10668e018c21SShaohua Li 	 * Discard could zero data, so before discarding we must make sure the
10678e018c21SShaohua Li 	 * superblock is updated to the new log tail. Updating the superblock
10688e018c21SShaohua Li 	 * (either by calling md_update_sb() directly or depending on the md
10698e018c21SShaohua Li 	 * thread) must hold the reconfig mutex. On the other hand, raid5_quiesce()
10708e018c21SShaohua Li 	 * is called with reconfig_mutex held. The first step of raid5_quiesce()
10718e018c21SShaohua Li 	 * is waiting for all IO to finish, hence waiting for the reclaim thread,
10728e018c21SShaohua Li 	 * while the reclaim thread is calling this function and waiting for the
10738e018c21SShaohua Li 	 * reconfig mutex. So there is a deadlock. We work around this with a
10748e018c21SShaohua Li 	 * trylock. FIXME: we could miss a discard if we can't take reconfig mutex
10754b482044SShaohua Li 	 */
107685ad1d13SGuoqing Jiang 	set_mask_bits(&mddev->flags, 0,
107785ad1d13SGuoqing Jiang 		BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
10788e018c21SShaohua Li 	if (!mddev_trylock(mddev))
10798e018c21SShaohua Li 		return;
10804b482044SShaohua Li 	md_update_sb(mddev, 1);
10818e018c21SShaohua Li 	mddev_unlock(mddev);
10824b482044SShaohua Li 
10836e74a9cfSShaohua Li 	/* discard IO error really doesn't matter, ignore it */
10844b482044SShaohua Li 	if (log->last_checkpoint < end) {
10854b482044SShaohua Li 		blkdev_issue_discard(bdev,
10864b482044SShaohua Li 				log->last_checkpoint + log->rdev->data_offset,
10874b482044SShaohua Li 				end - log->last_checkpoint, GFP_NOIO, 0);
10884b482044SShaohua Li 	} else {
10894b482044SShaohua Li 		blkdev_issue_discard(bdev,
10904b482044SShaohua Li 				log->last_checkpoint + log->rdev->data_offset,
10914b482044SShaohua Li 				log->device_size - log->last_checkpoint,
10924b482044SShaohua Li 				GFP_NOIO, 0);
10934b482044SShaohua Li 		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
10944b482044SShaohua Li 				GFP_NOIO, 0);
10954b482044SShaohua Li 	}
10964b482044SShaohua Li }
10974b482044SShaohua Li 
1098*a39f7afdSSong Liu /*
1099*a39f7afdSSong Liu  * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1100*a39f7afdSSong Liu  * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1101*a39f7afdSSong Liu  *
1102*a39f7afdSSong Liu  * The caller must hold conf->device_lock.
1103*a39f7afdSSong Liu  */
1104*a39f7afdSSong Liu static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1105*a39f7afdSSong Liu {
1106*a39f7afdSSong Liu 	BUG_ON(list_empty(&sh->lru));
1107*a39f7afdSSong Liu 	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1108*a39f7afdSSong Liu 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1109*a39f7afdSSong Liu 
1110*a39f7afdSSong Liu 	/*
1111*a39f7afdSSong Liu 	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
1112*a39f7afdSSong Liu 	 * raid5_release_stripe() while holding conf->device_lock
1113*a39f7afdSSong Liu 	 */
1114*a39f7afdSSong Liu 	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1115*a39f7afdSSong Liu 	assert_spin_locked(&conf->device_lock);
1116*a39f7afdSSong Liu 
1117*a39f7afdSSong Liu 	list_del_init(&sh->lru);
1118*a39f7afdSSong Liu 	atomic_inc(&sh->count);
1119*a39f7afdSSong Liu 
1120*a39f7afdSSong Liu 	set_bit(STRIPE_HANDLE, &sh->state);
1121*a39f7afdSSong Liu 	atomic_inc(&conf->active_stripes);
1122*a39f7afdSSong Liu 	r5c_make_stripe_write_out(sh);
1123*a39f7afdSSong Liu 
1124*a39f7afdSSong Liu 	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1125*a39f7afdSSong Liu 		atomic_inc(&conf->preread_active_stripes);
1126*a39f7afdSSong Liu 	raid5_release_stripe(sh);
1127*a39f7afdSSong Liu }
1128*a39f7afdSSong Liu 
1129*a39f7afdSSong Liu /*
1130*a39f7afdSSong Liu  * if num == 0, flush all full stripes
1131*a39f7afdSSong Liu  * if num > 0, flush all full stripes. If fewer than num full stripes are
1132*a39f7afdSSong Liu  *             flushed, flush some partial stripes until num stripes are
1133*a39f7afdSSong Liu  *             flushed in total or there are no more cached stripes.
1134*a39f7afdSSong Liu  */
1135*a39f7afdSSong Liu void r5c_flush_cache(struct r5conf *conf, int num)
1136*a39f7afdSSong Liu {
1137*a39f7afdSSong Liu 	int count;
1138*a39f7afdSSong Liu 	struct stripe_head *sh, *next;
1139*a39f7afdSSong Liu 
1140*a39f7afdSSong Liu 	assert_spin_locked(&conf->device_lock);
1141*a39f7afdSSong Liu 	if (!conf->log)
1142*a39f7afdSSong Liu 		return;
1143*a39f7afdSSong Liu 
1144*a39f7afdSSong Liu 	count = 0;
1145*a39f7afdSSong Liu 	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1146*a39f7afdSSong Liu 		r5c_flush_stripe(conf, sh);
1147*a39f7afdSSong Liu 		count++;
1148*a39f7afdSSong Liu 	}
1149*a39f7afdSSong Liu 
1150*a39f7afdSSong Liu 	if (count >= num)
1151*a39f7afdSSong Liu 		return;
1152*a39f7afdSSong Liu 	list_for_each_entry_safe(sh, next,
1153*a39f7afdSSong Liu 				 &conf->r5c_partial_stripe_list, lru) {
1154*a39f7afdSSong Liu 		r5c_flush_stripe(conf, sh);
1155*a39f7afdSSong Liu 		if (++count >= num)
1156*a39f7afdSSong Liu 			break;
1157*a39f7afdSSong Liu 	}
1158*a39f7afdSSong Liu }
1159*a39f7afdSSong Liu 
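/*
 * Write-back cache reclaim: pick how many cached stripes to flush based on
 * stripe cache pressure (a full reclaim group, all full stripes, or none),
 * flush them, and, when log space is tight, also flush eligible stripes from
 * stripe_in_journal_list so the log tail can advance.
 */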
1160*a39f7afdSSong Liu static void r5c_do_reclaim(struct r5conf *conf)
1161*a39f7afdSSong Liu {
1162*a39f7afdSSong Liu 	struct r5l_log *log = conf->log;
1163*a39f7afdSSong Liu 	struct stripe_head *sh;
1164*a39f7afdSSong Liu 	int count = 0;
1165*a39f7afdSSong Liu 	unsigned long flags;
1166*a39f7afdSSong Liu 	int total_cached;
1167*a39f7afdSSong Liu 	int stripes_to_flush;
1168*a39f7afdSSong Liu 
1169*a39f7afdSSong Liu 	if (!r5c_is_writeback(log))
1170*a39f7afdSSong Liu 		return;
1171*a39f7afdSSong Liu 
1172*a39f7afdSSong Liu 	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1173*a39f7afdSSong Liu 		atomic_read(&conf->r5c_cached_full_stripes);
1174*a39f7afdSSong Liu 
1175*a39f7afdSSong Liu 	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1176*a39f7afdSSong Liu 	    atomic_read(&conf->empty_inactive_list_nr) > 0)
1177*a39f7afdSSong Liu 		/*
1178*a39f7afdSSong Liu 		 * if stripe cache pressure is high, flush all full stripes and
1179*a39f7afdSSong Liu 		 * some partial stripes
1180*a39f7afdSSong Liu 		 */
1181*a39f7afdSSong Liu 		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1182*a39f7afdSSong Liu 	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1183*a39f7afdSSong Liu 		 atomic_read(&conf->r5c_cached_full_stripes) >
1184*a39f7afdSSong Liu 		 R5C_FULL_STRIPE_FLUSH_BATCH)
1185*a39f7afdSSong Liu 		/*
1186*a39f7afdSSong Liu 		 * if stripe cache pressure is moderate, or if there are many
1187*a39f7afdSSong Liu 		 * full stripes, flush all full stripes
1188*a39f7afdSSong Liu 		 */
1189*a39f7afdSSong Liu 		stripes_to_flush = 0;
1190*a39f7afdSSong Liu 	else
1191*a39f7afdSSong Liu 		/* no need to flush */
1192*a39f7afdSSong Liu 		stripes_to_flush = -1;
1193*a39f7afdSSong Liu 
1194*a39f7afdSSong Liu 	if (stripes_to_flush >= 0) {
1195*a39f7afdSSong Liu 		spin_lock_irqsave(&conf->device_lock, flags);
1196*a39f7afdSSong Liu 		r5c_flush_cache(conf, stripes_to_flush);
1197*a39f7afdSSong Liu 		spin_unlock_irqrestore(&conf->device_lock, flags);
1198*a39f7afdSSong Liu 	}
1199*a39f7afdSSong Liu 
1200*a39f7afdSSong Liu 	/* if log space is tight, flush stripes on stripe_in_journal_list */
1201*a39f7afdSSong Liu 	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1202*a39f7afdSSong Liu 		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1203*a39f7afdSSong Liu 		spin_lock(&conf->device_lock);
1204*a39f7afdSSong Liu 		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1205*a39f7afdSSong Liu 			/*
1206*a39f7afdSSong Liu 			 * stripes on stripe_in_journal_list could be in any
1207*a39f7afdSSong Liu 			 * state of the stripe_cache state machine. In this
1208*a39f7afdSSong Liu 			 * case, we only want to flush stripes on
1209*a39f7afdSSong Liu 			 * r5c_cached_full/partial_stripes. The following
1210*a39f7afdSSong Liu 			 * condition makes sure the stripe is on one of the
1211*a39f7afdSSong Liu 			 * two lists.
1212*a39f7afdSSong Liu 			 */
1213*a39f7afdSSong Liu 			if (!list_empty(&sh->lru) &&
1214*a39f7afdSSong Liu 			    !test_bit(STRIPE_HANDLE, &sh->state) &&
1215*a39f7afdSSong Liu 			    atomic_read(&sh->count) == 0) {
1216*a39f7afdSSong Liu 				r5c_flush_stripe(conf, sh);
1217*a39f7afdSSong Liu 			}
1218*a39f7afdSSong Liu 			if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1219*a39f7afdSSong Liu 				break;
1220*a39f7afdSSong Liu 		}
1221*a39f7afdSSong Liu 		spin_unlock(&conf->device_lock);
1222*a39f7afdSSong Liu 		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1223*a39f7afdSSong Liu 	}
1224*a39f7afdSSong Liu 	md_wakeup_thread(conf->mddev->thread);
1225*a39f7afdSSong Liu }
1226*a39f7afdSSong Liu 
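/*
 * Journal space reclaim: wait until enough io_units have completed for
 * r5l_reclaimable_space() to reach the reclaim target (or until all io_unit
 * lists are empty). If space was freed and a superblock update is warranted,
 * write the superblock, discard the freed range, advance log->last_checkpoint
 * and restart stripes that were waiting for log space.
 */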
12270576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log)
12280576b1c6SShaohua Li {
1229*a39f7afdSSong Liu 	struct r5conf *conf = log->rdev->mddev->private;
12300576b1c6SShaohua Li 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
123117036461SChristoph Hellwig 	sector_t reclaimable;
123217036461SChristoph Hellwig 	sector_t next_checkpoint;
1233*a39f7afdSSong Liu 	bool write_super;
12340576b1c6SShaohua Li 
12350576b1c6SShaohua Li 	spin_lock_irq(&log->io_list_lock);
1236*a39f7afdSSong Liu 	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1237*a39f7afdSSong Liu 		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
12380576b1c6SShaohua Li 	/*
12390576b1c6SShaohua Li 	 * Move the proper io_units to the reclaim list. We should not change
12400576b1c6SShaohua Li 	 * the order: reclaimable and unreclaimable io_units can be mixed in
12410576b1c6SShaohua Li 	 * the list, and we shouldn't reuse the space of an unreclaimable io_unit.
12420576b1c6SShaohua Li 	 */
12430576b1c6SShaohua Li 	while (1) {
124417036461SChristoph Hellwig 		reclaimable = r5l_reclaimable_space(log);
124517036461SChristoph Hellwig 		if (reclaimable >= reclaim_target ||
12460576b1c6SShaohua Li 		    (list_empty(&log->running_ios) &&
12470576b1c6SShaohua Li 		     list_empty(&log->io_end_ios) &&
1248a8c34f91SShaohua Li 		     list_empty(&log->flushing_ios) &&
124904732f74SChristoph Hellwig 		     list_empty(&log->finished_ios)))
12500576b1c6SShaohua Li 			break;
12510576b1c6SShaohua Li 
125217036461SChristoph Hellwig 		md_wakeup_thread(log->rdev->mddev->thread);
125317036461SChristoph Hellwig 		wait_event_lock_irq(log->iounit_wait,
125417036461SChristoph Hellwig 				    r5l_reclaimable_space(log) > reclaimable,
125517036461SChristoph Hellwig 				    log->io_list_lock);
12560576b1c6SShaohua Li 	}
125717036461SChristoph Hellwig 
1258*a39f7afdSSong Liu 	next_checkpoint = r5c_calculate_new_cp(conf);
12590576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
12600576b1c6SShaohua Li 
126117036461SChristoph Hellwig 	BUG_ON(reclaimable < 0);
1262*a39f7afdSSong Liu 
1263*a39f7afdSSong Liu 	if (reclaimable == 0 || !write_super)
12640576b1c6SShaohua Li 		return;
12650576b1c6SShaohua Li 
12660576b1c6SShaohua Li 	/*
12670576b1c6SShaohua Li 	 * write_super will flush cache of each raid disk. We must write super
12680576b1c6SShaohua Li 	 * here, because the log area might be reused soon and we don't want to
12690576b1c6SShaohua Li 	 * confuse recovery
12700576b1c6SShaohua Li 	 */
12714b482044SShaohua Li 	r5l_write_super_and_discard_space(log, next_checkpoint);
12720576b1c6SShaohua Li 
12730576b1c6SShaohua Li 	mutex_lock(&log->io_mutex);
127417036461SChristoph Hellwig 	log->last_checkpoint = next_checkpoint;
1275*a39f7afdSSong Liu 	r5c_update_log_state(log);
12760576b1c6SShaohua Li 	mutex_unlock(&log->io_mutex);
12770576b1c6SShaohua Li 
127817036461SChristoph Hellwig 	r5l_run_no_space_stripes(log);
12790576b1c6SShaohua Li }
12800576b1c6SShaohua Li 
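/*
 * Reclaim thread body: flush cached stripes out of the write-back cache
 * first (r5c_do_reclaim), then reclaim journal space (r5l_do_reclaim).
 */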
12810576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread)
12820576b1c6SShaohua Li {
12830576b1c6SShaohua Li 	struct mddev *mddev = thread->mddev;
12840576b1c6SShaohua Li 	struct r5conf *conf = mddev->private;
12850576b1c6SShaohua Li 	struct r5l_log *log = conf->log;
12860576b1c6SShaohua Li 
12870576b1c6SShaohua Li 	if (!log)
12880576b1c6SShaohua Li 		return;
1289*a39f7afdSSong Liu 	r5c_do_reclaim(conf);
12900576b1c6SShaohua Li 	r5l_do_reclaim(log);
12910576b1c6SShaohua Li }
12920576b1c6SShaohua Li 
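/*
 * Raise the reclaim target to at least 'space' sectors (the cmpxchg loop
 * only ever moves the target upwards) and wake the reclaim thread. For
 * example, r5l_quiesce() below calls r5l_wake_reclaim(log, MaxSector) to
 * force a full reclaim before suspending.
 */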
1293*a39f7afdSSong Liu void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1294f6bed0efSShaohua Li {
12950576b1c6SShaohua Li 	unsigned long target;
12960576b1c6SShaohua Li 	unsigned long new = (unsigned long)space; /* overflow in theory */
12970576b1c6SShaohua Li 
1298*a39f7afdSSong Liu 	if (!log)
1299*a39f7afdSSong Liu 		return;
13000576b1c6SShaohua Li 	do {
13010576b1c6SShaohua Li 		target = log->reclaim_target;
13020576b1c6SShaohua Li 		if (new < target)
13030576b1c6SShaohua Li 			return;
13040576b1c6SShaohua Li 	} while (cmpxchg(&log->reclaim_target, target, new) != target);
13050576b1c6SShaohua Li 	md_wakeup_thread(log->reclaim_thread);
1306f6bed0efSShaohua Li }
1307f6bed0efSShaohua Li 
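/*
 * Quiesce the log. state == 0 (resume): register the reclaim thread if it is
 * not already running (hotadd case). state == 1 (suspend): wake
 * mddev->sb_wait, stop the reclaim thread and run one final reclaim pass over
 * the whole log. Other states are ignored here.
 */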
1308e6c033f7SShaohua Li void r5l_quiesce(struct r5l_log *log, int state)
1309e6c033f7SShaohua Li {
13104b482044SShaohua Li 	struct mddev *mddev;
1311e6c033f7SShaohua Li 	if (!log || state == 2)
1312e6c033f7SShaohua Li 		return;
1313e6c033f7SShaohua Li 	if (state == 0) {
131416a43f6aSShaohua Li 		/*
131516a43f6aSShaohua Li 		 * This is a special case for hotadd. In suspend, the array has
131616a43f6aSShaohua Li 		 * no journal. In resume, journal is initialized as well as the
131716a43f6aSShaohua Li 		 * reclaim thread.
131816a43f6aSShaohua Li 		 */
131916a43f6aSShaohua Li 		if (log->reclaim_thread)
132016a43f6aSShaohua Li 			return;
1321e6c033f7SShaohua Li 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1322e6c033f7SShaohua Li 					log->rdev->mddev, "reclaim");
1323*a39f7afdSSong Liu 		log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1324e6c033f7SShaohua Li 	} else if (state == 1) {
13254b482044SShaohua Li 		/* make sure r5l_write_super_and_discard_space exits */
13264b482044SShaohua Li 		mddev = log->rdev->mddev;
13274b482044SShaohua Li 		wake_up(&mddev->sb_wait);
1328*a39f7afdSSong Liu 		r5l_wake_reclaim(log, MaxSector);
1329e6c033f7SShaohua Li 		md_unregister_thread(&log->reclaim_thread);
1330e6c033f7SShaohua Li 		r5l_do_reclaim(log);
1331e6c033f7SShaohua Li 	}
1332e6c033f7SShaohua Li }
1333e6c033f7SShaohua Li 
13346e74a9cfSShaohua Li bool r5l_log_disk_error(struct r5conf *conf)
13356e74a9cfSShaohua Li {
1336f6b6ec5cSShaohua Li 	struct r5l_log *log;
1337f6b6ec5cSShaohua Li 	bool ret;
13387dde2ad3SShaohua Li 	/* don't allow write if journal disk is missing */
1339f6b6ec5cSShaohua Li 	rcu_read_lock();
1340f6b6ec5cSShaohua Li 	log = rcu_dereference(conf->log);
1341f6b6ec5cSShaohua Li 
1342f6b6ec5cSShaohua Li 	if (!log)
1343f6b6ec5cSShaohua Li 		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1344f6b6ec5cSShaohua Li 	else
1345f6b6ec5cSShaohua Li 		ret = test_bit(Faulty, &log->rdev->flags);
1346f6b6ec5cSShaohua Li 	rcu_read_unlock();
1347f6b6ec5cSShaohua Li 	return ret;
13486e74a9cfSShaohua Li }
13496e74a9cfSShaohua Li 
1350355810d1SShaohua Li struct r5l_recovery_ctx {
1351355810d1SShaohua Li 	struct page *meta_page;		/* current meta */
1352355810d1SShaohua Li 	sector_t meta_total_blocks;	/* total size of current meta and data */
1353355810d1SShaohua Li 	sector_t pos;			/* recovery position */
1354355810d1SShaohua Li 	u64 seq;			/* recovery position seq */
1355355810d1SShaohua Li };
1356355810d1SShaohua Li 
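/*
 * Read the meta block at ctx->pos and validate its magic, version, sequence
 * number, position and checksum. Returns 0 if it is a valid continuation of
 * the log, -EINVAL if not, or -EIO on a read error.
 */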
1357355810d1SShaohua Li static int r5l_read_meta_block(struct r5l_log *log,
1358355810d1SShaohua Li 			       struct r5l_recovery_ctx *ctx)
1359355810d1SShaohua Li {
1360355810d1SShaohua Li 	struct page *page = ctx->meta_page;
1361355810d1SShaohua Li 	struct r5l_meta_block *mb;
1362355810d1SShaohua Li 	u32 crc, stored_crc;
1363355810d1SShaohua Li 
1364796a5cf0SMike Christie 	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1365796a5cf0SMike Christie 			  false))
1366355810d1SShaohua Li 		return -EIO;
1367355810d1SShaohua Li 
1368355810d1SShaohua Li 	mb = page_address(page);
1369355810d1SShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
1370355810d1SShaohua Li 	mb->checksum = 0;
1371355810d1SShaohua Li 
1372355810d1SShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1373355810d1SShaohua Li 	    le64_to_cpu(mb->seq) != ctx->seq ||
1374355810d1SShaohua Li 	    mb->version != R5LOG_VERSION ||
1375355810d1SShaohua Li 	    le64_to_cpu(mb->position) != ctx->pos)
1376355810d1SShaohua Li 		return -EINVAL;
1377355810d1SShaohua Li 
13785cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1379355810d1SShaohua Li 	if (stored_crc != crc)
1380355810d1SShaohua Li 		return -EINVAL;
1381355810d1SShaohua Li 
1382355810d1SShaohua Li 	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1383355810d1SShaohua Li 		return -EINVAL;
1384355810d1SShaohua Li 
1385355810d1SShaohua Li 	ctx->meta_total_blocks = BLOCK_SECTORS;
1386355810d1SShaohua Li 
1387355810d1SShaohua Li 	return 0;
1388355810d1SShaohua Li }
1389355810d1SShaohua Li 
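/*
 * Replay one stripe during recovery: read the data/parity payloads that
 * follow the current meta block into the stripe's pages, verify each page
 * against the checksum recorded in the log, and write the pages to the raid
 * disks (and replacements, if present). Returns -EINVAL and drops the stripe
 * on a checksum mismatch.
 */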
1390355810d1SShaohua Li static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
1391355810d1SShaohua Li 					 struct r5l_recovery_ctx *ctx,
1392355810d1SShaohua Li 					 sector_t stripe_sect,
13933fd880afSJackieLiu 					 int *offset)
1394355810d1SShaohua Li {
1395355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
1396355810d1SShaohua Li 	struct stripe_head *sh;
1397355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
1398355810d1SShaohua Li 	int disk_index;
1399355810d1SShaohua Li 
1400355810d1SShaohua Li 	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
1401355810d1SShaohua Li 	while (1) {
14023fd880afSJackieLiu 		sector_t log_offset = r5l_ring_add(log, ctx->pos,
14033fd880afSJackieLiu 				ctx->meta_total_blocks);
1404355810d1SShaohua Li 		payload = page_address(ctx->meta_page) + *offset;
1405355810d1SShaohua Li 
1406355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1407355810d1SShaohua Li 			raid5_compute_sector(conf,
1408355810d1SShaohua Li 					     le64_to_cpu(payload->location), 0,
1409355810d1SShaohua Li 					     &disk_index, sh);
1410355810d1SShaohua Li 
14113fd880afSJackieLiu 			sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1412796a5cf0SMike Christie 				     sh->dev[disk_index].page, REQ_OP_READ, 0,
1413796a5cf0SMike Christie 				     false);
1414355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
1415355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
1416355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1417355810d1SShaohua Li 		} else {
1418355810d1SShaohua Li 			disk_index = sh->pd_idx;
14193fd880afSJackieLiu 			sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1420796a5cf0SMike Christie 				     sh->dev[disk_index].page, REQ_OP_READ, 0,
1421796a5cf0SMike Christie 				     false);
1422355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
1423355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
1424355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1425355810d1SShaohua Li 
1426355810d1SShaohua Li 			if (sh->qd_idx >= 0) {
1427355810d1SShaohua Li 				disk_index = sh->qd_idx;
1428355810d1SShaohua Li 				sync_page_io(log->rdev,
14293fd880afSJackieLiu 					     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1430355810d1SShaohua Li 					     PAGE_SIZE, sh->dev[disk_index].page,
1431796a5cf0SMike Christie 					     REQ_OP_READ, 0, false);
1432355810d1SShaohua Li 				sh->dev[disk_index].log_checksum =
1433355810d1SShaohua Li 					le32_to_cpu(payload->checksum[1]);
1434355810d1SShaohua Li 				set_bit(R5_Wantwrite,
1435355810d1SShaohua Li 					&sh->dev[disk_index].flags);
1436355810d1SShaohua Li 			}
1437355810d1SShaohua Li 		}
1438355810d1SShaohua Li 
14393fd880afSJackieLiu 		ctx->meta_total_blocks += le32_to_cpu(payload->size);
1440355810d1SShaohua Li 		*offset += sizeof(struct r5l_payload_data_parity) +
1441355810d1SShaohua Li 			sizeof(__le32) *
1442355810d1SShaohua Li 			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1443355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
1444355810d1SShaohua Li 			break;
1445355810d1SShaohua Li 	}
1446355810d1SShaohua Li 
1447355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1448355810d1SShaohua Li 		void *addr;
1449355810d1SShaohua Li 		u32 checksum;
1450355810d1SShaohua Li 
1451355810d1SShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1452355810d1SShaohua Li 			continue;
1453355810d1SShaohua Li 		addr = kmap_atomic(sh->dev[disk_index].page);
14545cb2fbd6SShaohua Li 		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1455355810d1SShaohua Li 		kunmap_atomic(addr);
1456355810d1SShaohua Li 		if (checksum != sh->dev[disk_index].log_checksum)
1457355810d1SShaohua Li 			goto error;
1458355810d1SShaohua Li 	}
1459355810d1SShaohua Li 
1460355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1461355810d1SShaohua Li 		struct md_rdev *rdev, *rrdev;
1462355810d1SShaohua Li 
1463355810d1SShaohua Li 		if (!test_and_clear_bit(R5_Wantwrite,
1464355810d1SShaohua Li 					&sh->dev[disk_index].flags))
1465355810d1SShaohua Li 			continue;
1466355810d1SShaohua Li 
1467355810d1SShaohua Li 		/* in case device is broken */
1468354b445bSShaohua Li 		rcu_read_lock();
1469355810d1SShaohua Li 		rdev = rcu_dereference(conf->disks[disk_index].rdev);
1470354b445bSShaohua Li 		if (rdev) {
1471354b445bSShaohua Li 			atomic_inc(&rdev->nr_pending);
1472354b445bSShaohua Li 			rcu_read_unlock();
1473355810d1SShaohua Li 			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
1474796a5cf0SMike Christie 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1475796a5cf0SMike Christie 				     false);
1476354b445bSShaohua Li 			rdev_dec_pending(rdev, rdev->mddev);
1477354b445bSShaohua Li 			rcu_read_lock();
1478354b445bSShaohua Li 		}
1479355810d1SShaohua Li 		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1480354b445bSShaohua Li 		if (rrdev) {
1481354b445bSShaohua Li 			atomic_inc(&rrdev->nr_pending);
1482354b445bSShaohua Li 			rcu_read_unlock();
1483355810d1SShaohua Li 			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
1484796a5cf0SMike Christie 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1485796a5cf0SMike Christie 				     false);
1486354b445bSShaohua Li 			rdev_dec_pending(rrdev, rrdev->mddev);
1487354b445bSShaohua Li 			rcu_read_lock();
1488354b445bSShaohua Li 		}
1489354b445bSShaohua Li 		rcu_read_unlock();
1490355810d1SShaohua Li 	}
1491355810d1SShaohua Li 	raid5_release_stripe(sh);
1492355810d1SShaohua Li 	return 0;
1493355810d1SShaohua Li 
1494355810d1SShaohua Li error:
1495355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++)
1496355810d1SShaohua Li 		sh->dev[disk_index].flags = 0;
1497355810d1SShaohua Li 	raid5_release_stripe(sh);
1498355810d1SShaohua Li 	return -EINVAL;
1499355810d1SShaohua Li }
1500355810d1SShaohua Li 
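/*
 * Walk all payloads of the current meta block and replay each referenced
 * stripe via r5l_recovery_flush_one_stripe().
 */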
1501355810d1SShaohua Li static int r5l_recovery_flush_one_meta(struct r5l_log *log,
1502355810d1SShaohua Li 				       struct r5l_recovery_ctx *ctx)
1503355810d1SShaohua Li {
1504355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
1505355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
1506355810d1SShaohua Li 	struct r5l_meta_block *mb;
1507355810d1SShaohua Li 	int offset;
1508355810d1SShaohua Li 	sector_t stripe_sector;
1509355810d1SShaohua Li 
1510355810d1SShaohua Li 	mb = page_address(ctx->meta_page);
1511355810d1SShaohua Li 	offset = sizeof(struct r5l_meta_block);
1512355810d1SShaohua Li 
1513355810d1SShaohua Li 	while (offset < le32_to_cpu(mb->meta_size)) {
1514355810d1SShaohua Li 		int dd;
1515355810d1SShaohua Li 
1516355810d1SShaohua Li 		payload = (void *)mb + offset;
1517355810d1SShaohua Li 		stripe_sector = raid5_compute_sector(conf,
1518355810d1SShaohua Li 						     le64_to_cpu(payload->location), 0, &dd, NULL);
1519355810d1SShaohua Li 		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
15203fd880afSJackieLiu 						  &offset))
1521355810d1SShaohua Li 			return -EINVAL;
1522355810d1SShaohua Li 	}
1523355810d1SShaohua Li 	return 0;
1524355810d1SShaohua Li }
1525355810d1SShaohua Li 
1526355810d1SShaohua Li /* copy data/parity from log to raid disks */
1527355810d1SShaohua Li static void r5l_recovery_flush_log(struct r5l_log *log,
1528355810d1SShaohua Li 				   struct r5l_recovery_ctx *ctx)
1529355810d1SShaohua Li {
1530355810d1SShaohua Li 	while (1) {
1531355810d1SShaohua Li 		if (r5l_read_meta_block(log, ctx))
1532355810d1SShaohua Li 			return;
1533355810d1SShaohua Li 		if (r5l_recovery_flush_one_meta(log, ctx))
1534355810d1SShaohua Li 			return;
1535355810d1SShaohua Li 		ctx->seq++;
1536355810d1SShaohua Li 		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1537355810d1SShaohua Li 	}
1538355810d1SShaohua Li }
1539355810d1SShaohua Li 
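/*
 * Write a valid meta block that carries no payloads at 'pos' with sequence
 * number 'seq'. Used by r5l_recovery_log() and r5l_load_log() to
 * (re)initialize the head of the log.
 */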
1540355810d1SShaohua Li static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1541355810d1SShaohua Li 					  u64 seq)
1542355810d1SShaohua Li {
1543355810d1SShaohua Li 	struct page *page;
1544355810d1SShaohua Li 	struct r5l_meta_block *mb;
1545355810d1SShaohua Li 	u32 crc;
1546355810d1SShaohua Li 
1547355810d1SShaohua Li 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1548355810d1SShaohua Li 	if (!page)
1549355810d1SShaohua Li 		return -ENOMEM;
1550355810d1SShaohua Li 	mb = page_address(page);
1551355810d1SShaohua Li 	mb->magic = cpu_to_le32(R5LOG_MAGIC);
1552355810d1SShaohua Li 	mb->version = R5LOG_VERSION;
1553355810d1SShaohua Li 	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1554355810d1SShaohua Li 	mb->seq = cpu_to_le64(seq);
1555355810d1SShaohua Li 	mb->position = cpu_to_le64(pos);
15565cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1557355810d1SShaohua Li 	mb->checksum = cpu_to_le32(crc);
1558355810d1SShaohua Li 
1559796a5cf0SMike Christie 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1560796a5cf0SMike Christie 			  WRITE_FUA, false)) {
1561355810d1SShaohua Li 		__free_page(page);
1562355810d1SShaohua Li 		return -EIO;
1563355810d1SShaohua Li 	}
1564355810d1SShaohua Li 	__free_page(page);
1565355810d1SShaohua Li 	return 0;
1566355810d1SShaohua Li }
1567355810d1SShaohua Li 
1568f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log)
1569f6bed0efSShaohua Li {
1570355810d1SShaohua Li 	struct r5l_recovery_ctx ctx;
1571355810d1SShaohua Li 
1572355810d1SShaohua Li 	ctx.pos = log->last_checkpoint;
1573355810d1SShaohua Li 	ctx.seq = log->last_cp_seq;
1574355810d1SShaohua Li 	ctx.meta_page = alloc_page(GFP_KERNEL);
1575355810d1SShaohua Li 	if (!ctx.meta_page)
1576355810d1SShaohua Li 		return -ENOMEM;
1577355810d1SShaohua Li 
1578355810d1SShaohua Li 	r5l_recovery_flush_log(log, &ctx);
1579355810d1SShaohua Li 	__free_page(ctx.meta_page);
1580355810d1SShaohua Li 
1581355810d1SShaohua Li 	/*
1582355810d1SShaohua Li 	 * We did a recovery. Now ctx.pos points to an invalid meta block. The
1583355810d1SShaohua Li 	 * new log will start here, but we can't let the superblock point to
1584355810d1SShaohua Li 	 * the last valid meta block. The log might look like:
1585355810d1SShaohua Li 	 * | meta 1| meta 2| meta 3|
1586355810d1SShaohua Li 	 * meta 1 is valid, meta 2 is invalid, and meta 3 could still be valid.
1587355810d1SShaohua Li 	 * If the superblock pointed to meta 1 and we wrote a new valid meta 2n,
1588355810d1SShaohua Li 	 * then after another crash the new recovery would start from meta 1.
1589355810d1SShaohua Li 	 * Since meta 2n is valid now, recovery would consider meta 3 valid as
1590355810d1SShaohua Li 	 * well, which is wrong. The solution is to create a new meta at meta
1591355810d1SShaohua Li 	 * 2's position with seq == meta 1's seq + 10 and let the superblock
1592355810d1SShaohua Li 	 * point to it. Recovery will then reject meta 3, since its seq doesn't match.
1593355810d1SShaohua Li 	 */
15949a8b27faSShaohua Li 	if (ctx.seq > log->last_cp_seq) {
1595355810d1SShaohua Li 		int ret;
1596355810d1SShaohua Li 
1597355810d1SShaohua Li 		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1598355810d1SShaohua Li 		if (ret)
1599355810d1SShaohua Li 			return ret;
1600355810d1SShaohua Li 		log->seq = ctx.seq + 11;
1601355810d1SShaohua Li 		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1602355810d1SShaohua Li 		r5l_write_super(log, ctx.pos);
160328cd88e2SZhengyuan Liu 		log->last_checkpoint = ctx.pos;
160428cd88e2SZhengyuan Liu 		log->next_checkpoint = ctx.pos;
1605355810d1SShaohua Li 	} else {
1606355810d1SShaohua Li 		log->log_start = ctx.pos;
1607355810d1SShaohua Li 		log->seq = ctx.seq;
1608355810d1SShaohua Li 	}
1609f6bed0efSShaohua Li 	return 0;
1610f6bed0efSShaohua Li }
1611f6bed0efSShaohua Li 
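/*
 * Record the new log tail in the in-memory rdev and mark the superblock
 * dirty; the superblock itself is written out later (by the md thread, or
 * directly in r5l_write_super_and_discard_space()).
 */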
1612f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp)
1613f6bed0efSShaohua Li {
1614f6bed0efSShaohua Li 	struct mddev *mddev = log->rdev->mddev;
1615f6bed0efSShaohua Li 
1616f6bed0efSShaohua Li 	log->rdev->journal_tail = cp;
1617f6bed0efSShaohua Li 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1618f6bed0efSShaohua Li }
1619f6bed0efSShaohua Li 
16202ded3703SSong Liu /*
16212ded3703SSong Liu  * Try to handle the write operation in caching phase. This function should only
16222ded3703SSong Liu  * be called in write-back mode.
16232ded3703SSong Liu  *
16242ded3703SSong Liu  * If all outstanding writes can be handled in caching phase, returns 0
16252ded3703SSong Liu  * If any write requires the write-out phase, calls r5c_make_stripe_write_out()
16262ded3703SSong Liu  * and returns -EAGAIN
16272ded3703SSong Liu  */
16282ded3703SSong Liu int r5c_try_caching_write(struct r5conf *conf,
16292ded3703SSong Liu 			  struct stripe_head *sh,
16302ded3703SSong Liu 			  struct stripe_head_state *s,
16312ded3703SSong Liu 			  int disks)
16322ded3703SSong Liu {
16332ded3703SSong Liu 	struct r5l_log *log = conf->log;
16341e6d690bSSong Liu 	int i;
16351e6d690bSSong Liu 	struct r5dev *dev;
16361e6d690bSSong Liu 	int to_cache = 0;
16372ded3703SSong Liu 
16382ded3703SSong Liu 	BUG_ON(!r5c_is_writeback(log));
16392ded3703SSong Liu 
16401e6d690bSSong Liu 	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
16411e6d690bSSong Liu 		/*
16421e6d690bSSong Liu 		 * There are two different scenarios here:
16431e6d690bSSong Liu 		 *  1. The stripe has some data cached, and it is sent to
16441e6d690bSSong Liu 		 *     write-out phase for reclaim
16451e6d690bSSong Liu 		 *  2. The stripe is clean, and this is the first write
16461e6d690bSSong Liu 		 *
16471e6d690bSSong Liu 		 * For 1, return -EAGAIN, so we continue with
16481e6d690bSSong Liu 		 * handle_stripe_dirtying().
16491e6d690bSSong Liu 		 *
16501e6d690bSSong Liu 		 * For 2, set STRIPE_R5C_CACHING and continue with caching
16511e6d690bSSong Liu 		 * write.
16521e6d690bSSong Liu 		 */
16531e6d690bSSong Liu 
16541e6d690bSSong Liu 		/* case 1: anything in journal or anything in written */
16551e6d690bSSong Liu 		if (s->injournal > 0 || s->written > 0)
16561e6d690bSSong Liu 			return -EAGAIN;
16571e6d690bSSong Liu 		/* case 2 */
16581e6d690bSSong Liu 		set_bit(STRIPE_R5C_CACHING, &sh->state);
16591e6d690bSSong Liu 	}
16601e6d690bSSong Liu 
16611e6d690bSSong Liu 	for (i = disks; i--; ) {
16621e6d690bSSong Liu 		dev = &sh->dev[i];
16631e6d690bSSong Liu 		/* if non-overwrite, use writing-out phase */
16641e6d690bSSong Liu 		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
16651e6d690bSSong Liu 		    !test_bit(R5_InJournal, &dev->flags)) {
16662ded3703SSong Liu 			r5c_make_stripe_write_out(sh);
16672ded3703SSong Liu 			return -EAGAIN;
16682ded3703SSong Liu 		}
16691e6d690bSSong Liu 	}
16701e6d690bSSong Liu 
16711e6d690bSSong Liu 	for (i = disks; i--; ) {
16721e6d690bSSong Liu 		dev = &sh->dev[i];
16731e6d690bSSong Liu 		if (dev->towrite) {
16741e6d690bSSong Liu 			set_bit(R5_Wantwrite, &dev->flags);
16751e6d690bSSong Liu 			set_bit(R5_Wantdrain, &dev->flags);
16761e6d690bSSong Liu 			set_bit(R5_LOCKED, &dev->flags);
16771e6d690bSSong Liu 			to_cache++;
16781e6d690bSSong Liu 		}
16791e6d690bSSong Liu 	}
16801e6d690bSSong Liu 
16811e6d690bSSong Liu 	if (to_cache) {
16821e6d690bSSong Liu 		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
16831e6d690bSSong Liu 		/*
16841e6d690bSSong Liu 		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
16851e6d690bSSong Liu 		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
16861e6d690bSSong Liu 		 * r5c_handle_data_cached()
16871e6d690bSSong Liu 		 */
16881e6d690bSSong Liu 		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
16891e6d690bSSong Liu 	}
16901e6d690bSSong Liu 
16911e6d690bSSong Liu 	return 0;
16921e6d690bSSong Liu }
16931e6d690bSSong Liu 
16941e6d690bSSong Liu /*
16951e6d690bSSong Liu  * free extra pages (orig_page) we allocated for prexor
16961e6d690bSSong Liu  */
16971e6d690bSSong Liu void r5c_release_extra_page(struct stripe_head *sh)
16981e6d690bSSong Liu {
16991e6d690bSSong Liu 	int i;
17001e6d690bSSong Liu 
17011e6d690bSSong Liu 	for (i = sh->disks; i--; )
17021e6d690bSSong Liu 		if (sh->dev[i].page != sh->dev[i].orig_page) {
17031e6d690bSSong Liu 			struct page *p = sh->dev[i].orig_page;
17041e6d690bSSong Liu 
17051e6d690bSSong Liu 			sh->dev[i].orig_page = sh->dev[i].page;
17061e6d690bSSong Liu 			put_page(p);
17071e6d690bSSong Liu 		}
17081e6d690bSSong Liu }
17092ded3703SSong Liu 
17102ded3703SSong Liu /*
17112ded3703SSong Liu  * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
17122ded3703SSong Liu  * stripe is committed to RAID disks.
17132ded3703SSong Liu  */
17142ded3703SSong Liu void r5c_finish_stripe_write_out(struct r5conf *conf,
17152ded3703SSong Liu 				 struct stripe_head *sh,
17162ded3703SSong Liu 				 struct stripe_head_state *s)
17172ded3703SSong Liu {
17181e6d690bSSong Liu 	int i;
17191e6d690bSSong Liu 	int do_wakeup = 0;
17201e6d690bSSong Liu 
17212ded3703SSong Liu 	if (!conf->log ||
17222ded3703SSong Liu 	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
17232ded3703SSong Liu 		return;
17242ded3703SSong Liu 
17252ded3703SSong Liu 	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
17262ded3703SSong Liu 	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
17272ded3703SSong Liu 
17282ded3703SSong Liu 	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
17292ded3703SSong Liu 		return;
17301e6d690bSSong Liu 
17311e6d690bSSong Liu 	for (i = sh->disks; i--; ) {
17321e6d690bSSong Liu 		clear_bit(R5_InJournal, &sh->dev[i].flags);
17331e6d690bSSong Liu 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
17341e6d690bSSong Liu 			do_wakeup = 1;
17351e6d690bSSong Liu 	}
17361e6d690bSSong Liu 
17371e6d690bSSong Liu 	/*
17381e6d690bSSong Liu 	 * analyse_stripe() runs before r5c_finish_stripe_write_out().
17391e6d690bSSong Liu 	 * We updated R5_InJournal, so we also update s->injournal.
17401e6d690bSSong Liu 	 */
17411e6d690bSSong Liu 	s->injournal = 0;
17421e6d690bSSong Liu 
17431e6d690bSSong Liu 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
17441e6d690bSSong Liu 		if (atomic_dec_and_test(&conf->pending_full_writes))
17451e6d690bSSong Liu 			md_wakeup_thread(conf->mddev->thread);
17461e6d690bSSong Liu 
17471e6d690bSSong Liu 	if (do_wakeup)
17481e6d690bSSong Liu 		wake_up(&conf->wait_for_overlap);
1749*a39f7afdSSong Liu 
1750*a39f7afdSSong Liu 	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1751*a39f7afdSSong Liu 		return;
1752*a39f7afdSSong Liu 
1753*a39f7afdSSong Liu 	spin_lock_irq(&conf->log->stripe_in_journal_lock);
1754*a39f7afdSSong Liu 	list_del_init(&sh->r5c);
1755*a39f7afdSSong Liu 	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
1756*a39f7afdSSong Liu 	sh->log_start = MaxSector;
1757*a39f7afdSSong Liu 	atomic_dec(&conf->log->stripe_in_journal_count);
17581e6d690bSSong Liu }
17591e6d690bSSong Liu 
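/*
 * r5c_cache_data writes the dirty pages of a stripe to the journal while the
 * stripe stays in caching phase (write-back mode only). It checksums each
 * page, reserves log space and either submits the stripe via r5l_log_stripe()
 * or queues it on the no-space/no-mem lists until reclaim makes room.
 */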
17601e6d690bSSong Liu int
17611e6d690bSSong Liu r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
17621e6d690bSSong Liu 	       struct stripe_head_state *s)
17631e6d690bSSong Liu {
1764*a39f7afdSSong Liu 	struct r5conf *conf = sh->raid_conf;
17651e6d690bSSong Liu 	int pages = 0;
17661e6d690bSSong Liu 	int reserve;
17671e6d690bSSong Liu 	int i;
17681e6d690bSSong Liu 	int ret = 0;
17691e6d690bSSong Liu 
17701e6d690bSSong Liu 	BUG_ON(!log);
17711e6d690bSSong Liu 
17721e6d690bSSong Liu 	for (i = 0; i < sh->disks; i++) {
17731e6d690bSSong Liu 		void *addr;
17741e6d690bSSong Liu 
17751e6d690bSSong Liu 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
17761e6d690bSSong Liu 			continue;
17771e6d690bSSong Liu 		addr = kmap_atomic(sh->dev[i].page);
17781e6d690bSSong Liu 		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
17791e6d690bSSong Liu 						    addr, PAGE_SIZE);
17801e6d690bSSong Liu 		kunmap_atomic(addr);
17811e6d690bSSong Liu 		pages++;
17821e6d690bSSong Liu 	}
17831e6d690bSSong Liu 	WARN_ON(pages == 0);
17841e6d690bSSong Liu 
17851e6d690bSSong Liu 	/*
17861e6d690bSSong Liu 	 * The stripe must enter state machine again to call endio, so
17871e6d690bSSong Liu 	 * don't delay.
17881e6d690bSSong Liu 	 */
17891e6d690bSSong Liu 	clear_bit(STRIPE_DELAYED, &sh->state);
17901e6d690bSSong Liu 	atomic_inc(&sh->count);
17911e6d690bSSong Liu 
17921e6d690bSSong Liu 	mutex_lock(&log->io_mutex);
17931e6d690bSSong Liu 	/* meta + data */
17941e6d690bSSong Liu 	reserve = (1 + pages) << (PAGE_SHIFT - 9);
17951e6d690bSSong Liu 
1796*a39f7afdSSong Liu 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1797*a39f7afdSSong Liu 	    sh->log_start == MaxSector)
1798*a39f7afdSSong Liu 		r5l_add_no_space_stripe(log, sh);
1799*a39f7afdSSong Liu 	else if (!r5l_has_free_space(log, reserve)) {
1800*a39f7afdSSong Liu 		if (sh->log_start == log->last_checkpoint)
1801*a39f7afdSSong Liu 			BUG();
1802*a39f7afdSSong Liu 		else
1803*a39f7afdSSong Liu 			r5l_add_no_space_stripe(log, sh);
18041e6d690bSSong Liu 	} else {
18051e6d690bSSong Liu 		ret = r5l_log_stripe(log, sh, pages, 0);
18061e6d690bSSong Liu 		if (ret) {
18071e6d690bSSong Liu 			spin_lock_irq(&log->io_list_lock);
18081e6d690bSSong Liu 			list_add_tail(&sh->log_list, &log->no_mem_stripes);
18091e6d690bSSong Liu 			spin_unlock_irq(&log->io_list_lock);
18101e6d690bSSong Liu 		}
18111e6d690bSSong Liu 	}
18121e6d690bSSong Liu 
18131e6d690bSSong Liu 	mutex_unlock(&log->io_mutex);
18141e6d690bSSong Liu 	return 0;
18152ded3703SSong Liu }
18162ded3703SSong Liu 
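/*
 * Load the log at array start: read the meta block pointed to by the
 * superblock's journal_tail. If it is missing or invalid, start a fresh log
 * with an empty meta block at sector 0; otherwise take last_cp_seq from it.
 * Then set up the space accounting and replay the log via r5l_recovery_log().
 */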
1817f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log)
1818f6bed0efSShaohua Li {
1819f6bed0efSShaohua Li 	struct md_rdev *rdev = log->rdev;
1820f6bed0efSShaohua Li 	struct page *page;
1821f6bed0efSShaohua Li 	struct r5l_meta_block *mb;
1822f6bed0efSShaohua Li 	sector_t cp = log->rdev->journal_tail;
1823f6bed0efSShaohua Li 	u32 stored_crc, expected_crc;
1824f6bed0efSShaohua Li 	bool create_super = false;
1825f6bed0efSShaohua Li 	int ret;
1826f6bed0efSShaohua Li 
1827f6bed0efSShaohua Li 	/* Make sure it's valid */
1828f6bed0efSShaohua Li 	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1829f6bed0efSShaohua Li 		cp = 0;
1830f6bed0efSShaohua Li 	page = alloc_page(GFP_KERNEL);
1831f6bed0efSShaohua Li 	if (!page)
1832f6bed0efSShaohua Li 		return -ENOMEM;
1833f6bed0efSShaohua Li 
1834796a5cf0SMike Christie 	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
1835f6bed0efSShaohua Li 		ret = -EIO;
1836f6bed0efSShaohua Li 		goto ioerr;
1837f6bed0efSShaohua Li 	}
1838f6bed0efSShaohua Li 	mb = page_address(page);
1839f6bed0efSShaohua Li 
1840f6bed0efSShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1841f6bed0efSShaohua Li 	    mb->version != R5LOG_VERSION) {
1842f6bed0efSShaohua Li 		create_super = true;
1843f6bed0efSShaohua Li 		goto create;
1844f6bed0efSShaohua Li 	}
1845f6bed0efSShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
1846f6bed0efSShaohua Li 	mb->checksum = 0;
18475cb2fbd6SShaohua Li 	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1848f6bed0efSShaohua Li 	if (stored_crc != expected_crc) {
1849f6bed0efSShaohua Li 		create_super = true;
1850f6bed0efSShaohua Li 		goto create;
1851f6bed0efSShaohua Li 	}
1852f6bed0efSShaohua Li 	if (le64_to_cpu(mb->position) != cp) {
1853f6bed0efSShaohua Li 		create_super = true;
1854f6bed0efSShaohua Li 		goto create;
1855f6bed0efSShaohua Li 	}
1856f6bed0efSShaohua Li create:
1857f6bed0efSShaohua Li 	if (create_super) {
1858f6bed0efSShaohua Li 		log->last_cp_seq = prandom_u32();
1859f6bed0efSShaohua Li 		cp = 0;
186056056c2eSZhengyuan Liu 		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
1861f6bed0efSShaohua Li 		/*
1862f6bed0efSShaohua Li 		 * Make sure the superblock points to the correct address. The
1863f6bed0efSShaohua Li 		 * log might have data very soon. If the superblock doesn't have
1864f6bed0efSShaohua Li 		 * the correct log tail address, recovery can't find the log
1865f6bed0efSShaohua Li 		 */
1866f6bed0efSShaohua Li 		r5l_write_super(log, cp);
1867f6bed0efSShaohua Li 	} else
1868f6bed0efSShaohua Li 		log->last_cp_seq = le64_to_cpu(mb->seq);
1869f6bed0efSShaohua Li 
1870f6bed0efSShaohua Li 	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
18710576b1c6SShaohua Li 	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
18720576b1c6SShaohua Li 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
18730576b1c6SShaohua Li 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1874f6bed0efSShaohua Li 	log->last_checkpoint = cp;
187528cd88e2SZhengyuan Liu 	log->next_checkpoint = cp;
1876*a39f7afdSSong Liu 	mutex_lock(&log->io_mutex);
1877*a39f7afdSSong Liu 	r5c_update_log_state(log);
1878*a39f7afdSSong Liu 	mutex_unlock(&log->io_mutex);
1879f6bed0efSShaohua Li 
1880f6bed0efSShaohua Li 	__free_page(page);
1881f6bed0efSShaohua Li 
1882f6bed0efSShaohua Li 	return r5l_recovery_log(log);
1883f6bed0efSShaohua Li ioerr:
1884f6bed0efSShaohua Li 	__free_page(page);
1885f6bed0efSShaohua Li 	return ret;
1886f6bed0efSShaohua Li }
1887f6bed0efSShaohua Li 
1888f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1889f6bed0efSShaohua Li {
1890c888a8f9SJens Axboe 	struct request_queue *q = bdev_get_queue(rdev->bdev);
1891f6bed0efSShaohua Li 	struct r5l_log *log;
1892f6bed0efSShaohua Li 
1893f6bed0efSShaohua Li 	if (PAGE_SIZE != 4096)
1894f6bed0efSShaohua Li 		return -EINVAL;
1895c757ec95SSong Liu 
1896c757ec95SSong Liu 	/*
1897c757ec95SSong Liu 	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
1898c757ec95SSong Liu 	 * raid_disks r5l_payload_data_parity.
1899c757ec95SSong Liu 	 *
1900c757ec95SSong Liu 	 * The write journal and cache do not work for very big arrays
1901c757ec95SSong Liu 	 * (raid_disks > 203)
1902c757ec95SSong Liu 	 */
1903c757ec95SSong Liu 	if (sizeof(struct r5l_meta_block) +
1904c757ec95SSong Liu 	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
1905c757ec95SSong Liu 	     conf->raid_disks) > PAGE_SIZE) {
1906c757ec95SSong Liu 		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
1907c757ec95SSong Liu 		       mdname(conf->mddev), conf->raid_disks);
1908c757ec95SSong Liu 		return -EINVAL;
1909c757ec95SSong Liu 	}
1910c757ec95SSong Liu 
1911f6bed0efSShaohua Li 	log = kzalloc(sizeof(*log), GFP_KERNEL);
1912f6bed0efSShaohua Li 	if (!log)
1913f6bed0efSShaohua Li 		return -ENOMEM;
1914f6bed0efSShaohua Li 	log->rdev = rdev;
1915f6bed0efSShaohua Li 
1916c888a8f9SJens Axboe 	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
191756fef7c6SChristoph Hellwig 
19185cb2fbd6SShaohua Li 	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1919f6bed0efSShaohua Li 				       sizeof(rdev->mddev->uuid));
1920f6bed0efSShaohua Li 
1921f6bed0efSShaohua Li 	mutex_init(&log->io_mutex);
1922f6bed0efSShaohua Li 
1923f6bed0efSShaohua Li 	spin_lock_init(&log->io_list_lock);
1924f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->running_ios);
19250576b1c6SShaohua Li 	INIT_LIST_HEAD(&log->io_end_ios);
1926a8c34f91SShaohua Li 	INIT_LIST_HEAD(&log->flushing_ios);
192704732f74SChristoph Hellwig 	INIT_LIST_HEAD(&log->finished_ios);
1928a8c34f91SShaohua Li 	bio_init(&log->flush_bio);
1929f6bed0efSShaohua Li 
1930f6bed0efSShaohua Li 	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1931f6bed0efSShaohua Li 	if (!log->io_kc)
1932f6bed0efSShaohua Li 		goto io_kc;
1933f6bed0efSShaohua Li 
19345036c390SChristoph Hellwig 	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
19355036c390SChristoph Hellwig 	if (!log->io_pool)
19365036c390SChristoph Hellwig 		goto io_pool;
19375036c390SChristoph Hellwig 
1938c38d29b3SChristoph Hellwig 	log->bs = bioset_create(R5L_POOL_SIZE, 0);
1939c38d29b3SChristoph Hellwig 	if (!log->bs)
1940c38d29b3SChristoph Hellwig 		goto io_bs;
1941c38d29b3SChristoph Hellwig 
1942e8deb638SChristoph Hellwig 	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
1943e8deb638SChristoph Hellwig 	if (!log->meta_pool)
1944e8deb638SChristoph Hellwig 		goto out_mempool;
1945e8deb638SChristoph Hellwig 
19460576b1c6SShaohua Li 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
19470576b1c6SShaohua Li 						 log->rdev->mddev, "reclaim");
19480576b1c6SShaohua Li 	if (!log->reclaim_thread)
19490576b1c6SShaohua Li 		goto reclaim_thread;
1950*a39f7afdSSong Liu 	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1951*a39f7afdSSong Liu 
19520fd22b45SShaohua Li 	init_waitqueue_head(&log->iounit_wait);
19530576b1c6SShaohua Li 
19545036c390SChristoph Hellwig 	INIT_LIST_HEAD(&log->no_mem_stripes);
19555036c390SChristoph Hellwig 
1956f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->no_space_stripes);
1957f6bed0efSShaohua Li 	spin_lock_init(&log->no_space_stripes_lock);
1958f6bed0efSShaohua Li 
19592ded3703SSong Liu 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1960*a39f7afdSSong Liu 	INIT_LIST_HEAD(&log->stripe_in_journal_list);
1961*a39f7afdSSong Liu 	spin_lock_init(&log->stripe_in_journal_lock);
1962*a39f7afdSSong Liu 	atomic_set(&log->stripe_in_journal_count, 0);
19632ded3703SSong Liu 
1964f6bed0efSShaohua Li 	if (r5l_load_log(log))
1965f6bed0efSShaohua Li 		goto error;
1966f6bed0efSShaohua Li 
1967f6b6ec5cSShaohua Li 	rcu_assign_pointer(conf->log, log);
1968a62ab49eSShaohua Li 	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1969f6bed0efSShaohua Li 	return 0;
1970e8deb638SChristoph Hellwig 
1971f6bed0efSShaohua Li error:
19720576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
19730576b1c6SShaohua Li reclaim_thread:
1974e8deb638SChristoph Hellwig 	mempool_destroy(log->meta_pool);
1975e8deb638SChristoph Hellwig out_mempool:
1976c38d29b3SChristoph Hellwig 	bioset_free(log->bs);
1977c38d29b3SChristoph Hellwig io_bs:
19785036c390SChristoph Hellwig 	mempool_destroy(log->io_pool);
19795036c390SChristoph Hellwig io_pool:
1980f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1981f6bed0efSShaohua Li io_kc:
1982f6bed0efSShaohua Li 	kfree(log);
1983f6bed0efSShaohua Li 	return -EINVAL;
1984f6bed0efSShaohua Li }
1985f6bed0efSShaohua Li 
1986f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log)
1987f6bed0efSShaohua Li {
19880576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
1989e8deb638SChristoph Hellwig 	mempool_destroy(log->meta_pool);
1990c38d29b3SChristoph Hellwig 	bioset_free(log->bs);
19915036c390SChristoph Hellwig 	mempool_destroy(log->io_pool);
1992f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1993f6bed0efSShaohua Li 	kfree(log);
1994f6bed0efSShaohua Li }
1995