xref: /linux/drivers/md/raid5-cache.c (revision e8deb6381051bf3ce9d817020e8ba972b405a070)
1f6bed0efSShaohua Li /*
2f6bed0efSShaohua Li  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3f6bed0efSShaohua Li  *
4f6bed0efSShaohua Li  * This program is free software; you can redistribute it and/or modify it
5f6bed0efSShaohua Li  * under the terms and conditions of the GNU General Public License,
6f6bed0efSShaohua Li  * version 2, as published by the Free Software Foundation.
7f6bed0efSShaohua Li  *
8f6bed0efSShaohua Li  * This program is distributed in the hope it will be useful, but WITHOUT
9f6bed0efSShaohua Li  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10f6bed0efSShaohua Li  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11f6bed0efSShaohua Li  * more details.
12f6bed0efSShaohua Li  *
13f6bed0efSShaohua Li  */
14f6bed0efSShaohua Li #include <linux/kernel.h>
15f6bed0efSShaohua Li #include <linux/wait.h>
16f6bed0efSShaohua Li #include <linux/blkdev.h>
17f6bed0efSShaohua Li #include <linux/slab.h>
18f6bed0efSShaohua Li #include <linux/raid/md_p.h>
195cb2fbd6SShaohua Li #include <linux/crc32c.h>
20f6bed0efSShaohua Li #include <linux/random.h>
21f6bed0efSShaohua Li #include "md.h"
22f6bed0efSShaohua Li #include "raid5.h"
23f6bed0efSShaohua Li 
24f6bed0efSShaohua Li /*
25f6bed0efSShaohua Li  * metadata/data are stored on disk in 4k-sized units (blocks) regardless
26f6bed0efSShaohua Li  * of the underlying hardware sector size. This only works with PAGE_SIZE == 4096
27f6bed0efSShaohua Li  */
28f6bed0efSShaohua Li #define BLOCK_SECTORS (8)
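/*
 * e.g. a 4096-byte block covers 4096 / 512 = 8 sectors, hence
 * BLOCK_SECTORS == 8.
 */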
29f6bed0efSShaohua Li 
300576b1c6SShaohua Li /*
310576b1c6SShaohua Li  * reclaim runs once 1/4 of the disk size or 10G of space is reclaimable,
320576b1c6SShaohua Li  * whichever is smaller. This prevents recovery from scanning a very long log
330576b1c6SShaohua Li  */
340576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
350576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
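/*
 * Illustrative numbers: r5l_load_log() sets max_free_space to
 * device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT, capped at
 * RECLAIM_MAX_FREE_SPACE. So a 16G journal starts reclaim once about
 * 4G is reclaimable, while anything above 40G is capped at 10G.
 */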
360576b1c6SShaohua Li 
37c38d29b3SChristoph Hellwig /*
38c38d29b3SChristoph Hellwig  * We only need 2 bios per I/O unit to make progress, but ensure we
39c38d29b3SChristoph Hellwig  * have a few more available to not get too tight.
40c38d29b3SChristoph Hellwig  */
41c38d29b3SChristoph Hellwig #define R5L_POOL_SIZE	4
42c38d29b3SChristoph Hellwig 
43f6bed0efSShaohua Li struct r5l_log {
44f6bed0efSShaohua Li 	struct md_rdev *rdev;
45f6bed0efSShaohua Li 
46f6bed0efSShaohua Li 	u32 uuid_checksum;
47f6bed0efSShaohua Li 
48f6bed0efSShaohua Li 	sector_t device_size;		/* log device size, rounded down
49f6bed0efSShaohua Li 					 * to BLOCK_SECTORS */
500576b1c6SShaohua Li 	sector_t max_free_space;	/* reclaim runs once this much space
510576b1c6SShaohua Li 					 * is reclaimable */
52f6bed0efSShaohua Li 
53f6bed0efSShaohua Li 	sector_t last_checkpoint;	/* log tail. where recovery scan
54f6bed0efSShaohua Li 					 * starts from */
55f6bed0efSShaohua Li 	u64 last_cp_seq;		/* log tail sequence */
56f6bed0efSShaohua Li 
57f6bed0efSShaohua Li 	sector_t log_start;		/* log head. where new data appends */
58f6bed0efSShaohua Li 	u64 seq;			/* log head sequence */
59f6bed0efSShaohua Li 
6017036461SChristoph Hellwig 	sector_t next_checkpoint;
6117036461SChristoph Hellwig 	u64 next_cp_seq;
6217036461SChristoph Hellwig 
63f6bed0efSShaohua Li 	struct mutex io_mutex;
64f6bed0efSShaohua Li 	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
65f6bed0efSShaohua Li 
66f6bed0efSShaohua Li 	spinlock_t io_list_lock;
67f6bed0efSShaohua Li 	struct list_head running_ios;	/* io_units which are still running,
68f6bed0efSShaohua Li 					 * and have not yet been completely
69f6bed0efSShaohua Li 					 * written to the log */
70f6bed0efSShaohua Li 	struct list_head io_end_ios;	/* io_units which have been completely
71f6bed0efSShaohua Li 					 * written to the log but not yet written
72f6bed0efSShaohua Li 					 * to the RAID */
73a8c34f91SShaohua Li 	struct list_head flushing_ios;	/* io_units which are waiting for log
74a8c34f91SShaohua Li 					 * cache flush */
7504732f74SChristoph Hellwig 	struct list_head finished_ios;	/* io_units which have settled in the log disk */
76a8c34f91SShaohua Li 	struct bio flush_bio;
77f6bed0efSShaohua Li 
78f6bed0efSShaohua Li 	struct kmem_cache *io_kc;
79c38d29b3SChristoph Hellwig 	struct bio_set *bs;
80*e8deb638SChristoph Hellwig 	mempool_t *meta_pool;
81f6bed0efSShaohua Li 
820576b1c6SShaohua Li 	struct md_thread *reclaim_thread;
830576b1c6SShaohua Li 	unsigned long reclaim_target;	/* amount of space that needs to be
840576b1c6SShaohua Li 					 * reclaimed.  if it's 0, reclaim the space
850576b1c6SShaohua Li 					 * used by io_units which are already in
860576b1c6SShaohua Li 					 * IO_UNIT_STRIPE_END state (i.e. reclaim
870576b1c6SShaohua Li 					 * doesn't wait for a specific io_unit
880576b1c6SShaohua Li 					 * to switch to IO_UNIT_STRIPE_END
890576b1c6SShaohua Li 					 * state) */
900fd22b45SShaohua Li 	wait_queue_head_t iounit_wait;
910576b1c6SShaohua Li 
92f6bed0efSShaohua Li 	struct list_head no_space_stripes; /* pending stripes, log has no space */
93f6bed0efSShaohua Li 	spinlock_t no_space_stripes_lock;
9456fef7c6SChristoph Hellwig 
9556fef7c6SChristoph Hellwig 	bool need_cache_flush;
964b482044SShaohua Li 	bool in_teardown;
97f6bed0efSShaohua Li };
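/*
 * Rough picture of the log ring: last_checkpoint is the tail (where
 * recovery starts scanning), log_start is the head (where new io_units
 * are appended). The space from the head forward around the ring to the
 * tail is free; reclaim advances the tail to make more room.
 */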
98f6bed0efSShaohua Li 
99f6bed0efSShaohua Li /*
100f6bed0efSShaohua Li  * an IO range starts at a meta data block and ends at the next meta data
101f6bed0efSShaohua Li  * block. The io_unit's meta data block tracks the data/parity that follows
102f6bed0efSShaohua Li  * it. An io_unit is written to the log disk with a normal write; as we always
103f6bed0efSShaohua Li  * flush the log disk before we start moving data to the raid disks, there is
104f6bed0efSShaohua Li  * no requirement to write the io_unit with FLUSH/FUA
105f6bed0efSShaohua Li  */
106f6bed0efSShaohua Li struct r5l_io_unit {
107f6bed0efSShaohua Li 	struct r5l_log *log;
108f6bed0efSShaohua Li 
109f6bed0efSShaohua Li 	struct page *meta_page;	/* store meta block */
110f6bed0efSShaohua Li 	int meta_offset;	/* current offset in meta_page */
111f6bed0efSShaohua Li 
112f6bed0efSShaohua Li 	struct bio *current_bio;/* current_bio accepting new data */
113f6bed0efSShaohua Li 
114f6bed0efSShaohua Li 	atomic_t pending_stripe;/* how many stripes not flushed to raid */
115f6bed0efSShaohua Li 	u64 seq;		/* seq number of the metablock */
116f6bed0efSShaohua Li 	sector_t log_start;	/* where the io_unit starts */
117f6bed0efSShaohua Li 	sector_t log_end;	/* where the io_unit ends */
118f6bed0efSShaohua Li 	struct list_head log_sibling; /* log->running_ios */
119f6bed0efSShaohua Li 	struct list_head stripe_list; /* stripes added to the io_unit */
120f6bed0efSShaohua Li 
121f6bed0efSShaohua Li 	int state;
1226143e2ceSChristoph Hellwig 	bool need_split_bio;
123f6bed0efSShaohua Li };
124f6bed0efSShaohua Li 
125f6bed0efSShaohua Li /* r5l_io_unit state */
126f6bed0efSShaohua Li enum r5l_io_unit_state {
127f6bed0efSShaohua Li 	IO_UNIT_RUNNING = 0,	/* accepting new IO */
128f6bed0efSShaohua Li 	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to log,
129f6bed0efSShaohua Li 				 * no longer accepting new bios */
130f6bed0efSShaohua Li 	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
131a8c34f91SShaohua Li 	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
132f6bed0efSShaohua Li };
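/*
 * Typical io_unit life cycle:
 *   IO_UNIT_RUNNING    - r5l_new_meta()/r5l_log_stripe() add payloads
 *   IO_UNIT_IO_START   - r5l_submit_current_io() submitted the log bio
 *   IO_UNIT_IO_END     - r5l_log_endio() saw the log write complete
 *   IO_UNIT_STRIPE_END - all of the io_unit's stripes reached the raid
 *                        disks (r5l_stripe_write_finished())
 */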
133f6bed0efSShaohua Li 
134f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
135f6bed0efSShaohua Li {
136f6bed0efSShaohua Li 	start += inc;
137f6bed0efSShaohua Li 	if (start >= log->device_size)
138f6bed0efSShaohua Li 		start = start - log->device_size;
139f6bed0efSShaohua Li 	return start;
140f6bed0efSShaohua Li }
141f6bed0efSShaohua Li 
142f6bed0efSShaohua Li static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
143f6bed0efSShaohua Li 				  sector_t end)
144f6bed0efSShaohua Li {
145f6bed0efSShaohua Li 	if (end >= start)
146f6bed0efSShaohua Li 		return end - start;
147f6bed0efSShaohua Li 	else
148f6bed0efSShaohua Li 		return end + log->device_size - start;
149f6bed0efSShaohua Li }
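/*
 * Example with device_size == 1000 sectors: r5l_ring_add(log, 992, 16)
 * wraps to 8, and r5l_ring_distance(log, 992, 8) is 8 + 1000 - 992 = 16.
 */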
150f6bed0efSShaohua Li 
151f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
152f6bed0efSShaohua Li {
153f6bed0efSShaohua Li 	sector_t used_size;
154f6bed0efSShaohua Li 
155f6bed0efSShaohua Li 	used_size = r5l_ring_distance(log, log->last_checkpoint,
156f6bed0efSShaohua Li 					log->log_start);
157f6bed0efSShaohua Li 
158f6bed0efSShaohua Li 	return log->device_size > used_size + size;
159f6bed0efSShaohua Li }
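/*
 * Continuing the example above: with last_checkpoint == 992 and
 * log_start == 8 the log holds 16 used sectors, so a reservation of
 * "size" sectors only fits while 16 + size stays below 1000.
 */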
160f6bed0efSShaohua Li 
161f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
162f6bed0efSShaohua Li 				    enum r5l_io_unit_state state)
163f6bed0efSShaohua Li {
164f6bed0efSShaohua Li 	if (WARN_ON(io->state >= state))
165f6bed0efSShaohua Li 		return;
166f6bed0efSShaohua Li 	io->state = state;
167f6bed0efSShaohua Li }
168f6bed0efSShaohua Li 
169d8858f43SChristoph Hellwig static void r5l_io_run_stripes(struct r5l_io_unit *io)
170d8858f43SChristoph Hellwig {
171d8858f43SChristoph Hellwig 	struct stripe_head *sh, *next;
172d8858f43SChristoph Hellwig 
173d8858f43SChristoph Hellwig 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
174d8858f43SChristoph Hellwig 		list_del_init(&sh->log_list);
175d8858f43SChristoph Hellwig 		set_bit(STRIPE_HANDLE, &sh->state);
176d8858f43SChristoph Hellwig 		raid5_release_stripe(sh);
177d8858f43SChristoph Hellwig 	}
178d8858f43SChristoph Hellwig }
179d8858f43SChristoph Hellwig 
18056fef7c6SChristoph Hellwig static void r5l_log_run_stripes(struct r5l_log *log)
18156fef7c6SChristoph Hellwig {
18256fef7c6SChristoph Hellwig 	struct r5l_io_unit *io, *next;
18356fef7c6SChristoph Hellwig 
18456fef7c6SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
18556fef7c6SChristoph Hellwig 
18656fef7c6SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
18756fef7c6SChristoph Hellwig 		/* don't change list order */
18856fef7c6SChristoph Hellwig 		if (io->state < IO_UNIT_IO_END)
18956fef7c6SChristoph Hellwig 			break;
19056fef7c6SChristoph Hellwig 
19156fef7c6SChristoph Hellwig 		list_move_tail(&io->log_sibling, &log->finished_ios);
19256fef7c6SChristoph Hellwig 		r5l_io_run_stripes(io);
19356fef7c6SChristoph Hellwig 	}
19456fef7c6SChristoph Hellwig }
19556fef7c6SChristoph Hellwig 
1963848c0bcSChristoph Hellwig static void r5l_move_to_end_ios(struct r5l_log *log)
1973848c0bcSChristoph Hellwig {
1983848c0bcSChristoph Hellwig 	struct r5l_io_unit *io, *next;
1993848c0bcSChristoph Hellwig 
2003848c0bcSChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
2013848c0bcSChristoph Hellwig 
2023848c0bcSChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
2033848c0bcSChristoph Hellwig 		/* don't change list order */
2043848c0bcSChristoph Hellwig 		if (io->state < IO_UNIT_IO_END)
2053848c0bcSChristoph Hellwig 			break;
2063848c0bcSChristoph Hellwig 		list_move_tail(&io->log_sibling, &log->io_end_ios);
2073848c0bcSChristoph Hellwig 	}
2083848c0bcSChristoph Hellwig }
2093848c0bcSChristoph Hellwig 
210f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio)
211f6bed0efSShaohua Li {
212f6bed0efSShaohua Li 	struct r5l_io_unit *io = bio->bi_private;
213f6bed0efSShaohua Li 	struct r5l_log *log = io->log;
214509ffec7SChristoph Hellwig 	unsigned long flags;
215f6bed0efSShaohua Li 
2166e74a9cfSShaohua Li 	if (bio->bi_error)
2176e74a9cfSShaohua Li 		md_error(log->rdev->mddev, log->rdev);
2186e74a9cfSShaohua Li 
219f6bed0efSShaohua Li 	bio_put(bio);
220*e8deb638SChristoph Hellwig 	mempool_free(io->meta_page, log->meta_pool);
221f6bed0efSShaohua Li 
222509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
223509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
22456fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
2253848c0bcSChristoph Hellwig 		r5l_move_to_end_ios(log);
22656fef7c6SChristoph Hellwig 	else
22756fef7c6SChristoph Hellwig 		r5l_log_run_stripes(log);
228509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
229509ffec7SChristoph Hellwig 
23056fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
231f6bed0efSShaohua Li 		md_wakeup_thread(log->rdev->mddev->thread);
232f6bed0efSShaohua Li }
233f6bed0efSShaohua Li 
234f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log)
235f6bed0efSShaohua Li {
236f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
237f6bed0efSShaohua Li 	struct r5l_meta_block *block;
238509ffec7SChristoph Hellwig 	unsigned long flags;
239f6bed0efSShaohua Li 	u32 crc;
240f6bed0efSShaohua Li 
241f6bed0efSShaohua Li 	if (!io)
242f6bed0efSShaohua Li 		return;
243f6bed0efSShaohua Li 
244f6bed0efSShaohua Li 	block = page_address(io->meta_page);
245f6bed0efSShaohua Li 	block->meta_size = cpu_to_le32(io->meta_offset);
2465cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
247f6bed0efSShaohua Li 	block->checksum = cpu_to_le32(crc);
248f6bed0efSShaohua Li 
249f6bed0efSShaohua Li 	log->current_io = NULL;
250509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
251509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
252509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
253f6bed0efSShaohua Li 
2546143e2ceSChristoph Hellwig 	submit_bio(WRITE, io->current_bio);
255f6bed0efSShaohua Li }
256f6bed0efSShaohua Li 
2576143e2ceSChristoph Hellwig static struct bio *r5l_bio_alloc(struct r5l_log *log)
258b349feb3SChristoph Hellwig {
259c38d29b3SChristoph Hellwig 	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
260b349feb3SChristoph Hellwig 
261b349feb3SChristoph Hellwig 	bio->bi_rw = WRITE;
262b349feb3SChristoph Hellwig 	bio->bi_bdev = log->rdev->bdev;
2631e932a37SChristoph Hellwig 	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
264b349feb3SChristoph Hellwig 
265b349feb3SChristoph Hellwig 	return bio;
266b349feb3SChristoph Hellwig }
267b349feb3SChristoph Hellwig 
268c1b99198SChristoph Hellwig static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
269c1b99198SChristoph Hellwig {
270c1b99198SChristoph Hellwig 	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
271c1b99198SChristoph Hellwig 
272c1b99198SChristoph Hellwig 	/*
273c1b99198SChristoph Hellwig 	 * If we filled up the log device, start from the beginning again,
274c1b99198SChristoph Hellwig 	 * which will require a new bio.
275c1b99198SChristoph Hellwig 	 *
276c1b99198SChristoph Hellwig 	 * Note: for this to work properly the log size needs to be a multiple
277c1b99198SChristoph Hellwig 	 * of BLOCK_SECTORS.
278c1b99198SChristoph Hellwig 	 */
279c1b99198SChristoph Hellwig 	if (log->log_start == 0)
2806143e2ceSChristoph Hellwig 		io->need_split_bio = true;
281c1b99198SChristoph Hellwig 
282c1b99198SChristoph Hellwig 	io->log_end = log->log_start;
283c1b99198SChristoph Hellwig }
284c1b99198SChristoph Hellwig 
285f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
286f6bed0efSShaohua Li {
287f6bed0efSShaohua Li 	struct r5l_io_unit *io;
288f6bed0efSShaohua Li 	struct r5l_meta_block *block;
289f6bed0efSShaohua Li 
29051039cd0SChristoph Hellwig 	/* We can't handle memory allocation failure so far */
29151039cd0SChristoph Hellwig 	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
29251039cd0SChristoph Hellwig 	io->log = log;
29351039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->log_sibling);
29451039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->stripe_list);
29551039cd0SChristoph Hellwig 	io->state = IO_UNIT_RUNNING;
296f6bed0efSShaohua Li 
297*e8deb638SChristoph Hellwig 	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
298f6bed0efSShaohua Li 	block = page_address(io->meta_page);
299*e8deb638SChristoph Hellwig 	clear_page(block);
300f6bed0efSShaohua Li 	block->magic = cpu_to_le32(R5LOG_MAGIC);
301f6bed0efSShaohua Li 	block->version = R5LOG_VERSION;
302f6bed0efSShaohua Li 	block->seq = cpu_to_le64(log->seq);
303f6bed0efSShaohua Li 	block->position = cpu_to_le64(log->log_start);
304f6bed0efSShaohua Li 
305f6bed0efSShaohua Li 	io->log_start = log->log_start;
306f6bed0efSShaohua Li 	io->meta_offset = sizeof(struct r5l_meta_block);
3072b8ef16eSChristoph Hellwig 	io->seq = log->seq++;
308f6bed0efSShaohua Li 
3096143e2ceSChristoph Hellwig 	io->current_bio = r5l_bio_alloc(log);
3106143e2ceSChristoph Hellwig 	io->current_bio->bi_end_io = r5l_log_endio;
3116143e2ceSChristoph Hellwig 	io->current_bio->bi_private = io;
312b349feb3SChristoph Hellwig 	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
313f6bed0efSShaohua Li 
314c1b99198SChristoph Hellwig 	r5_reserve_log_entry(log, io);
315f6bed0efSShaohua Li 
316f6bed0efSShaohua Li 	spin_lock_irq(&log->io_list_lock);
317f6bed0efSShaohua Li 	list_add_tail(&io->log_sibling, &log->running_ios);
318f6bed0efSShaohua Li 	spin_unlock_irq(&log->io_list_lock);
319f6bed0efSShaohua Li 
320f6bed0efSShaohua Li 	return io;
321f6bed0efSShaohua Li }
322f6bed0efSShaohua Li 
323f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
324f6bed0efSShaohua Li {
32522581f58SChristoph Hellwig 	if (log->current_io &&
32622581f58SChristoph Hellwig 	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
327f6bed0efSShaohua Li 		r5l_submit_current_io(log);
328f6bed0efSShaohua Li 
32922581f58SChristoph Hellwig 	if (!log->current_io)
330f6bed0efSShaohua Li 		log->current_io = r5l_new_meta(log);
331f6bed0efSShaohua Li 	return 0;
332f6bed0efSShaohua Li }
333f6bed0efSShaohua Li 
334f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
335f6bed0efSShaohua Li 				    sector_t location,
336f6bed0efSShaohua Li 				    u32 checksum1, u32 checksum2,
337f6bed0efSShaohua Li 				    bool checksum2_valid)
338f6bed0efSShaohua Li {
339f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
340f6bed0efSShaohua Li 	struct r5l_payload_data_parity *payload;
341f6bed0efSShaohua Li 
342f6bed0efSShaohua Li 	payload = page_address(io->meta_page) + io->meta_offset;
343f6bed0efSShaohua Li 	payload->header.type = cpu_to_le16(type);
344f6bed0efSShaohua Li 	payload->header.flags = cpu_to_le16(0);
345f6bed0efSShaohua Li 	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
346f6bed0efSShaohua Li 				    (PAGE_SHIFT - 9));
347f6bed0efSShaohua Li 	payload->location = cpu_to_le64(location);
348f6bed0efSShaohua Li 	payload->checksum[0] = cpu_to_le32(checksum1);
349f6bed0efSShaohua Li 	if (checksum2_valid)
350f6bed0efSShaohua Li 		payload->checksum[1] = cpu_to_le32(checksum2);
351f6bed0efSShaohua Li 
352f6bed0efSShaohua Li 	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
353f6bed0efSShaohua Li 		sizeof(__le32) * (1 + !!checksum2_valid);
354f6bed0efSShaohua Li }
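/*
 * Rough layout of one meta block (a single page): a struct
 * r5l_meta_block header (its checksum covers the whole page), followed,
 * for each logged stripe, by one R5LOG_PAYLOAD_DATA entry per written
 * data page and a final R5LOG_PAYLOAD_PARITY entry carrying one or two
 * checksums (two when Q exists). meta_offset tracks how much of the
 * page is filled; the data/parity pages themselves occupy the log
 * blocks that follow the meta block.
 */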
355f6bed0efSShaohua Li 
356f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
357f6bed0efSShaohua Li {
358f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
359f6bed0efSShaohua Li 
3606143e2ceSChristoph Hellwig 	if (io->need_split_bio) {
3616143e2ceSChristoph Hellwig 		struct bio *prev = io->current_bio;
362f6bed0efSShaohua Li 
3636143e2ceSChristoph Hellwig 		io->current_bio = r5l_bio_alloc(log);
3646143e2ceSChristoph Hellwig 		bio_chain(io->current_bio, prev);
3656143e2ceSChristoph Hellwig 
3666143e2ceSChristoph Hellwig 		submit_bio(WRITE, prev);
367f6bed0efSShaohua Li 	}
368f6bed0efSShaohua Li 
3696143e2ceSChristoph Hellwig 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
3706143e2ceSChristoph Hellwig 		BUG();
3716143e2ceSChristoph Hellwig 
372c1b99198SChristoph Hellwig 	r5_reserve_log_entry(log, io);
373f6bed0efSShaohua Li }
374f6bed0efSShaohua Li 
375f6bed0efSShaohua Li static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
376f6bed0efSShaohua Li 			   int data_pages, int parity_pages)
377f6bed0efSShaohua Li {
378f6bed0efSShaohua Li 	int i;
379f6bed0efSShaohua Li 	int meta_size;
380f6bed0efSShaohua Li 	struct r5l_io_unit *io;
381f6bed0efSShaohua Li 
382f6bed0efSShaohua Li 	meta_size =
383f6bed0efSShaohua Li 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
384f6bed0efSShaohua Li 		 * data_pages) +
385f6bed0efSShaohua Li 		sizeof(struct r5l_payload_data_parity) +
386f6bed0efSShaohua Li 		sizeof(__le32) * parity_pages;
387f6bed0efSShaohua Li 
388f6bed0efSShaohua Li 	r5l_get_meta(log, meta_size);
389f6bed0efSShaohua Li 	io = log->current_io;
390f6bed0efSShaohua Li 
391f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
392f6bed0efSShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
393f6bed0efSShaohua Li 			continue;
394f6bed0efSShaohua Li 		if (i == sh->pd_idx || i == sh->qd_idx)
395f6bed0efSShaohua Li 			continue;
396f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
397f6bed0efSShaohua Li 					raid5_compute_blocknr(sh, i, 0),
398f6bed0efSShaohua Li 					sh->dev[i].log_checksum, 0, false);
399f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[i].page);
400f6bed0efSShaohua Li 	}
401f6bed0efSShaohua Li 
402f6bed0efSShaohua Li 	if (sh->qd_idx >= 0) {
403f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
404f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
405f6bed0efSShaohua Li 					sh->dev[sh->qd_idx].log_checksum, true);
406f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
407f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
408f6bed0efSShaohua Li 	} else {
409f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
410f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
411f6bed0efSShaohua Li 					0, false);
412f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
413f6bed0efSShaohua Li 	}
414f6bed0efSShaohua Li 
415f6bed0efSShaohua Li 	list_add_tail(&sh->log_list, &io->stripe_list);
416f6bed0efSShaohua Li 	atomic_inc(&io->pending_stripe);
417f6bed0efSShaohua Li 	sh->log_io = io;
418f6bed0efSShaohua Li }
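/*
 * Example: a RAID6 stripe is logged as one DATA payload (plus page) per
 * written data disk, then a single PARITY payload carrying both the P
 * and Q checksums followed by the P and Q pages. A RAID5 stripe gets
 * the same shape with a one-checksum PARITY payload and only the P page.
 */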
419f6bed0efSShaohua Li 
420509ffec7SChristoph Hellwig static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
421f6bed0efSShaohua Li /*
422f6bed0efSShaohua Li  * this runs in raid5d, and reclaim could in turn wait for raid5d (when it
423f6bed0efSShaohua Li  * flushes data from log to raid disks), so we must not wait for reclaim here
424f6bed0efSShaohua Li  */
425f6bed0efSShaohua Li int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
426f6bed0efSShaohua Li {
427f6bed0efSShaohua Li 	int write_disks = 0;
428f6bed0efSShaohua Li 	int data_pages, parity_pages;
429f6bed0efSShaohua Li 	int meta_size;
430f6bed0efSShaohua Li 	int reserve;
431f6bed0efSShaohua Li 	int i;
432f6bed0efSShaohua Li 
433f6bed0efSShaohua Li 	if (!log)
434f6bed0efSShaohua Li 		return -EAGAIN;
435f6bed0efSShaohua Li 	/* Don't support stripe batch */
436f6bed0efSShaohua Li 	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
437f6bed0efSShaohua Li 	    test_bit(STRIPE_SYNCING, &sh->state)) {
438f6bed0efSShaohua Li 		/* the stripe has been written to log; start writing it to raid */
439f6bed0efSShaohua Li 		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
440f6bed0efSShaohua Li 		return -EAGAIN;
441f6bed0efSShaohua Li 	}
442f6bed0efSShaohua Li 
443f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
444f6bed0efSShaohua Li 		void *addr;
445f6bed0efSShaohua Li 
446f6bed0efSShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
447f6bed0efSShaohua Li 			continue;
448f6bed0efSShaohua Li 		write_disks++;
449f6bed0efSShaohua Li 		/* checksum is already calculated in last run */
450f6bed0efSShaohua Li 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
451f6bed0efSShaohua Li 			continue;
452f6bed0efSShaohua Li 		addr = kmap_atomic(sh->dev[i].page);
4535cb2fbd6SShaohua Li 		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
454f6bed0efSShaohua Li 						    addr, PAGE_SIZE);
455f6bed0efSShaohua Li 		kunmap_atomic(addr);
456f6bed0efSShaohua Li 	}
457f6bed0efSShaohua Li 	parity_pages = 1 + !!(sh->qd_idx >= 0);
458f6bed0efSShaohua Li 	data_pages = write_disks - parity_pages;
459f6bed0efSShaohua Li 
460f6bed0efSShaohua Li 	meta_size =
461f6bed0efSShaohua Li 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
462f6bed0efSShaohua Li 		 * data_pages) +
463f6bed0efSShaohua Li 		sizeof(struct r5l_payload_data_parity) +
464f6bed0efSShaohua Li 		sizeof(__le32) * parity_pages;
465f6bed0efSShaohua Li 	/* Doesn't work with very big raid array */
466f6bed0efSShaohua Li 	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
467f6bed0efSShaohua Li 		return -EINVAL;
468f6bed0efSShaohua Li 
469f6bed0efSShaohua Li 	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
470253f9fd4SShaohua Li 	/*
471253f9fd4SShaohua Li 	 * The stripe must enter state machine again to finish the write, so
472253f9fd4SShaohua Li 	 * don't delay.
473253f9fd4SShaohua Li 	 */
474253f9fd4SShaohua Li 	clear_bit(STRIPE_DELAYED, &sh->state);
475f6bed0efSShaohua Li 	atomic_inc(&sh->count);
476f6bed0efSShaohua Li 
477f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
478f6bed0efSShaohua Li 	/* meta + data */
479f6bed0efSShaohua Li 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
480f6bed0efSShaohua Li 	if (r5l_has_free_space(log, reserve))
481f6bed0efSShaohua Li 		r5l_log_stripe(log, sh, data_pages, parity_pages);
482f6bed0efSShaohua Li 	else {
483f6bed0efSShaohua Li 		spin_lock(&log->no_space_stripes_lock);
484f6bed0efSShaohua Li 		list_add_tail(&sh->log_list, &log->no_space_stripes);
485f6bed0efSShaohua Li 		spin_unlock(&log->no_space_stripes_lock);
486f6bed0efSShaohua Li 
487f6bed0efSShaohua Li 		r5l_wake_reclaim(log, reserve);
488f6bed0efSShaohua Li 	}
489f6bed0efSShaohua Li 	mutex_unlock(&log->io_mutex);
490f6bed0efSShaohua Li 
491f6bed0efSShaohua Li 	return 0;
492f6bed0efSShaohua Li }
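/*
 * Reservation example: with 4K pages a page spans 1 << (PAGE_SHIFT - 9)
 * = 8 sectors, so a RAID5 stripe writing two data pages plus parity
 * (write_disks == 3) reserves (1 + 3) * 8 = 32 sectors: one meta block
 * plus the three payload pages.
 */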
493f6bed0efSShaohua Li 
494f6bed0efSShaohua Li void r5l_write_stripe_run(struct r5l_log *log)
495f6bed0efSShaohua Li {
496f6bed0efSShaohua Li 	if (!log)
497f6bed0efSShaohua Li 		return;
498f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
499f6bed0efSShaohua Li 	r5l_submit_current_io(log);
500f6bed0efSShaohua Li 	mutex_unlock(&log->io_mutex);
501f6bed0efSShaohua Li }
502f6bed0efSShaohua Li 
503828cbe98SShaohua Li int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
504828cbe98SShaohua Li {
505828cbe98SShaohua Li 	if (!log)
506828cbe98SShaohua Li 		return -ENODEV;
507828cbe98SShaohua Li 	/*
508828cbe98SShaohua Li 	 * we flush the log disk cache first, then write stripe data to raid disks.
509828cbe98SShaohua Li 	 * So once a bio is finished, the log disk cache has been flushed already.
510828cbe98SShaohua Li 	 * Recovery guarantees we can recover the bio from the log disk, so we
511828cbe98SShaohua Li 	 * don't need to flush again
512828cbe98SShaohua Li 	 */
513828cbe98SShaohua Li 	if (bio->bi_iter.bi_size == 0) {
514828cbe98SShaohua Li 		bio_endio(bio);
515828cbe98SShaohua Li 		return 0;
516828cbe98SShaohua Li 	}
517828cbe98SShaohua Li 	bio->bi_rw &= ~REQ_FLUSH;
518828cbe98SShaohua Li 	return -EAGAIN;
519828cbe98SShaohua Li }
520828cbe98SShaohua Li 
521f6bed0efSShaohua Li /* This will run after log space is reclaimed */
522f6bed0efSShaohua Li static void r5l_run_no_space_stripes(struct r5l_log *log)
523f6bed0efSShaohua Li {
524f6bed0efSShaohua Li 	struct stripe_head *sh;
525f6bed0efSShaohua Li 
526f6bed0efSShaohua Li 	spin_lock(&log->no_space_stripes_lock);
527f6bed0efSShaohua Li 	while (!list_empty(&log->no_space_stripes)) {
528f6bed0efSShaohua Li 		sh = list_first_entry(&log->no_space_stripes,
529f6bed0efSShaohua Li 				      struct stripe_head, log_list);
530f6bed0efSShaohua Li 		list_del_init(&sh->log_list);
531f6bed0efSShaohua Li 		set_bit(STRIPE_HANDLE, &sh->state);
532f6bed0efSShaohua Li 		raid5_release_stripe(sh);
533f6bed0efSShaohua Li 	}
534f6bed0efSShaohua Li 	spin_unlock(&log->no_space_stripes_lock);
535f6bed0efSShaohua Li }
536f6bed0efSShaohua Li 
53717036461SChristoph Hellwig static sector_t r5l_reclaimable_space(struct r5l_log *log)
53817036461SChristoph Hellwig {
53917036461SChristoph Hellwig 	return r5l_ring_distance(log, log->last_checkpoint,
54017036461SChristoph Hellwig 				 log->next_checkpoint);
54117036461SChristoph Hellwig }
54217036461SChristoph Hellwig 
54304732f74SChristoph Hellwig static bool r5l_complete_finished_ios(struct r5l_log *log)
54417036461SChristoph Hellwig {
54517036461SChristoph Hellwig 	struct r5l_io_unit *io, *next;
54617036461SChristoph Hellwig 	bool found = false;
54717036461SChristoph Hellwig 
54817036461SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
54917036461SChristoph Hellwig 
55004732f74SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
55117036461SChristoph Hellwig 		/* don't change list order */
55217036461SChristoph Hellwig 		if (io->state < IO_UNIT_STRIPE_END)
55317036461SChristoph Hellwig 			break;
55417036461SChristoph Hellwig 
55517036461SChristoph Hellwig 		log->next_checkpoint = io->log_start;
55617036461SChristoph Hellwig 		log->next_cp_seq = io->seq;
55717036461SChristoph Hellwig 
55817036461SChristoph Hellwig 		list_del(&io->log_sibling);
559ad66d445SChristoph Hellwig 		kmem_cache_free(log->io_kc, io);
56017036461SChristoph Hellwig 
56117036461SChristoph Hellwig 		found = true;
56217036461SChristoph Hellwig 	}
56317036461SChristoph Hellwig 
56417036461SChristoph Hellwig 	return found;
56517036461SChristoph Hellwig }
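/*
 * Example: if finished_ios holds io_units A, B and C in append order
 * and only B and C have reached IO_UNIT_STRIPE_END, the walk above
 * stops at A and the checkpoint does not advance; B's and C's space
 * stays unreclaimable until A's stripes also hit the raid disks.
 */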
56617036461SChristoph Hellwig 
567509ffec7SChristoph Hellwig static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
568509ffec7SChristoph Hellwig {
569509ffec7SChristoph Hellwig 	struct r5l_log *log = io->log;
570509ffec7SChristoph Hellwig 	unsigned long flags;
571509ffec7SChristoph Hellwig 
572509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
573509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
57417036461SChristoph Hellwig 
57504732f74SChristoph Hellwig 	if (!r5l_complete_finished_ios(log)) {
57685f2f9a4SShaohua Li 		spin_unlock_irqrestore(&log->io_list_lock, flags);
57785f2f9a4SShaohua Li 		return;
57885f2f9a4SShaohua Li 	}
579509ffec7SChristoph Hellwig 
58017036461SChristoph Hellwig 	if (r5l_reclaimable_space(log) > log->max_free_space)
581509ffec7SChristoph Hellwig 		r5l_wake_reclaim(log, 0);
582509ffec7SChristoph Hellwig 
583509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
584509ffec7SChristoph Hellwig 	wake_up(&log->iounit_wait);
585509ffec7SChristoph Hellwig }
586509ffec7SChristoph Hellwig 
5870576b1c6SShaohua Li void r5l_stripe_write_finished(struct stripe_head *sh)
5880576b1c6SShaohua Li {
5890576b1c6SShaohua Li 	struct r5l_io_unit *io;
5900576b1c6SShaohua Li 
5910576b1c6SShaohua Li 	io = sh->log_io;
5920576b1c6SShaohua Li 	sh->log_io = NULL;
5930576b1c6SShaohua Li 
594509ffec7SChristoph Hellwig 	if (io && atomic_dec_and_test(&io->pending_stripe))
595509ffec7SChristoph Hellwig 		__r5l_stripe_write_finished(io);
5960576b1c6SShaohua Li }
5970576b1c6SShaohua Li 
598a8c34f91SShaohua Li static void r5l_log_flush_endio(struct bio *bio)
599a8c34f91SShaohua Li {
600a8c34f91SShaohua Li 	struct r5l_log *log = container_of(bio, struct r5l_log,
601a8c34f91SShaohua Li 		flush_bio);
602a8c34f91SShaohua Li 	unsigned long flags;
603a8c34f91SShaohua Li 	struct r5l_io_unit *io;
604a8c34f91SShaohua Li 
6056e74a9cfSShaohua Li 	if (bio->bi_error)
6066e74a9cfSShaohua Li 		md_error(log->rdev->mddev, log->rdev);
6076e74a9cfSShaohua Li 
608a8c34f91SShaohua Li 	spin_lock_irqsave(&log->io_list_lock, flags);
609d8858f43SChristoph Hellwig 	list_for_each_entry(io, &log->flushing_ios, log_sibling)
610d8858f43SChristoph Hellwig 		r5l_io_run_stripes(io);
61104732f74SChristoph Hellwig 	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
612a8c34f91SShaohua Li 	spin_unlock_irqrestore(&log->io_list_lock, flags);
613a8c34f91SShaohua Li }
614a8c34f91SShaohua Li 
6150576b1c6SShaohua Li /*
6160576b1c6SShaohua Li  * Start dispatching IO to the raid disks.
6170576b1c6SShaohua Li  * The log consists of io_units, each headed by a meta block. One situation
6180576b1c6SShaohua Li  * we want to avoid: a broken meta in the middle of the log makes recovery
6190576b1c6SShaohua Li  * unable to find any meta after it. So if an operation requires a meta to
6200576b1c6SShaohua Li  * be persistent in the log, every meta before it must be persistent too.
6210576b1c6SShaohua Li  * A case is:
6220576b1c6SShaohua Li  *
6230576b1c6SShaohua Li  * stripe data/parity is in the log and we start writing the stripe to the
6240576b1c6SShaohua Li  * raid disks. That data/parity must be persistent in the log first.
6250576b1c6SShaohua Li  *
6260576b1c6SShaohua Li  * The solution is to strictly maintain io_unit list order: an io_unit's
6270576b1c6SShaohua Li  * stripes only go to raid after all earlier io_units' data/parity is in the log.
6280576b1c6SShaohua Li  */
6290576b1c6SShaohua Li void r5l_flush_stripe_to_raid(struct r5l_log *log)
6300576b1c6SShaohua Li {
631a8c34f91SShaohua Li 	bool do_flush;
63256fef7c6SChristoph Hellwig 
63356fef7c6SChristoph Hellwig 	if (!log || !log->need_cache_flush)
6340576b1c6SShaohua Li 		return;
6350576b1c6SShaohua Li 
636a8c34f91SShaohua Li 	spin_lock_irq(&log->io_list_lock);
637a8c34f91SShaohua Li 	/* flush bio is running */
638a8c34f91SShaohua Li 	if (!list_empty(&log->flushing_ios)) {
639a8c34f91SShaohua Li 		spin_unlock_irq(&log->io_list_lock);
6400576b1c6SShaohua Li 		return;
6410576b1c6SShaohua Li 	}
642a8c34f91SShaohua Li 	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
643a8c34f91SShaohua Li 	do_flush = !list_empty(&log->flushing_ios);
6440576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
645a8c34f91SShaohua Li 
646a8c34f91SShaohua Li 	if (!do_flush)
647a8c34f91SShaohua Li 		return;
648a8c34f91SShaohua Li 	bio_reset(&log->flush_bio);
649a8c34f91SShaohua Li 	log->flush_bio.bi_bdev = log->rdev->bdev;
650a8c34f91SShaohua Li 	log->flush_bio.bi_end_io = r5l_log_flush_endio;
651a8c34f91SShaohua Li 	submit_bio(WRITE_FLUSH, &log->flush_bio);
6520576b1c6SShaohua Li }
6530576b1c6SShaohua Li 
6540576b1c6SShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp);
6554b482044SShaohua Li static void r5l_write_super_and_discard_space(struct r5l_log *log,
6564b482044SShaohua Li 	sector_t end)
6574b482044SShaohua Li {
6584b482044SShaohua Li 	struct block_device *bdev = log->rdev->bdev;
6594b482044SShaohua Li 	struct mddev *mddev;
6604b482044SShaohua Li 
6614b482044SShaohua Li 	r5l_write_super(log, end);
6624b482044SShaohua Li 
6634b482044SShaohua Li 	if (!blk_queue_discard(bdev_get_queue(bdev)))
6644b482044SShaohua Li 		return;
6654b482044SShaohua Li 
6664b482044SShaohua Li 	mddev = log->rdev->mddev;
6674b482044SShaohua Li 	 * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
6684b482044SShaohua Li 	 * waits for this thread to finish. This thread waits for
6694b482044SShaohua Li 	 * MD_CHANGE_PENDING to be cleared, which is supposed to be done in
6704b482044SShaohua Li 	 * md_check_recovery(). md_check_recovery() tries to get
6714b482044SShaohua Li 	 * reconfig_mutex. Since r5l_quiesce already holds the mutex,
6724b482044SShaohua Li 	 * md_check_recovery() fails, so PENDING never gets cleared. The
6734b482044SShaohua Li 	 * in_teardown check works around this issue.
6744b482044SShaohua Li 	 * in_teardown check workaround this issue.
6754b482044SShaohua Li 	 */
6764b482044SShaohua Li 	if (!log->in_teardown) {
6774b482044SShaohua Li 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
6784b482044SShaohua Li 		set_bit(MD_CHANGE_PENDING, &mddev->flags);
6794b482044SShaohua Li 		md_wakeup_thread(mddev->thread);
6804b482044SShaohua Li 		wait_event(mddev->sb_wait,
6814b482044SShaohua Li 			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
6824b482044SShaohua Li 			log->in_teardown);
6834b482044SShaohua Li 		/*
6844b482044SShaohua Li 		 * r5l_quiesce could run after the in_teardown check and grab
6854b482044SShaohua Li 		 * the mutex first. The superblock might get updated twice.
6864b482044SShaohua Li 		 */
6874b482044SShaohua Li 		if (log->in_teardown)
6884b482044SShaohua Li 			md_update_sb(mddev, 1);
6894b482044SShaohua Li 	} else {
6904b482044SShaohua Li 		WARN_ON(!mddev_is_locked(mddev));
6914b482044SShaohua Li 		md_update_sb(mddev, 1);
6924b482044SShaohua Li 	}
6934b482044SShaohua Li 
6946e74a9cfSShaohua Li 	/* discard IO error really doesn't matter, ignore it */
6954b482044SShaohua Li 	if (log->last_checkpoint < end) {
6964b482044SShaohua Li 		blkdev_issue_discard(bdev,
6974b482044SShaohua Li 				log->last_checkpoint + log->rdev->data_offset,
6984b482044SShaohua Li 				end - log->last_checkpoint, GFP_NOIO, 0);
6994b482044SShaohua Li 	} else {
7004b482044SShaohua Li 		blkdev_issue_discard(bdev,
7014b482044SShaohua Li 				log->last_checkpoint + log->rdev->data_offset,
7024b482044SShaohua Li 				log->device_size - log->last_checkpoint,
7034b482044SShaohua Li 				GFP_NOIO, 0);
7044b482044SShaohua Li 		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
7054b482044SShaohua Li 				GFP_NOIO, 0);
7064b482044SShaohua Li 	}
7074b482044SShaohua Li }
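/*
 * Discard example: with a 1000-sector log, last_checkpoint == 900 and
 * end == 100, the reclaimed range wraps, so two discards are issued:
 * sectors [900, 1000) at the end of the log and [0, 100) at its start
 * (both offset by rdev->data_offset).
 */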
7084b482044SShaohua Li 
7094b482044SShaohua Li 
7100576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log)
7110576b1c6SShaohua Li {
7120576b1c6SShaohua Li 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
71317036461SChristoph Hellwig 	sector_t reclaimable;
71417036461SChristoph Hellwig 	sector_t next_checkpoint;
71517036461SChristoph Hellwig 	u64 next_cp_seq;
7160576b1c6SShaohua Li 
7170576b1c6SShaohua Li 	spin_lock_irq(&log->io_list_lock);
7180576b1c6SShaohua Li 	/*
7190576b1c6SShaohua Li 	 * move the proper io_units to the reclaim list. We should not change
7200576b1c6SShaohua Li 	 * the order. reclaimable/unreclaimable io_units can be mixed in the
7210576b1c6SShaohua Li 	 * list; we shouldn't reuse the space of an unreclaimable io_unit
7220576b1c6SShaohua Li 	 */
7230576b1c6SShaohua Li 	while (1) {
72417036461SChristoph Hellwig 		reclaimable = r5l_reclaimable_space(log);
72517036461SChristoph Hellwig 		if (reclaimable >= reclaim_target ||
7260576b1c6SShaohua Li 		    (list_empty(&log->running_ios) &&
7270576b1c6SShaohua Li 		     list_empty(&log->io_end_ios) &&
728a8c34f91SShaohua Li 		     list_empty(&log->flushing_ios) &&
72904732f74SChristoph Hellwig 		     list_empty(&log->finished_ios)))
7300576b1c6SShaohua Li 			break;
7310576b1c6SShaohua Li 
73217036461SChristoph Hellwig 		md_wakeup_thread(log->rdev->mddev->thread);
73317036461SChristoph Hellwig 		wait_event_lock_irq(log->iounit_wait,
73417036461SChristoph Hellwig 				    r5l_reclaimable_space(log) > reclaimable,
73517036461SChristoph Hellwig 				    log->io_list_lock);
7360576b1c6SShaohua Li 	}
73717036461SChristoph Hellwig 
73817036461SChristoph Hellwig 	next_checkpoint = log->next_checkpoint;
73917036461SChristoph Hellwig 	next_cp_seq = log->next_cp_seq;
7400576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
7410576b1c6SShaohua Li 
74217036461SChristoph Hellwig 	BUG_ON(reclaimable < 0);
74317036461SChristoph Hellwig 	if (reclaimable == 0)
7440576b1c6SShaohua Li 		return;
7450576b1c6SShaohua Li 
7460576b1c6SShaohua Li 	/*
7470576b1c6SShaohua Li 	 * write_super will flush the cache of each raid disk. We must write the
7480576b1c6SShaohua Li 	 * super here, because the log area might be reused soon and we don't
7490576b1c6SShaohua Li 	 * want to confuse recovery
7500576b1c6SShaohua Li 	 */
7514b482044SShaohua Li 	r5l_write_super_and_discard_space(log, next_checkpoint);
7520576b1c6SShaohua Li 
7530576b1c6SShaohua Li 	mutex_lock(&log->io_mutex);
75417036461SChristoph Hellwig 	log->last_checkpoint = next_checkpoint;
75517036461SChristoph Hellwig 	log->last_cp_seq = next_cp_seq;
7560576b1c6SShaohua Li 	mutex_unlock(&log->io_mutex);
7570576b1c6SShaohua Li 
75817036461SChristoph Hellwig 	r5l_run_no_space_stripes(log);
7590576b1c6SShaohua Li }
7600576b1c6SShaohua Li 
7610576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread)
7620576b1c6SShaohua Li {
7630576b1c6SShaohua Li 	struct mddev *mddev = thread->mddev;
7640576b1c6SShaohua Li 	struct r5conf *conf = mddev->private;
7650576b1c6SShaohua Li 	struct r5l_log *log = conf->log;
7660576b1c6SShaohua Li 
7670576b1c6SShaohua Li 	if (!log)
7680576b1c6SShaohua Li 		return;
7690576b1c6SShaohua Li 	r5l_do_reclaim(log);
7700576b1c6SShaohua Li }
7710576b1c6SShaohua Li 
772f6bed0efSShaohua Li static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
773f6bed0efSShaohua Li {
7740576b1c6SShaohua Li 	unsigned long target;
7750576b1c6SShaohua Li 	unsigned long new = (unsigned long)space; /* overflow in theory */
7760576b1c6SShaohua Li 
7770576b1c6SShaohua Li 	do {
7780576b1c6SShaohua Li 		target = log->reclaim_target;
7790576b1c6SShaohua Li 		if (new < target)
7800576b1c6SShaohua Li 			return;
7810576b1c6SShaohua Li 	} while (cmpxchg(&log->reclaim_target, target, new) != target);
7820576b1c6SShaohua Li 	md_wakeup_thread(log->reclaim_thread);
783f6bed0efSShaohua Li }
784f6bed0efSShaohua Li 
785e6c033f7SShaohua Li void r5l_quiesce(struct r5l_log *log, int state)
786e6c033f7SShaohua Li {
7874b482044SShaohua Li 	struct mddev *mddev;
788e6c033f7SShaohua Li 	if (!log || state == 2)
789e6c033f7SShaohua Li 		return;
790e6c033f7SShaohua Li 	if (state == 0) {
7914b482044SShaohua Li 		log->in_teardown = 0;
792e6c033f7SShaohua Li 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
793e6c033f7SShaohua Li 					log->rdev->mddev, "reclaim");
794e6c033f7SShaohua Li 	} else if (state == 1) {
795e6c033f7SShaohua Li 		/*
796e6c033f7SShaohua Li 		 * at this point all stripes are finished, so every io_unit is
797e6c033f7SShaohua Li 		 * at least in IO_UNIT_STRIPE_END state
798e6c033f7SShaohua Li 		 */
7994b482044SShaohua Li 		log->in_teardown = 1;
8004b482044SShaohua Li 		/* make sure r5l_write_super_and_discard_space exits */
8014b482044SShaohua Li 		mddev = log->rdev->mddev;
8024b482044SShaohua Li 		wake_up(&mddev->sb_wait);
803e6c033f7SShaohua Li 		r5l_wake_reclaim(log, -1L);
804e6c033f7SShaohua Li 		md_unregister_thread(&log->reclaim_thread);
805e6c033f7SShaohua Li 		r5l_do_reclaim(log);
806e6c033f7SShaohua Li 	}
807e6c033f7SShaohua Li }
808e6c033f7SShaohua Li 
8096e74a9cfSShaohua Li bool r5l_log_disk_error(struct r5conf *conf)
8106e74a9cfSShaohua Li {
811f6b6ec5cSShaohua Li 	struct r5l_log *log;
812f6b6ec5cSShaohua Li 	bool ret;
8137dde2ad3SShaohua Li 	/* don't allow write if journal disk is missing */
814f6b6ec5cSShaohua Li 	rcu_read_lock();
815f6b6ec5cSShaohua Li 	log = rcu_dereference(conf->log);
816f6b6ec5cSShaohua Li 
817f6b6ec5cSShaohua Li 	if (!log)
818f6b6ec5cSShaohua Li 		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
819f6b6ec5cSShaohua Li 	else
820f6b6ec5cSShaohua Li 		ret = test_bit(Faulty, &log->rdev->flags);
821f6b6ec5cSShaohua Li 	rcu_read_unlock();
822f6b6ec5cSShaohua Li 	return ret;
8236e74a9cfSShaohua Li }
8246e74a9cfSShaohua Li 
825355810d1SShaohua Li struct r5l_recovery_ctx {
826355810d1SShaohua Li 	struct page *meta_page;		/* current meta */
827355810d1SShaohua Li 	sector_t meta_total_blocks;	/* total size of current meta and data */
828355810d1SShaohua Li 	sector_t pos;			/* recovery position */
829355810d1SShaohua Li 	u64 seq;			/* recovery position seq */
830355810d1SShaohua Li };
831355810d1SShaohua Li 
832355810d1SShaohua Li static int r5l_read_meta_block(struct r5l_log *log,
833355810d1SShaohua Li 			       struct r5l_recovery_ctx *ctx)
834355810d1SShaohua Li {
835355810d1SShaohua Li 	struct page *page = ctx->meta_page;
836355810d1SShaohua Li 	struct r5l_meta_block *mb;
837355810d1SShaohua Li 	u32 crc, stored_crc;
838355810d1SShaohua Li 
839355810d1SShaohua Li 	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
840355810d1SShaohua Li 		return -EIO;
841355810d1SShaohua Li 
842355810d1SShaohua Li 	mb = page_address(page);
843355810d1SShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
844355810d1SShaohua Li 	mb->checksum = 0;
845355810d1SShaohua Li 
846355810d1SShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
847355810d1SShaohua Li 	    le64_to_cpu(mb->seq) != ctx->seq ||
848355810d1SShaohua Li 	    mb->version != R5LOG_VERSION ||
849355810d1SShaohua Li 	    le64_to_cpu(mb->position) != ctx->pos)
850355810d1SShaohua Li 		return -EINVAL;
851355810d1SShaohua Li 
8525cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
853355810d1SShaohua Li 	if (stored_crc != crc)
854355810d1SShaohua Li 		return -EINVAL;
855355810d1SShaohua Li 
856355810d1SShaohua Li 	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
857355810d1SShaohua Li 		return -EINVAL;
858355810d1SShaohua Li 
859355810d1SShaohua Li 	ctx->meta_total_blocks = BLOCK_SECTORS;
860355810d1SShaohua Li 
861355810d1SShaohua Li 	return 0;
862355810d1SShaohua Li }
863355810d1SShaohua Li 
864355810d1SShaohua Li static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
865355810d1SShaohua Li 					 struct r5l_recovery_ctx *ctx,
866355810d1SShaohua Li 					 sector_t stripe_sect,
867355810d1SShaohua Li 					 int *offset, sector_t *log_offset)
868355810d1SShaohua Li {
869355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
870355810d1SShaohua Li 	struct stripe_head *sh;
871355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
872355810d1SShaohua Li 	int disk_index;
873355810d1SShaohua Li 
874355810d1SShaohua Li 	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
875355810d1SShaohua Li 	while (1) {
876355810d1SShaohua Li 		payload = page_address(ctx->meta_page) + *offset;
877355810d1SShaohua Li 
878355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
879355810d1SShaohua Li 			raid5_compute_sector(conf,
880355810d1SShaohua Li 					     le64_to_cpu(payload->location), 0,
881355810d1SShaohua Li 					     &disk_index, sh);
882355810d1SShaohua Li 
883355810d1SShaohua Li 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
884355810d1SShaohua Li 				     sh->dev[disk_index].page, READ, false);
885355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
886355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
887355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
888355810d1SShaohua Li 			ctx->meta_total_blocks += BLOCK_SECTORS;
889355810d1SShaohua Li 		} else {
890355810d1SShaohua Li 			disk_index = sh->pd_idx;
891355810d1SShaohua Li 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
892355810d1SShaohua Li 				     sh->dev[disk_index].page, READ, false);
893355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
894355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
895355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
896355810d1SShaohua Li 
897355810d1SShaohua Li 			if (sh->qd_idx >= 0) {
898355810d1SShaohua Li 				disk_index = sh->qd_idx;
899355810d1SShaohua Li 				sync_page_io(log->rdev,
900355810d1SShaohua Li 					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
901355810d1SShaohua Li 					     PAGE_SIZE, sh->dev[disk_index].page,
902355810d1SShaohua Li 					     READ, false);
903355810d1SShaohua Li 				sh->dev[disk_index].log_checksum =
904355810d1SShaohua Li 					le32_to_cpu(payload->checksum[1]);
905355810d1SShaohua Li 				set_bit(R5_Wantwrite,
906355810d1SShaohua Li 					&sh->dev[disk_index].flags);
907355810d1SShaohua Li 			}
908355810d1SShaohua Li 			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
909355810d1SShaohua Li 		}
910355810d1SShaohua Li 
911355810d1SShaohua Li 		*log_offset = r5l_ring_add(log, *log_offset,
912355810d1SShaohua Li 					   le32_to_cpu(payload->size));
913355810d1SShaohua Li 		*offset += sizeof(struct r5l_payload_data_parity) +
914355810d1SShaohua Li 			sizeof(__le32) *
915355810d1SShaohua Li 			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
916355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
917355810d1SShaohua Li 			break;
918355810d1SShaohua Li 	}
919355810d1SShaohua Li 
920355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
921355810d1SShaohua Li 		void *addr;
922355810d1SShaohua Li 		u32 checksum;
923355810d1SShaohua Li 
924355810d1SShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
925355810d1SShaohua Li 			continue;
926355810d1SShaohua Li 		addr = kmap_atomic(sh->dev[disk_index].page);
9275cb2fbd6SShaohua Li 		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
928355810d1SShaohua Li 		kunmap_atomic(addr);
929355810d1SShaohua Li 		if (checksum != sh->dev[disk_index].log_checksum)
930355810d1SShaohua Li 			goto error;
931355810d1SShaohua Li 	}
932355810d1SShaohua Li 
933355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
934355810d1SShaohua Li 		struct md_rdev *rdev, *rrdev;
935355810d1SShaohua Li 
936355810d1SShaohua Li 		if (!test_and_clear_bit(R5_Wantwrite,
937355810d1SShaohua Li 					&sh->dev[disk_index].flags))
938355810d1SShaohua Li 			continue;
939355810d1SShaohua Li 
940355810d1SShaohua Li 		/* in case device is broken */
941355810d1SShaohua Li 		rdev = rcu_dereference(conf->disks[disk_index].rdev);
942355810d1SShaohua Li 		if (rdev)
943355810d1SShaohua Li 			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
944355810d1SShaohua Li 				     sh->dev[disk_index].page, WRITE, false);
945355810d1SShaohua Li 		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
946355810d1SShaohua Li 		if (rrdev)
947355810d1SShaohua Li 			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
948355810d1SShaohua Li 				     sh->dev[disk_index].page, WRITE, false);
949355810d1SShaohua Li 	}
950355810d1SShaohua Li 	raid5_release_stripe(sh);
951355810d1SShaohua Li 	return 0;
952355810d1SShaohua Li 
953355810d1SShaohua Li error:
954355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++)
955355810d1SShaohua Li 		sh->dev[disk_index].flags = 0;
956355810d1SShaohua Li 	raid5_release_stripe(sh);
957355810d1SShaohua Li 	return -EINVAL;
958355810d1SShaohua Li }
959355810d1SShaohua Li 
960355810d1SShaohua Li static int r5l_recovery_flush_one_meta(struct r5l_log *log,
961355810d1SShaohua Li 				       struct r5l_recovery_ctx *ctx)
962355810d1SShaohua Li {
963355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
964355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
965355810d1SShaohua Li 	struct r5l_meta_block *mb;
966355810d1SShaohua Li 	int offset;
967355810d1SShaohua Li 	sector_t log_offset;
968355810d1SShaohua Li 	sector_t stripe_sector;
969355810d1SShaohua Li 
970355810d1SShaohua Li 	mb = page_address(ctx->meta_page);
971355810d1SShaohua Li 	offset = sizeof(struct r5l_meta_block);
972355810d1SShaohua Li 	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
973355810d1SShaohua Li 
974355810d1SShaohua Li 	while (offset < le32_to_cpu(mb->meta_size)) {
975355810d1SShaohua Li 		int dd;
976355810d1SShaohua Li 
977355810d1SShaohua Li 		payload = (void *)mb + offset;
978355810d1SShaohua Li 		stripe_sector = raid5_compute_sector(conf,
979355810d1SShaohua Li 						     le64_to_cpu(payload->location), 0, &dd, NULL);
980355810d1SShaohua Li 		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
981355810d1SShaohua Li 						  &offset, &log_offset))
982355810d1SShaohua Li 			return -EINVAL;
983355810d1SShaohua Li 	}
984355810d1SShaohua Li 	return 0;
985355810d1SShaohua Li }
986355810d1SShaohua Li 
987355810d1SShaohua Li /* copy data/parity from log to raid disks */
988355810d1SShaohua Li static void r5l_recovery_flush_log(struct r5l_log *log,
989355810d1SShaohua Li 				   struct r5l_recovery_ctx *ctx)
990355810d1SShaohua Li {
991355810d1SShaohua Li 	while (1) {
992355810d1SShaohua Li 		if (r5l_read_meta_block(log, ctx))
993355810d1SShaohua Li 			return;
994355810d1SShaohua Li 		if (r5l_recovery_flush_one_meta(log, ctx))
995355810d1SShaohua Li 			return;
996355810d1SShaohua Li 		ctx->seq++;
997355810d1SShaohua Li 		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
998355810d1SShaohua Li 	}
999355810d1SShaohua Li }
1000355810d1SShaohua Li 
1001355810d1SShaohua Li static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1002355810d1SShaohua Li 					  u64 seq)
1003355810d1SShaohua Li {
1004355810d1SShaohua Li 	struct page *page;
1005355810d1SShaohua Li 	struct r5l_meta_block *mb;
1006355810d1SShaohua Li 	u32 crc;
1007355810d1SShaohua Li 
1008355810d1SShaohua Li 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1009355810d1SShaohua Li 	if (!page)
1010355810d1SShaohua Li 		return -ENOMEM;
1011355810d1SShaohua Li 	mb = page_address(page);
1012355810d1SShaohua Li 	mb->magic = cpu_to_le32(R5LOG_MAGIC);
1013355810d1SShaohua Li 	mb->version = R5LOG_VERSION;
1014355810d1SShaohua Li 	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1015355810d1SShaohua Li 	mb->seq = cpu_to_le64(seq);
1016355810d1SShaohua Li 	mb->position = cpu_to_le64(pos);
10175cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1018355810d1SShaohua Li 	mb->checksum = cpu_to_le32(crc);
1019355810d1SShaohua Li 
1020355810d1SShaohua Li 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
1021355810d1SShaohua Li 		__free_page(page);
1022355810d1SShaohua Li 		return -EIO;
1023355810d1SShaohua Li 	}
1024355810d1SShaohua Li 	__free_page(page);
1025355810d1SShaohua Li 	return 0;
1026355810d1SShaohua Li }
1027355810d1SShaohua Li 
1028f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log)
1029f6bed0efSShaohua Li {
1030355810d1SShaohua Li 	struct r5l_recovery_ctx ctx;
1031355810d1SShaohua Li 
1032355810d1SShaohua Li 	ctx.pos = log->last_checkpoint;
1033355810d1SShaohua Li 	ctx.seq = log->last_cp_seq;
1034355810d1SShaohua Li 	ctx.meta_page = alloc_page(GFP_KERNEL);
1035355810d1SShaohua Li 	if (!ctx.meta_page)
1036355810d1SShaohua Li 		return -ENOMEM;
1037355810d1SShaohua Li 
1038355810d1SShaohua Li 	r5l_recovery_flush_log(log, &ctx);
1039355810d1SShaohua Li 	__free_page(ctx.meta_page);
1040355810d1SShaohua Li 
1041355810d1SShaohua Li 	/*
1042355810d1SShaohua Li 	 * we did a recovery. Now ctx.pos points to an invalid meta block. The new
1043355810d1SShaohua Li 	 * log will start here, but we can't let the superblock point to the last
1044355810d1SShaohua Li 	 * valid meta block. The log might look like:
1045355810d1SShaohua Li 	 * | meta 1| meta 2| meta 3|
1046355810d1SShaohua Li 	 * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
1047355810d1SShaohua Li 	 * superblock points to meta 1 and we write a new valid meta 2n, then if a
1048355810d1SShaohua Li 	 * crash happens again, the new recovery will start from meta 1. Since meta
1049355810d1SShaohua Li 	 * 2n is valid now, recovery will think meta 3 is valid too, which is wrong.
1050355810d1SShaohua Li 	 * The solution is to create a new meta at meta 2's position with its seq ==
1051355810d1SShaohua Li 	 * meta 1's seq + 10 and let the superblock point to meta 2. The same
1052355810d1SShaohua Li 	 * recovery will not treat meta 3 as valid, because its seq doesn't match
1053355810d1SShaohua Li 	 */
1054355810d1SShaohua Li 	if (ctx.seq > log->last_cp_seq + 1) {
1055355810d1SShaohua Li 		int ret;
1056355810d1SShaohua Li 
1057355810d1SShaohua Li 		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1058355810d1SShaohua Li 		if (ret)
1059355810d1SShaohua Li 			return ret;
1060355810d1SShaohua Li 		log->seq = ctx.seq + 11;
1061355810d1SShaohua Li 		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1062355810d1SShaohua Li 		r5l_write_super(log, ctx.pos);
1063355810d1SShaohua Li 	} else {
1064355810d1SShaohua Li 		log->log_start = ctx.pos;
1065355810d1SShaohua Li 		log->seq = ctx.seq;
1066355810d1SShaohua Li 	}
1067f6bed0efSShaohua Li 	return 0;
1068f6bed0efSShaohua Li }
1069f6bed0efSShaohua Li 
1070f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp)
1071f6bed0efSShaohua Li {
1072f6bed0efSShaohua Li 	struct mddev *mddev = log->rdev->mddev;
1073f6bed0efSShaohua Li 
1074f6bed0efSShaohua Li 	log->rdev->journal_tail = cp;
1075f6bed0efSShaohua Li 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1076f6bed0efSShaohua Li }
1077f6bed0efSShaohua Li 
1078f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log)
1079f6bed0efSShaohua Li {
1080f6bed0efSShaohua Li 	struct md_rdev *rdev = log->rdev;
1081f6bed0efSShaohua Li 	struct page *page;
1082f6bed0efSShaohua Li 	struct r5l_meta_block *mb;
1083f6bed0efSShaohua Li 	sector_t cp = log->rdev->journal_tail;
1084f6bed0efSShaohua Li 	u32 stored_crc, expected_crc;
1085f6bed0efSShaohua Li 	bool create_super = false;
1086f6bed0efSShaohua Li 	int ret;
1087f6bed0efSShaohua Li 
1088f6bed0efSShaohua Li 	/* Make sure it's valid */
1089f6bed0efSShaohua Li 	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1090f6bed0efSShaohua Li 		cp = 0;
1091f6bed0efSShaohua Li 	page = alloc_page(GFP_KERNEL);
1092f6bed0efSShaohua Li 	if (!page)
1093f6bed0efSShaohua Li 		return -ENOMEM;
1094f6bed0efSShaohua Li 
1095f6bed0efSShaohua Li 	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1096f6bed0efSShaohua Li 		ret = -EIO;
1097f6bed0efSShaohua Li 		goto ioerr;
1098f6bed0efSShaohua Li 	}
1099f6bed0efSShaohua Li 	mb = page_address(page);
1100f6bed0efSShaohua Li 
1101f6bed0efSShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1102f6bed0efSShaohua Li 	    mb->version != R5LOG_VERSION) {
1103f6bed0efSShaohua Li 		create_super = true;
1104f6bed0efSShaohua Li 		goto create;
1105f6bed0efSShaohua Li 	}
1106f6bed0efSShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
1107f6bed0efSShaohua Li 	mb->checksum = 0;
11085cb2fbd6SShaohua Li 	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1109f6bed0efSShaohua Li 	if (stored_crc != expected_crc) {
1110f6bed0efSShaohua Li 		create_super = true;
1111f6bed0efSShaohua Li 		goto create;
1112f6bed0efSShaohua Li 	}
1113f6bed0efSShaohua Li 	if (le64_to_cpu(mb->position) != cp) {
1114f6bed0efSShaohua Li 		create_super = true;
1115f6bed0efSShaohua Li 		goto create;
1116f6bed0efSShaohua Li 	}
1117f6bed0efSShaohua Li create:
1118f6bed0efSShaohua Li 	if (create_super) {
1119f6bed0efSShaohua Li 		log->last_cp_seq = prandom_u32();
1120f6bed0efSShaohua Li 		cp = 0;
1121f6bed0efSShaohua Li 		/*
1122f6bed0efSShaohua Li 		 * Make sure the super points to the correct address. The log might
1123f6bed0efSShaohua Li 		 * get data very soon. If the super doesn't have the correct log
1124f6bed0efSShaohua Li 		 * tail address, recovery can't find the log
1125f6bed0efSShaohua Li 		 */
1126f6bed0efSShaohua Li 		r5l_write_super(log, cp);
1127f6bed0efSShaohua Li 	} else
1128f6bed0efSShaohua Li 		log->last_cp_seq = le64_to_cpu(mb->seq);
1129f6bed0efSShaohua Li 
1130f6bed0efSShaohua Li 	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
11310576b1c6SShaohua Li 	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
11320576b1c6SShaohua Li 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
11330576b1c6SShaohua Li 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1134f6bed0efSShaohua Li 	log->last_checkpoint = cp;
1135f6bed0efSShaohua Li 
1136f6bed0efSShaohua Li 	__free_page(page);
1137f6bed0efSShaohua Li 
1138f6bed0efSShaohua Li 	return r5l_recovery_log(log);
1139f6bed0efSShaohua Li ioerr:
1140f6bed0efSShaohua Li 	__free_page(page);
1141f6bed0efSShaohua Li 	return ret;
1142f6bed0efSShaohua Li }
1143f6bed0efSShaohua Li 
1144f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1145f6bed0efSShaohua Li {
1146f6bed0efSShaohua Li 	struct r5l_log *log;
1147f6bed0efSShaohua Li 
1148f6bed0efSShaohua Li 	if (PAGE_SIZE != 4096)
1149f6bed0efSShaohua Li 		return -EINVAL;
1150f6bed0efSShaohua Li 	log = kzalloc(sizeof(*log), GFP_KERNEL);
1151f6bed0efSShaohua Li 	if (!log)
1152f6bed0efSShaohua Li 		return -ENOMEM;
1153f6bed0efSShaohua Li 	log->rdev = rdev;
1154f6bed0efSShaohua Li 
115556fef7c6SChristoph Hellwig 	log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
115656fef7c6SChristoph Hellwig 
11575cb2fbd6SShaohua Li 	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1158f6bed0efSShaohua Li 				       sizeof(rdev->mddev->uuid));
1159f6bed0efSShaohua Li 
1160f6bed0efSShaohua Li 	mutex_init(&log->io_mutex);
1161f6bed0efSShaohua Li 
1162f6bed0efSShaohua Li 	spin_lock_init(&log->io_list_lock);
1163f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->running_ios);
11640576b1c6SShaohua Li 	INIT_LIST_HEAD(&log->io_end_ios);
1165a8c34f91SShaohua Li 	INIT_LIST_HEAD(&log->flushing_ios);
116604732f74SChristoph Hellwig 	INIT_LIST_HEAD(&log->finished_ios);
1167a8c34f91SShaohua Li 	bio_init(&log->flush_bio);
1168f6bed0efSShaohua Li 
1169f6bed0efSShaohua Li 	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1170f6bed0efSShaohua Li 	if (!log->io_kc)
1171f6bed0efSShaohua Li 		goto io_kc;
1172f6bed0efSShaohua Li 
1173c38d29b3SChristoph Hellwig 	log->bs = bioset_create(R5L_POOL_SIZE, 0);
1174c38d29b3SChristoph Hellwig 	if (!log->bs)
1175c38d29b3SChristoph Hellwig 		goto io_bs;
1176c38d29b3SChristoph Hellwig 
1177*e8deb638SChristoph Hellwig 	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
1178*e8deb638SChristoph Hellwig 	if (!log->meta_pool)
1179*e8deb638SChristoph Hellwig 		goto out_mempool;
1180*e8deb638SChristoph Hellwig 
11810576b1c6SShaohua Li 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
11820576b1c6SShaohua Li 						 log->rdev->mddev, "reclaim");
11830576b1c6SShaohua Li 	if (!log->reclaim_thread)
11840576b1c6SShaohua Li 		goto reclaim_thread;
11850fd22b45SShaohua Li 	init_waitqueue_head(&log->iounit_wait);
11860576b1c6SShaohua Li 
1187f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->no_space_stripes);
1188f6bed0efSShaohua Li 	spin_lock_init(&log->no_space_stripes_lock);
1189f6bed0efSShaohua Li 
1190f6bed0efSShaohua Li 	if (r5l_load_log(log))
1191f6bed0efSShaohua Li 		goto error;
1192f6bed0efSShaohua Li 
1193f6b6ec5cSShaohua Li 	rcu_assign_pointer(conf->log, log);
1194f6bed0efSShaohua Li 	return 0;
1195*e8deb638SChristoph Hellwig 
1196f6bed0efSShaohua Li error:
11970576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
11980576b1c6SShaohua Li reclaim_thread:
1199*e8deb638SChristoph Hellwig 	mempool_destroy(log->meta_pool);
1200*e8deb638SChristoph Hellwig out_mempool:
1201c38d29b3SChristoph Hellwig 	bioset_free(log->bs);
1202c38d29b3SChristoph Hellwig io_bs:
1203f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1204f6bed0efSShaohua Li io_kc:
1205f6bed0efSShaohua Li 	kfree(log);
1206f6bed0efSShaohua Li 	return -EINVAL;
1207f6bed0efSShaohua Li }
1208f6bed0efSShaohua Li 
1209f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log)
1210f6bed0efSShaohua Li {
12110576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
1212*e8deb638SChristoph Hellwig 	mempool_destroy(log->meta_pool);
1213c38d29b3SChristoph Hellwig 	bioset_free(log->bs);
1214f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1215f6bed0efSShaohua Li 	kfree(log);
1216f6bed0efSShaohua Li }
1217