xref: /linux/drivers/md/raid5-cache.c (revision 6e74a9cfb5a55b0a4214809321b67d7065e55555)
1f6bed0efSShaohua Li /*
2f6bed0efSShaohua Li  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3f6bed0efSShaohua Li  *
4f6bed0efSShaohua Li  * This program is free software; you can redistribute it and/or modify it
5f6bed0efSShaohua Li  * under the terms and conditions of the GNU General Public License,
6f6bed0efSShaohua Li  * version 2, as published by the Free Software Foundation.
7f6bed0efSShaohua Li  *
8f6bed0efSShaohua Li  * This program is distributed in the hope it will be useful, but WITHOUT
9f6bed0efSShaohua Li  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10f6bed0efSShaohua Li  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11f6bed0efSShaohua Li  * more details.
12f6bed0efSShaohua Li  *
13f6bed0efSShaohua Li  */
14f6bed0efSShaohua Li #include <linux/kernel.h>
15f6bed0efSShaohua Li #include <linux/wait.h>
16f6bed0efSShaohua Li #include <linux/blkdev.h>
17f6bed0efSShaohua Li #include <linux/slab.h>
18f6bed0efSShaohua Li #include <linux/raid/md_p.h>
195cb2fbd6SShaohua Li #include <linux/crc32c.h>
20f6bed0efSShaohua Li #include <linux/random.h>
21f6bed0efSShaohua Li #include "md.h"
22f6bed0efSShaohua Li #include "raid5.h"
23f6bed0efSShaohua Li 
24f6bed0efSShaohua Li /*
25f6bed0efSShaohua Li  * metadata/data are stored on disk in 4k units (blocks) regardless of the
26f6bed0efSShaohua Li  * underlying hardware sector size. This only works with PAGE_SIZE == 4096
27f6bed0efSShaohua Li  */
28f6bed0efSShaohua Li #define BLOCK_SECTORS (8)
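/* 8 sectors of 512 bytes = 4096 bytes, so one log block matches one 4k page */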
29f6bed0efSShaohua Li 
300576b1c6SShaohua Li /*
310576b1c6SShaohua Li  * reclaim runs once reclaimable space reaches 1/4 of the disk size or 10G,
320576b1c6SShaohua Li  * whichever is smaller. This prevents recovery from scanning a very long log
330576b1c6SShaohua Li  */
340576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
350576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
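
/*
 * RECLAIM_MAX_FREE_SPACE is 10GiB expressed in 512-byte sectors, and the shift
 * of 2 selects 1/4 of the device size; r5l_load_log() below picks the smaller
 * of the two as log->max_free_space.
 */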
360576b1c6SShaohua Li 
37f6bed0efSShaohua Li struct r5l_log {
38f6bed0efSShaohua Li 	struct md_rdev *rdev;
39f6bed0efSShaohua Li 
40f6bed0efSShaohua Li 	u32 uuid_checksum;
41f6bed0efSShaohua Li 
42f6bed0efSShaohua Li 	sector_t device_size;		/* log device size, rounded down to
43f6bed0efSShaohua Li 					 * BLOCK_SECTORS */
440576b1c6SShaohua Li 	sector_t max_free_space;	/* reclaim runs once reclaimable space
450576b1c6SShaohua Li 					 * reaches this size */
46f6bed0efSShaohua Li 
47f6bed0efSShaohua Li 	sector_t last_checkpoint;	/* log tail. where recovery scan
48f6bed0efSShaohua Li 					 * starts from */
49f6bed0efSShaohua Li 	u64 last_cp_seq;		/* log tail sequence */
50f6bed0efSShaohua Li 
51f6bed0efSShaohua Li 	sector_t log_start;		/* log head. where new data appends */
52f6bed0efSShaohua Li 	u64 seq;			/* log head sequence */
53f6bed0efSShaohua Li 
5417036461SChristoph Hellwig 	sector_t next_checkpoint;
5517036461SChristoph Hellwig 	u64 next_cp_seq;
5617036461SChristoph Hellwig 
57f6bed0efSShaohua Li 	struct mutex io_mutex;
58f6bed0efSShaohua Li 	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
59f6bed0efSShaohua Li 
60f6bed0efSShaohua Li 	spinlock_t io_list_lock;
61f6bed0efSShaohua Li 	struct list_head running_ios;	/* io_units which are still running,
62f6bed0efSShaohua Li 					 * and have not yet been completely
63f6bed0efSShaohua Li 					 * written to the log */
64f6bed0efSShaohua Li 	struct list_head io_end_ios;	/* io_units which have been completely
65f6bed0efSShaohua Li 					 * written to the log but not yet written
66f6bed0efSShaohua Li 					 * to the RAID */
67a8c34f91SShaohua Li 	struct list_head flushing_ios;	/* io_units which are waiting for log
68a8c34f91SShaohua Li 					 * cache flush */
6904732f74SChristoph Hellwig 	struct list_head finished_ios;	/* io_units which settle down in log disk */
70a8c34f91SShaohua Li 	struct bio flush_bio;
71f6bed0efSShaohua Li 
72f6bed0efSShaohua Li 	struct kmem_cache *io_kc;
73f6bed0efSShaohua Li 
740576b1c6SShaohua Li 	struct md_thread *reclaim_thread;
750576b1c6SShaohua Li 	unsigned long reclaim_target;	/* amount of space that needs to be
760576b1c6SShaohua Li 					 * reclaimed. if it's 0, reclaim the space
770576b1c6SShaohua Li 					 * used by io_units which are already in
780576b1c6SShaohua Li 					 * IO_UNIT_STRIPE_END state (i.e. reclaim
790576b1c6SShaohua Li 					 * doesn't wait for a specific io_unit
800576b1c6SShaohua Li 					 * to switch to IO_UNIT_STRIPE_END
810576b1c6SShaohua Li 					 * state) */
820fd22b45SShaohua Li 	wait_queue_head_t iounit_wait;
830576b1c6SShaohua Li 
84f6bed0efSShaohua Li 	struct list_head no_space_stripes; /* pending stripes, log has no space */
85f6bed0efSShaohua Li 	spinlock_t no_space_stripes_lock;
8656fef7c6SChristoph Hellwig 
8756fef7c6SChristoph Hellwig 	bool need_cache_flush;
884b482044SShaohua Li 	bool in_teardown;
89f6bed0efSShaohua Li };
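
/*
 * An io_unit moves through the lists above in order: running_ios while its log
 * write is in flight, io_end_ios once the write has completed (only used when
 * a cache flush is needed), flushing_ios while the log-device cache flush is
 * pending, and finished_ios until all of its stripes reach IO_UNIT_STRIPE_END
 * and the unit is freed.
 */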
90f6bed0efSShaohua Li 
91f6bed0efSShaohua Li /*
92f6bed0efSShaohua Li  * an IO range starts at a meta data block and ends at the next meta data
93f6bed0efSShaohua Li  * block. The io_unit's meta data block tracks the data/parity that follows it.
94f6bed0efSShaohua Li  * An io_unit is written to the log disk with normal writes; as we always flush
95f6bed0efSShaohua Li  * the log disk first and only then start moving data to the raid disks, there
96f6bed0efSShaohua Li  * is no requirement to write the io_unit with FLUSH/FUA
97f6bed0efSShaohua Li  */
98f6bed0efSShaohua Li struct r5l_io_unit {
99f6bed0efSShaohua Li 	struct r5l_log *log;
100f6bed0efSShaohua Li 
101f6bed0efSShaohua Li 	struct page *meta_page;	/* store meta block */
102f6bed0efSShaohua Li 	int meta_offset;	/* current offset in meta_page */
103f6bed0efSShaohua Li 
104f6bed0efSShaohua Li 	struct bio *current_bio;/* current_bio accepting new data */
105f6bed0efSShaohua Li 
106f6bed0efSShaohua Li 	atomic_t pending_stripe;/* how many stripes not flushed to raid */
107f6bed0efSShaohua Li 	u64 seq;		/* seq number of the metablock */
108f6bed0efSShaohua Li 	sector_t log_start;	/* where the io_unit starts */
109f6bed0efSShaohua Li 	sector_t log_end;	/* where the io_unit ends */
110f6bed0efSShaohua Li 	struct list_head log_sibling; /* log->running_ios */
111f6bed0efSShaohua Li 	struct list_head stripe_list; /* stripes added to the io_unit */
112f6bed0efSShaohua Li 
113f6bed0efSShaohua Li 	int state;
1146143e2ceSChristoph Hellwig 	bool need_split_bio;
115f6bed0efSShaohua Li };
116f6bed0efSShaohua Li 
117f6bed0efSShaohua Li /* r5l_io_unit state */
118f6bed0efSShaohua Li enum r5l_io_unit_state {
119f6bed0efSShaohua Li 	IO_UNIT_RUNNING = 0,	/* accepting new IO */
120f6bed0efSShaohua Li 	IO_UNIT_IO_START = 1,	/* io_unit bio has started writing to the log,
121f6bed0efSShaohua Li 				 * no longer accepting new bios */
122f6bed0efSShaohua Li 	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to the log */
123a8c34f91SShaohua Li 	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
124f6bed0efSShaohua Li };
125f6bed0efSShaohua Li 
126f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
127f6bed0efSShaohua Li {
128f6bed0efSShaohua Li 	start += inc;
129f6bed0efSShaohua Li 	if (start >= log->device_size)
130f6bed0efSShaohua Li 		start = start - log->device_size;
131f6bed0efSShaohua Li 	return start;
132f6bed0efSShaohua Li }
133f6bed0efSShaohua Li 
134f6bed0efSShaohua Li static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
135f6bed0efSShaohua Li 				  sector_t end)
136f6bed0efSShaohua Li {
137f6bed0efSShaohua Li 	if (end >= start)
138f6bed0efSShaohua Li 		return end - start;
139f6bed0efSShaohua Li 	else
140f6bed0efSShaohua Li 		return end + log->device_size - start;
141f6bed0efSShaohua Li }
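
/*
 * Example of the ring helpers, assuming a 1000-sector log device:
 * r5l_ring_add(log, 992, 16) wraps around to sector 8, and
 * r5l_ring_distance(log, 992, 8) returns 16, i.e. the distance is always
 * measured forward from start to end across the wrap point.
 */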
142f6bed0efSShaohua Li 
143f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
144f6bed0efSShaohua Li {
145f6bed0efSShaohua Li 	sector_t used_size;
146f6bed0efSShaohua Li 
147f6bed0efSShaohua Li 	used_size = r5l_ring_distance(log, log->last_checkpoint,
148f6bed0efSShaohua Li 					log->log_start);
149f6bed0efSShaohua Li 
150f6bed0efSShaohua Li 	return log->device_size > used_size + size;
151f6bed0efSShaohua Li }
152f6bed0efSShaohua Li 
153f6bed0efSShaohua Li static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
154f6bed0efSShaohua Li {
155f6bed0efSShaohua Li 	__free_page(io->meta_page);
156f6bed0efSShaohua Li 	kmem_cache_free(log->io_kc, io);
157f6bed0efSShaohua Li }
158f6bed0efSShaohua Li 
159f6bed0efSShaohua Li static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
160f6bed0efSShaohua Li 				  enum r5l_io_unit_state state)
161f6bed0efSShaohua Li {
162f6bed0efSShaohua Li 	struct r5l_io_unit *io;
163f6bed0efSShaohua Li 
164f6bed0efSShaohua Li 	while (!list_empty(from)) {
165f6bed0efSShaohua Li 		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
166f6bed0efSShaohua Li 		/* don't change list order */
167f6bed0efSShaohua Li 		if (io->state >= state)
168f6bed0efSShaohua Li 			list_move_tail(&io->log_sibling, to);
169f6bed0efSShaohua Li 		else
170f6bed0efSShaohua Li 			break;
171f6bed0efSShaohua Li 	}
172f6bed0efSShaohua Li }
173f6bed0efSShaohua Li 
174f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
175f6bed0efSShaohua Li 				    enum r5l_io_unit_state state)
176f6bed0efSShaohua Li {
177f6bed0efSShaohua Li 	if (WARN_ON(io->state >= state))
178f6bed0efSShaohua Li 		return;
179f6bed0efSShaohua Li 	io->state = state;
180f6bed0efSShaohua Li }
181f6bed0efSShaohua Li 
182d8858f43SChristoph Hellwig static void r5l_io_run_stripes(struct r5l_io_unit *io)
183d8858f43SChristoph Hellwig {
184d8858f43SChristoph Hellwig 	struct stripe_head *sh, *next;
185d8858f43SChristoph Hellwig 
186d8858f43SChristoph Hellwig 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
187d8858f43SChristoph Hellwig 		list_del_init(&sh->log_list);
188d8858f43SChristoph Hellwig 		set_bit(STRIPE_HANDLE, &sh->state);
189d8858f43SChristoph Hellwig 		raid5_release_stripe(sh);
190d8858f43SChristoph Hellwig 	}
191d8858f43SChristoph Hellwig }
192d8858f43SChristoph Hellwig 
19356fef7c6SChristoph Hellwig static void r5l_log_run_stripes(struct r5l_log *log)
19456fef7c6SChristoph Hellwig {
19556fef7c6SChristoph Hellwig 	struct r5l_io_unit *io, *next;
19656fef7c6SChristoph Hellwig 
19756fef7c6SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
19856fef7c6SChristoph Hellwig 
19956fef7c6SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
20056fef7c6SChristoph Hellwig 		/* don't change list order */
20156fef7c6SChristoph Hellwig 		if (io->state < IO_UNIT_IO_END)
20256fef7c6SChristoph Hellwig 			break;
20356fef7c6SChristoph Hellwig 
20456fef7c6SChristoph Hellwig 		list_move_tail(&io->log_sibling, &log->finished_ios);
20556fef7c6SChristoph Hellwig 		r5l_io_run_stripes(io);
20656fef7c6SChristoph Hellwig 	}
20756fef7c6SChristoph Hellwig }
20856fef7c6SChristoph Hellwig 
209f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio)
210f6bed0efSShaohua Li {
211f6bed0efSShaohua Li 	struct r5l_io_unit *io = bio->bi_private;
212f6bed0efSShaohua Li 	struct r5l_log *log = io->log;
213509ffec7SChristoph Hellwig 	unsigned long flags;
214f6bed0efSShaohua Li 
215*6e74a9cfSShaohua Li 	if (bio->bi_error)
216*6e74a9cfSShaohua Li 		md_error(log->rdev->mddev, log->rdev);
217*6e74a9cfSShaohua Li 
218f6bed0efSShaohua Li 	bio_put(bio);
219f6bed0efSShaohua Li 
220509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
221509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
22256fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
223509ffec7SChristoph Hellwig 		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
224509ffec7SChristoph Hellwig 				      IO_UNIT_IO_END);
22556fef7c6SChristoph Hellwig 	else
22656fef7c6SChristoph Hellwig 		r5l_log_run_stripes(log);
227509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
228509ffec7SChristoph Hellwig 
22956fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
230f6bed0efSShaohua Li 		md_wakeup_thread(log->rdev->mddev->thread);
231f6bed0efSShaohua Li }
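
/*
 * Log write completion takes one of two paths: if the log device has a
 * volatile write cache (need_cache_flush), finished io_units are parked on
 * io_end_ios and the md thread is woken so it can issue a cache flush through
 * r5l_flush_stripe_to_raid(); otherwise the stripes can be handed back to the
 * state machine right away via r5l_log_run_stripes().
 */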
232f6bed0efSShaohua Li 
233f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log)
234f6bed0efSShaohua Li {
235f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
236f6bed0efSShaohua Li 	struct r5l_meta_block *block;
237509ffec7SChristoph Hellwig 	unsigned long flags;
238f6bed0efSShaohua Li 	u32 crc;
239f6bed0efSShaohua Li 
240f6bed0efSShaohua Li 	if (!io)
241f6bed0efSShaohua Li 		return;
242f6bed0efSShaohua Li 
243f6bed0efSShaohua Li 	block = page_address(io->meta_page);
244f6bed0efSShaohua Li 	block->meta_size = cpu_to_le32(io->meta_offset);
2455cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
246f6bed0efSShaohua Li 	block->checksum = cpu_to_le32(crc);
247f6bed0efSShaohua Li 
248f6bed0efSShaohua Li 	log->current_io = NULL;
249509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
250509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
251509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
252f6bed0efSShaohua Li 
2536143e2ceSChristoph Hellwig 	submit_bio(WRITE, io->current_bio);
254f6bed0efSShaohua Li }
255f6bed0efSShaohua Li 
2566143e2ceSChristoph Hellwig static struct bio *r5l_bio_alloc(struct r5l_log *log)
257b349feb3SChristoph Hellwig {
258b349feb3SChristoph Hellwig 	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
259b349feb3SChristoph Hellwig 
260b349feb3SChristoph Hellwig 	bio->bi_rw = WRITE;
261b349feb3SChristoph Hellwig 	bio->bi_bdev = log->rdev->bdev;
2621e932a37SChristoph Hellwig 	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
263b349feb3SChristoph Hellwig 
264b349feb3SChristoph Hellwig 	return bio;
265b349feb3SChristoph Hellwig }
266b349feb3SChristoph Hellwig 
267c1b99198SChristoph Hellwig static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
268c1b99198SChristoph Hellwig {
269c1b99198SChristoph Hellwig 	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
270c1b99198SChristoph Hellwig 
271c1b99198SChristoph Hellwig 	/*
272c1b99198SChristoph Hellwig 	 * If we filled up the log device start from the beginning again,
273c1b99198SChristoph Hellwig 	 * which will require a new bio.
274c1b99198SChristoph Hellwig 	 *
275c1b99198SChristoph Hellwig 	 * Note: for this to work properly the log size needs to be a multiple
276c1b99198SChristoph Hellwig 	 * of BLOCK_SECTORS.
277c1b99198SChristoph Hellwig 	 */
278c1b99198SChristoph Hellwig 	if (log->log_start == 0)
2796143e2ceSChristoph Hellwig 		io->need_split_bio = true;
280c1b99198SChristoph Hellwig 
281c1b99198SChristoph Hellwig 	io->log_end = log->log_start;
282c1b99198SChristoph Hellwig }
283c1b99198SChristoph Hellwig 
284f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
285f6bed0efSShaohua Li {
286f6bed0efSShaohua Li 	struct r5l_io_unit *io;
287f6bed0efSShaohua Li 	struct r5l_meta_block *block;
288f6bed0efSShaohua Li 
28951039cd0SChristoph Hellwig 	/* We can't handle memory allocation failure so far */
29051039cd0SChristoph Hellwig 	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
29151039cd0SChristoph Hellwig 	io->log = log;
29251039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->log_sibling);
29351039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->stripe_list);
29451039cd0SChristoph Hellwig 	io->state = IO_UNIT_RUNNING;
295f6bed0efSShaohua Li 
29651039cd0SChristoph Hellwig 	io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
297f6bed0efSShaohua Li 	block = page_address(io->meta_page);
298f6bed0efSShaohua Li 	block->magic = cpu_to_le32(R5LOG_MAGIC);
299f6bed0efSShaohua Li 	block->version = R5LOG_VERSION;
300f6bed0efSShaohua Li 	block->seq = cpu_to_le64(log->seq);
301f6bed0efSShaohua Li 	block->position = cpu_to_le64(log->log_start);
302f6bed0efSShaohua Li 
303f6bed0efSShaohua Li 	io->log_start = log->log_start;
304f6bed0efSShaohua Li 	io->meta_offset = sizeof(struct r5l_meta_block);
3052b8ef16eSChristoph Hellwig 	io->seq = log->seq++;
306f6bed0efSShaohua Li 
3076143e2ceSChristoph Hellwig 	io->current_bio = r5l_bio_alloc(log);
3086143e2ceSChristoph Hellwig 	io->current_bio->bi_end_io = r5l_log_endio;
3096143e2ceSChristoph Hellwig 	io->current_bio->bi_private = io;
310b349feb3SChristoph Hellwig 	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
311f6bed0efSShaohua Li 
312c1b99198SChristoph Hellwig 	r5_reserve_log_entry(log, io);
313f6bed0efSShaohua Li 
314f6bed0efSShaohua Li 	spin_lock_irq(&log->io_list_lock);
315f6bed0efSShaohua Li 	list_add_tail(&io->log_sibling, &log->running_ios);
316f6bed0efSShaohua Li 	spin_unlock_irq(&log->io_list_lock);
317f6bed0efSShaohua Li 
318f6bed0efSShaohua Li 	return io;
319f6bed0efSShaohua Li }
320f6bed0efSShaohua Li 
321f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
322f6bed0efSShaohua Li {
32322581f58SChristoph Hellwig 	if (log->current_io &&
32422581f58SChristoph Hellwig 	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
325f6bed0efSShaohua Li 		r5l_submit_current_io(log);
326f6bed0efSShaohua Li 
32722581f58SChristoph Hellwig 	if (!log->current_io)
328f6bed0efSShaohua Li 		log->current_io = r5l_new_meta(log);
329f6bed0efSShaohua Li 	return 0;
330f6bed0efSShaohua Li }
331f6bed0efSShaohua Li 
332f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
333f6bed0efSShaohua Li 				    sector_t location,
334f6bed0efSShaohua Li 				    u32 checksum1, u32 checksum2,
335f6bed0efSShaohua Li 				    bool checksum2_valid)
336f6bed0efSShaohua Li {
337f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
338f6bed0efSShaohua Li 	struct r5l_payload_data_parity *payload;
339f6bed0efSShaohua Li 
340f6bed0efSShaohua Li 	payload = page_address(io->meta_page) + io->meta_offset;
341f6bed0efSShaohua Li 	payload->header.type = cpu_to_le16(type);
342f6bed0efSShaohua Li 	payload->header.flags = cpu_to_le16(0);
343f6bed0efSShaohua Li 	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
344f6bed0efSShaohua Li 				    (PAGE_SHIFT - 9));
345f6bed0efSShaohua Li 	payload->location = cpu_to_le64(location);
346f6bed0efSShaohua Li 	payload->checksum[0] = cpu_to_le32(checksum1);
347f6bed0efSShaohua Li 	if (checksum2_valid)
348f6bed0efSShaohua Li 		payload->checksum[1] = cpu_to_le32(checksum2);
349f6bed0efSShaohua Li 
350f6bed0efSShaohua Li 	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
351f6bed0efSShaohua Li 		sizeof(__le32) * (1 + !!checksum2_valid);
352f6bed0efSShaohua Li }
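
/*
 * The meta page therefore fills up as: the struct r5l_meta_block header,
 * followed by one r5l_payload_data_parity descriptor per data page (one
 * checksum) and one per parity set (one or two checksums), in the order the
 * pages are appended to the bio.
 */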
353f6bed0efSShaohua Li 
354f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
355f6bed0efSShaohua Li {
356f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
357f6bed0efSShaohua Li 
3586143e2ceSChristoph Hellwig 	if (io->need_split_bio) {
3596143e2ceSChristoph Hellwig 		struct bio *prev = io->current_bio;
360f6bed0efSShaohua Li 
3616143e2ceSChristoph Hellwig 		io->current_bio = r5l_bio_alloc(log);
3626143e2ceSChristoph Hellwig 		bio_chain(io->current_bio, prev);
3636143e2ceSChristoph Hellwig 
3646143e2ceSChristoph Hellwig 		submit_bio(WRITE, prev);
365f6bed0efSShaohua Li 	}
366f6bed0efSShaohua Li 
3676143e2ceSChristoph Hellwig 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
3686143e2ceSChristoph Hellwig 		BUG();
3696143e2ceSChristoph Hellwig 
370c1b99198SChristoph Hellwig 	r5_reserve_log_entry(log, io);
371f6bed0efSShaohua Li }
372f6bed0efSShaohua Li 
373f6bed0efSShaohua Li static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
374f6bed0efSShaohua Li 			   int data_pages, int parity_pages)
375f6bed0efSShaohua Li {
376f6bed0efSShaohua Li 	int i;
377f6bed0efSShaohua Li 	int meta_size;
378f6bed0efSShaohua Li 	struct r5l_io_unit *io;
379f6bed0efSShaohua Li 
380f6bed0efSShaohua Li 	meta_size =
381f6bed0efSShaohua Li 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
382f6bed0efSShaohua Li 		 * data_pages) +
383f6bed0efSShaohua Li 		sizeof(struct r5l_payload_data_parity) +
384f6bed0efSShaohua Li 		sizeof(__le32) * parity_pages;
385f6bed0efSShaohua Li 
386f6bed0efSShaohua Li 	r5l_get_meta(log, meta_size);
387f6bed0efSShaohua Li 	io = log->current_io;
388f6bed0efSShaohua Li 
389f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
390f6bed0efSShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
391f6bed0efSShaohua Li 			continue;
392f6bed0efSShaohua Li 		if (i == sh->pd_idx || i == sh->qd_idx)
393f6bed0efSShaohua Li 			continue;
394f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
395f6bed0efSShaohua Li 					raid5_compute_blocknr(sh, i, 0),
396f6bed0efSShaohua Li 					sh->dev[i].log_checksum, 0, false);
397f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[i].page);
398f6bed0efSShaohua Li 	}
399f6bed0efSShaohua Li 
400f6bed0efSShaohua Li 	if (sh->qd_idx >= 0) {
401f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
402f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
403f6bed0efSShaohua Li 					sh->dev[sh->qd_idx].log_checksum, true);
404f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
405f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
406f6bed0efSShaohua Li 	} else {
407f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
408f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
409f6bed0efSShaohua Li 					0, false);
410f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
411f6bed0efSShaohua Li 	}
412f6bed0efSShaohua Li 
413f6bed0efSShaohua Li 	list_add_tail(&sh->log_list, &io->stripe_list);
414f6bed0efSShaohua Li 	atomic_inc(&io->pending_stripe);
415f6bed0efSShaohua Li 	sh->log_io = io;
416f6bed0efSShaohua Li }
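
/*
 * As a rough example (4k pages assumed): logging a RAID6 stripe with four
 * dirty data blocks uses meta_size =
 *   4 * (sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) +
 *   sizeof(struct r5l_payload_data_parity) + 2 * sizeof(__le32),
 * which stays far below the PAGE_SIZE limit checked in r5l_write_stripe().
 */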
417f6bed0efSShaohua Li 
418509ffec7SChristoph Hellwig static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
419f6bed0efSShaohua Li /*
420f6bed0efSShaohua Li  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
421f6bed0efSShaohua Li  * data from log to raid disks), so we shouldn't wait for reclaim here
422f6bed0efSShaohua Li  */
423f6bed0efSShaohua Li int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
424f6bed0efSShaohua Li {
425f6bed0efSShaohua Li 	int write_disks = 0;
426f6bed0efSShaohua Li 	int data_pages, parity_pages;
427f6bed0efSShaohua Li 	int meta_size;
428f6bed0efSShaohua Li 	int reserve;
429f6bed0efSShaohua Li 	int i;
430f6bed0efSShaohua Li 
431f6bed0efSShaohua Li 	if (!log)
432f6bed0efSShaohua Li 		return -EAGAIN;
433f6bed0efSShaohua Li 	/* Don't support stripe batch */
434f6bed0efSShaohua Li 	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
435f6bed0efSShaohua Li 	    test_bit(STRIPE_SYNCING, &sh->state)) {
436f6bed0efSShaohua Li 		/* the stripe is written to log, we start writing it to raid */
437f6bed0efSShaohua Li 		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
438f6bed0efSShaohua Li 		return -EAGAIN;
439f6bed0efSShaohua Li 	}
440f6bed0efSShaohua Li 
441f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
442f6bed0efSShaohua Li 		void *addr;
443f6bed0efSShaohua Li 
444f6bed0efSShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
445f6bed0efSShaohua Li 			continue;
446f6bed0efSShaohua Li 		write_disks++;
447f6bed0efSShaohua Li 		/* checksum is already calculated in last run */
448f6bed0efSShaohua Li 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
449f6bed0efSShaohua Li 			continue;
450f6bed0efSShaohua Li 		addr = kmap_atomic(sh->dev[i].page);
4515cb2fbd6SShaohua Li 		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
452f6bed0efSShaohua Li 						    addr, PAGE_SIZE);
453f6bed0efSShaohua Li 		kunmap_atomic(addr);
454f6bed0efSShaohua Li 	}
455f6bed0efSShaohua Li 	parity_pages = 1 + !!(sh->qd_idx >= 0);
456f6bed0efSShaohua Li 	data_pages = write_disks - parity_pages;
457f6bed0efSShaohua Li 
458f6bed0efSShaohua Li 	meta_size =
459f6bed0efSShaohua Li 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
460f6bed0efSShaohua Li 		 * data_pages) +
461f6bed0efSShaohua Li 		sizeof(struct r5l_payload_data_parity) +
462f6bed0efSShaohua Li 		sizeof(__le32) * parity_pages;
463f6bed0efSShaohua Li 	/* Doesn't work with very big raid array */
464f6bed0efSShaohua Li 	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
465f6bed0efSShaohua Li 		return -EINVAL;
466f6bed0efSShaohua Li 
467f6bed0efSShaohua Li 	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
468253f9fd4SShaohua Li 	/*
469253f9fd4SShaohua Li 	 * The stripe must enter state machine again to finish the write, so
470253f9fd4SShaohua Li 	 * don't delay.
471253f9fd4SShaohua Li 	 */
472253f9fd4SShaohua Li 	clear_bit(STRIPE_DELAYED, &sh->state);
473f6bed0efSShaohua Li 	atomic_inc(&sh->count);
474f6bed0efSShaohua Li 
475f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
476f6bed0efSShaohua Li 	/* meta + data */
477f6bed0efSShaohua Li 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
478f6bed0efSShaohua Li 	if (r5l_has_free_space(log, reserve))
479f6bed0efSShaohua Li 		r5l_log_stripe(log, sh, data_pages, parity_pages);
480f6bed0efSShaohua Li 	else {
481f6bed0efSShaohua Li 		spin_lock(&log->no_space_stripes_lock);
482f6bed0efSShaohua Li 		list_add_tail(&sh->log_list, &log->no_space_stripes);
483f6bed0efSShaohua Li 		spin_unlock(&log->no_space_stripes_lock);
484f6bed0efSShaohua Li 
485f6bed0efSShaohua Li 		r5l_wake_reclaim(log, reserve);
486f6bed0efSShaohua Li 	}
487f6bed0efSShaohua Li 	mutex_unlock(&log->io_mutex);
488f6bed0efSShaohua Li 
489f6bed0efSShaohua Li 	return 0;
490f6bed0efSShaohua Li }
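
/*
 * Reserve example, assuming 4k pages: a full-stripe write on a 6-drive RAID6
 * has write_disks == 6 (four data blocks plus P and Q), so reserve =
 * (1 + 6) << (PAGE_SHIFT - 9) = 56 sectors: one meta block plus six 4k pages.
 */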
491f6bed0efSShaohua Li 
492f6bed0efSShaohua Li void r5l_write_stripe_run(struct r5l_log *log)
493f6bed0efSShaohua Li {
494f6bed0efSShaohua Li 	if (!log)
495f6bed0efSShaohua Li 		return;
496f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
497f6bed0efSShaohua Li 	r5l_submit_current_io(log);
498f6bed0efSShaohua Li 	mutex_unlock(&log->io_mutex);
499f6bed0efSShaohua Li }
500f6bed0efSShaohua Li 
501828cbe98SShaohua Li int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
502828cbe98SShaohua Li {
503828cbe98SShaohua Li 	if (!log)
504828cbe98SShaohua Li 		return -ENODEV;
505828cbe98SShaohua Li 	/*
506828cbe98SShaohua Li 	 * we flush the log disk cache first, then write stripe data to the raid
507828cbe98SShaohua Li 	 * disks. So if the bio is finished, the log disk cache is already flushed.
508828cbe98SShaohua Li 	 * Recovery guarantees we can recover the bio from the log disk, so we
509828cbe98SShaohua Li 	 * don't need to flush again
510828cbe98SShaohua Li 	 */
511828cbe98SShaohua Li 	if (bio->bi_iter.bi_size == 0) {
512828cbe98SShaohua Li 		bio_endio(bio);
513828cbe98SShaohua Li 		return 0;
514828cbe98SShaohua Li 	}
515828cbe98SShaohua Li 	bio->bi_rw &= ~REQ_FLUSH;
516828cbe98SShaohua Li 	return -EAGAIN;
517828cbe98SShaohua Li }
518828cbe98SShaohua Li 
519f6bed0efSShaohua Li /* This will run after log space is reclaimed */
520f6bed0efSShaohua Li static void r5l_run_no_space_stripes(struct r5l_log *log)
521f6bed0efSShaohua Li {
522f6bed0efSShaohua Li 	struct stripe_head *sh;
523f6bed0efSShaohua Li 
524f6bed0efSShaohua Li 	spin_lock(&log->no_space_stripes_lock);
525f6bed0efSShaohua Li 	while (!list_empty(&log->no_space_stripes)) {
526f6bed0efSShaohua Li 		sh = list_first_entry(&log->no_space_stripes,
527f6bed0efSShaohua Li 				      struct stripe_head, log_list);
528f6bed0efSShaohua Li 		list_del_init(&sh->log_list);
529f6bed0efSShaohua Li 		set_bit(STRIPE_HANDLE, &sh->state);
530f6bed0efSShaohua Li 		raid5_release_stripe(sh);
531f6bed0efSShaohua Li 	}
532f6bed0efSShaohua Li 	spin_unlock(&log->no_space_stripes_lock);
533f6bed0efSShaohua Li }
534f6bed0efSShaohua Li 
53517036461SChristoph Hellwig static sector_t r5l_reclaimable_space(struct r5l_log *log)
53617036461SChristoph Hellwig {
53717036461SChristoph Hellwig 	return r5l_ring_distance(log, log->last_checkpoint,
53817036461SChristoph Hellwig 				 log->next_checkpoint);
53917036461SChristoph Hellwig }
54017036461SChristoph Hellwig 
54104732f74SChristoph Hellwig static bool r5l_complete_finished_ios(struct r5l_log *log)
54217036461SChristoph Hellwig {
54317036461SChristoph Hellwig 	struct r5l_io_unit *io, *next;
54417036461SChristoph Hellwig 	bool found = false;
54517036461SChristoph Hellwig 
54617036461SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
54717036461SChristoph Hellwig 
54804732f74SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
54917036461SChristoph Hellwig 		/* don't change list order */
55017036461SChristoph Hellwig 		if (io->state < IO_UNIT_STRIPE_END)
55117036461SChristoph Hellwig 			break;
55217036461SChristoph Hellwig 
55317036461SChristoph Hellwig 		log->next_checkpoint = io->log_start;
55417036461SChristoph Hellwig 		log->next_cp_seq = io->seq;
55517036461SChristoph Hellwig 
55617036461SChristoph Hellwig 		list_del(&io->log_sibling);
55717036461SChristoph Hellwig 		r5l_free_io_unit(log, io);
55817036461SChristoph Hellwig 
55917036461SChristoph Hellwig 		found = true;
56017036461SChristoph Hellwig 	}
56117036461SChristoph Hellwig 
56217036461SChristoph Hellwig 	return found;
56317036461SChristoph Hellwig }
56417036461SChristoph Hellwig 
565509ffec7SChristoph Hellwig static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
566509ffec7SChristoph Hellwig {
567509ffec7SChristoph Hellwig 	struct r5l_log *log = io->log;
568509ffec7SChristoph Hellwig 	unsigned long flags;
569509ffec7SChristoph Hellwig 
570509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
571509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
57217036461SChristoph Hellwig 
57304732f74SChristoph Hellwig 	if (!r5l_complete_finished_ios(log)) {
57485f2f9a4SShaohua Li 		spin_unlock_irqrestore(&log->io_list_lock, flags);
57585f2f9a4SShaohua Li 		return;
57685f2f9a4SShaohua Li 	}
577509ffec7SChristoph Hellwig 
57817036461SChristoph Hellwig 	if (r5l_reclaimable_space(log) > log->max_free_space)
579509ffec7SChristoph Hellwig 		r5l_wake_reclaim(log, 0);
580509ffec7SChristoph Hellwig 
581509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
582509ffec7SChristoph Hellwig 	wake_up(&log->iounit_wait);
583509ffec7SChristoph Hellwig }
584509ffec7SChristoph Hellwig 
5850576b1c6SShaohua Li void r5l_stripe_write_finished(struct stripe_head *sh)
5860576b1c6SShaohua Li {
5870576b1c6SShaohua Li 	struct r5l_io_unit *io;
5880576b1c6SShaohua Li 
5890576b1c6SShaohua Li 	io = sh->log_io;
5900576b1c6SShaohua Li 	sh->log_io = NULL;
5910576b1c6SShaohua Li 
592509ffec7SChristoph Hellwig 	if (io && atomic_dec_and_test(&io->pending_stripe))
593509ffec7SChristoph Hellwig 		__r5l_stripe_write_finished(io);
5940576b1c6SShaohua Li }
5950576b1c6SShaohua Li 
596a8c34f91SShaohua Li static void r5l_log_flush_endio(struct bio *bio)
597a8c34f91SShaohua Li {
598a8c34f91SShaohua Li 	struct r5l_log *log = container_of(bio, struct r5l_log,
599a8c34f91SShaohua Li 		flush_bio);
600a8c34f91SShaohua Li 	unsigned long flags;
601a8c34f91SShaohua Li 	struct r5l_io_unit *io;
602a8c34f91SShaohua Li 
603*6e74a9cfSShaohua Li 	if (bio->bi_error)
604*6e74a9cfSShaohua Li 		md_error(log->rdev->mddev, log->rdev);
605*6e74a9cfSShaohua Li 
606a8c34f91SShaohua Li 	spin_lock_irqsave(&log->io_list_lock, flags);
607d8858f43SChristoph Hellwig 	list_for_each_entry(io, &log->flushing_ios, log_sibling)
608d8858f43SChristoph Hellwig 		r5l_io_run_stripes(io);
60904732f74SChristoph Hellwig 	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
610a8c34f91SShaohua Li 	spin_unlock_irqrestore(&log->io_list_lock, flags);
611a8c34f91SShaohua Li }
612a8c34f91SShaohua Li 
6130576b1c6SShaohua Li /*
6140576b1c6SShaohua Li  * Starting dispatch of IO to raid.
6150576b1c6SShaohua Li  * The log consists of io_units (meta blocks). There is one situation we want
6160576b1c6SShaohua Li  * to avoid: a broken meta block in the middle of the log keeps recovery from
6170576b1c6SShaohua Li  * finding the meta blocks at the head of the log. So if an operation requires
6180576b1c6SShaohua Li  * a meta block to be persistent in the log, the meta blocks before it must be
6190576b1c6SShaohua Li  * persistent in the log too. A case is: stripe data/parity is in the log and
6200576b1c6SShaohua Li  * we start writing the stripe to the raid disks. The stripe data/parity must
6210576b1c6SShaohua Li  * be persistent in the log before we write to the raid disks.
6220576b1c6SShaohua Li  *
6230576b1c6SShaohua Li  * The solution is that we strictly maintain io_unit list order: stripes of an
6240576b1c6SShaohua Li  * io_unit are written to the raid disks only after every earlier io_unit has
6250576b1c6SShaohua Li  * its data/parity in the log.
6260576b1c6SShaohua Li  */
6270576b1c6SShaohua Li void r5l_flush_stripe_to_raid(struct r5l_log *log)
6280576b1c6SShaohua Li {
629a8c34f91SShaohua Li 	bool do_flush;
63056fef7c6SChristoph Hellwig 
63156fef7c6SChristoph Hellwig 	if (!log || !log->need_cache_flush)
6320576b1c6SShaohua Li 		return;
6330576b1c6SShaohua Li 
634a8c34f91SShaohua Li 	spin_lock_irq(&log->io_list_lock);
635a8c34f91SShaohua Li 	/* flush bio is running */
636a8c34f91SShaohua Li 	if (!list_empty(&log->flushing_ios)) {
637a8c34f91SShaohua Li 		spin_unlock_irq(&log->io_list_lock);
6380576b1c6SShaohua Li 		return;
6390576b1c6SShaohua Li 	}
640a8c34f91SShaohua Li 	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
641a8c34f91SShaohua Li 	do_flush = !list_empty(&log->flushing_ios);
6420576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
643a8c34f91SShaohua Li 
644a8c34f91SShaohua Li 	if (!do_flush)
645a8c34f91SShaohua Li 		return;
646a8c34f91SShaohua Li 	bio_reset(&log->flush_bio);
647a8c34f91SShaohua Li 	log->flush_bio.bi_bdev = log->rdev->bdev;
648a8c34f91SShaohua Li 	log->flush_bio.bi_end_io = r5l_log_flush_endio;
649a8c34f91SShaohua Li 	submit_bio(WRITE_FLUSH, &log->flush_bio);
6500576b1c6SShaohua Li }
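
/*
 * Only one log-device cache flush is in flight at a time: the single flush_bio
 * is embedded in struct r5l_log, and a non-empty flushing_ios list means it is
 * still running, so r5l_flush_stripe_to_raid() simply returns in that case.
 */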
6510576b1c6SShaohua Li 
6520576b1c6SShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp);
6534b482044SShaohua Li static void r5l_write_super_and_discard_space(struct r5l_log *log,
6544b482044SShaohua Li 	sector_t end)
6554b482044SShaohua Li {
6564b482044SShaohua Li 	struct block_device *bdev = log->rdev->bdev;
6574b482044SShaohua Li 	struct mddev *mddev;
6584b482044SShaohua Li 
6594b482044SShaohua Li 	r5l_write_super(log, end);
6604b482044SShaohua Li 
6614b482044SShaohua Li 	if (!blk_queue_discard(bdev_get_queue(bdev)))
6624b482044SShaohua Li 		return;
6634b482044SShaohua Li 
6644b482044SShaohua Li 	mddev = log->rdev->mddev;
6654b482044SShaohua Li 	/*
6664b482044SShaohua Li 	 * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
6674b482044SShaohua Li 	 * waits for this thread to finish. This thread waits for
6684b482044SShaohua Li 	 * MD_CHANGE_PENDING to be cleared, which is supposed to be done in
6694b482044SShaohua Li 	 * md_check_recovery(). md_check_recovery() tries to get
6704b482044SShaohua Li 	 * reconfig_mutex. Since r5l_quiesce already holds the mutex,
6714b482044SShaohua Li 	 * md_check_recovery() fails, so the PENDING flag never gets cleared.
6724b482044SShaohua Li 	 * The in_teardown check works around this issue.
6734b482044SShaohua Li 	 */
6744b482044SShaohua Li 	if (!log->in_teardown) {
6754b482044SShaohua Li 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
6764b482044SShaohua Li 		set_bit(MD_CHANGE_PENDING, &mddev->flags);
6774b482044SShaohua Li 		md_wakeup_thread(mddev->thread);
6784b482044SShaohua Li 		wait_event(mddev->sb_wait,
6794b482044SShaohua Li 			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
6804b482044SShaohua Li 			log->in_teardown);
6814b482044SShaohua Li 		/*
6824b482044SShaohua Li 		 * r5l_quiesce could run after the in_teardown check and take
6834b482044SShaohua Li 		 * the mutex first. The superblock might get updated twice.
6844b482044SShaohua Li 		 */
6854b482044SShaohua Li 		if (log->in_teardown)
6864b482044SShaohua Li 			md_update_sb(mddev, 1);
6874b482044SShaohua Li 	} else {
6884b482044SShaohua Li 		WARN_ON(!mddev_is_locked(mddev));
6894b482044SShaohua Li 		md_update_sb(mddev, 1);
6904b482044SShaohua Li 	}
6914b482044SShaohua Li 
692*6e74a9cfSShaohua Li 	/* discard IO error really doesn't matter, ignore it */
6934b482044SShaohua Li 	if (log->last_checkpoint < end) {
6944b482044SShaohua Li 		blkdev_issue_discard(bdev,
6954b482044SShaohua Li 				log->last_checkpoint + log->rdev->data_offset,
6964b482044SShaohua Li 				end - log->last_checkpoint, GFP_NOIO, 0);
6974b482044SShaohua Li 	} else {
6984b482044SShaohua Li 		blkdev_issue_discard(bdev,
6994b482044SShaohua Li 				log->last_checkpoint + log->rdev->data_offset,
7004b482044SShaohua Li 				log->device_size - log->last_checkpoint,
7014b482044SShaohua Li 				GFP_NOIO, 0);
7024b482044SShaohua Li 		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
7034b482044SShaohua Li 				GFP_NOIO, 0);
7044b482044SShaohua Li 	}
7054b482044SShaohua Li }
7064b482044SShaohua Li 
7074b482044SShaohua Li 
7080576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log)
7090576b1c6SShaohua Li {
7100576b1c6SShaohua Li 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
71117036461SChristoph Hellwig 	sector_t reclaimable;
71217036461SChristoph Hellwig 	sector_t next_checkpoint;
71317036461SChristoph Hellwig 	u64 next_cp_seq;
7140576b1c6SShaohua Li 
7150576b1c6SShaohua Li 	spin_lock_irq(&log->io_list_lock);
7160576b1c6SShaohua Li 	/*
7170576b1c6SShaohua Li 	 * wait until enough io_units become reclaimable. We should not change
7180576b1c6SShaohua Li 	 * the order: reclaimable and unreclaimable io_units can be mixed in the
7190576b1c6SShaohua Li 	 * list, and we shouldn't reuse the space of an unreclaimable io_unit
7200576b1c6SShaohua Li 	 */
7210576b1c6SShaohua Li 	while (1) {
72217036461SChristoph Hellwig 		reclaimable = r5l_reclaimable_space(log);
72317036461SChristoph Hellwig 		if (reclaimable >= reclaim_target ||
7240576b1c6SShaohua Li 		    (list_empty(&log->running_ios) &&
7250576b1c6SShaohua Li 		     list_empty(&log->io_end_ios) &&
726a8c34f91SShaohua Li 		     list_empty(&log->flushing_ios) &&
72704732f74SChristoph Hellwig 		     list_empty(&log->finished_ios)))
7280576b1c6SShaohua Li 			break;
7290576b1c6SShaohua Li 
73017036461SChristoph Hellwig 		md_wakeup_thread(log->rdev->mddev->thread);
73117036461SChristoph Hellwig 		wait_event_lock_irq(log->iounit_wait,
73217036461SChristoph Hellwig 				    r5l_reclaimable_space(log) > reclaimable,
73317036461SChristoph Hellwig 				    log->io_list_lock);
7340576b1c6SShaohua Li 	}
73517036461SChristoph Hellwig 
73617036461SChristoph Hellwig 	next_checkpoint = log->next_checkpoint;
73717036461SChristoph Hellwig 	next_cp_seq = log->next_cp_seq;
7380576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
7390576b1c6SShaohua Li 
74017036461SChristoph Hellwig 	BUG_ON(reclaimable < 0);
74117036461SChristoph Hellwig 	if (reclaimable == 0)
7420576b1c6SShaohua Li 		return;
7430576b1c6SShaohua Li 
7440576b1c6SShaohua Li 	/*
7450576b1c6SShaohua Li 	 * write_super will flush cache of each raid disk. We must write super
7460576b1c6SShaohua Li 	 * here, because the log area might be reused soon and we don't want to
7470576b1c6SShaohua Li 	 * confuse recovery
7480576b1c6SShaohua Li 	 */
7494b482044SShaohua Li 	r5l_write_super_and_discard_space(log, next_checkpoint);
7500576b1c6SShaohua Li 
7510576b1c6SShaohua Li 	mutex_lock(&log->io_mutex);
75217036461SChristoph Hellwig 	log->last_checkpoint = next_checkpoint;
75317036461SChristoph Hellwig 	log->last_cp_seq = next_cp_seq;
7540576b1c6SShaohua Li 	mutex_unlock(&log->io_mutex);
7550576b1c6SShaohua Li 
75617036461SChristoph Hellwig 	r5l_run_no_space_stripes(log);
7570576b1c6SShaohua Li }
7580576b1c6SShaohua Li 
7590576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread)
7600576b1c6SShaohua Li {
7610576b1c6SShaohua Li 	struct mddev *mddev = thread->mddev;
7620576b1c6SShaohua Li 	struct r5conf *conf = mddev->private;
7630576b1c6SShaohua Li 	struct r5l_log *log = conf->log;
7640576b1c6SShaohua Li 
7650576b1c6SShaohua Li 	if (!log)
7660576b1c6SShaohua Li 		return;
7670576b1c6SShaohua Li 	r5l_do_reclaim(log);
7680576b1c6SShaohua Li }
7690576b1c6SShaohua Li 
770f6bed0efSShaohua Li static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
771f6bed0efSShaohua Li {
7720576b1c6SShaohua Li 	unsigned long target;
7730576b1c6SShaohua Li 	unsigned long new = (unsigned long)space; /* overflow in theory */
7740576b1c6SShaohua Li 
7750576b1c6SShaohua Li 	do {
7760576b1c6SShaohua Li 		target = log->reclaim_target;
7770576b1c6SShaohua Li 		if (new < target)
7780576b1c6SShaohua Li 			return;
7790576b1c6SShaohua Li 	} while (cmpxchg(&log->reclaim_target, target, new) != target);
7800576b1c6SShaohua Li 	md_wakeup_thread(log->reclaim_thread);
781f6bed0efSShaohua Li }
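
/*
 * Callers pass the amount of space they need: r5l_write_stripe() passes the
 * reservation it failed to get, __r5l_stripe_write_finished() passes 0 to just
 * free whatever already reached IO_UNIT_STRIPE_END, and r5l_quiesce() passes
 * -1L to reclaim everything.
 */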
782f6bed0efSShaohua Li 
783e6c033f7SShaohua Li void r5l_quiesce(struct r5l_log *log, int state)
784e6c033f7SShaohua Li {
7854b482044SShaohua Li 	struct mddev *mddev;
786e6c033f7SShaohua Li 	if (!log || state == 2)
787e6c033f7SShaohua Li 		return;
788e6c033f7SShaohua Li 	if (state == 0) {
7894b482044SShaohua Li 		log->in_teardown = 0;
790e6c033f7SShaohua Li 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
791e6c033f7SShaohua Li 					log->rdev->mddev, "reclaim");
792e6c033f7SShaohua Li 	} else if (state == 1) {
793e6c033f7SShaohua Li 		/*
794e6c033f7SShaohua Li 		 * at this point all stripes are finished, so every io_unit is
795e6c033f7SShaohua Li 		 * at least in IO_UNIT_STRIPE_END state
796e6c033f7SShaohua Li 		 */
7974b482044SShaohua Li 		log->in_teardown = 1;
7984b482044SShaohua Li 		/* make sure r5l_write_super_and_discard_space exits */
7994b482044SShaohua Li 		mddev = log->rdev->mddev;
8004b482044SShaohua Li 		wake_up(&mddev->sb_wait);
801e6c033f7SShaohua Li 		r5l_wake_reclaim(log, -1L);
802e6c033f7SShaohua Li 		md_unregister_thread(&log->reclaim_thread);
803e6c033f7SShaohua Li 		r5l_do_reclaim(log);
804e6c033f7SShaohua Li 	}
805e6c033f7SShaohua Li }
806e6c033f7SShaohua Li 
807*6e74a9cfSShaohua Li bool r5l_log_disk_error(struct r5conf *conf)
808*6e74a9cfSShaohua Li {
809*6e74a9cfSShaohua Li 	if (!conf->log)
810*6e74a9cfSShaohua Li 		return false;
811*6e74a9cfSShaohua Li 	return test_bit(Faulty, &conf->log->rdev->flags);
812*6e74a9cfSShaohua Li }
813*6e74a9cfSShaohua Li 
814355810d1SShaohua Li struct r5l_recovery_ctx {
815355810d1SShaohua Li 	struct page *meta_page;		/* current meta */
816355810d1SShaohua Li 	sector_t meta_total_blocks;	/* total size of current meta and data */
817355810d1SShaohua Li 	sector_t pos;			/* recovery position */
818355810d1SShaohua Li 	u64 seq;			/* recovery position seq */
819355810d1SShaohua Li };
820355810d1SShaohua Li 
821355810d1SShaohua Li static int r5l_read_meta_block(struct r5l_log *log,
822355810d1SShaohua Li 			       struct r5l_recovery_ctx *ctx)
823355810d1SShaohua Li {
824355810d1SShaohua Li 	struct page *page = ctx->meta_page;
825355810d1SShaohua Li 	struct r5l_meta_block *mb;
826355810d1SShaohua Li 	u32 crc, stored_crc;
827355810d1SShaohua Li 
828355810d1SShaohua Li 	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
829355810d1SShaohua Li 		return -EIO;
830355810d1SShaohua Li 
831355810d1SShaohua Li 	mb = page_address(page);
832355810d1SShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
833355810d1SShaohua Li 	mb->checksum = 0;
834355810d1SShaohua Li 
835355810d1SShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
836355810d1SShaohua Li 	    le64_to_cpu(mb->seq) != ctx->seq ||
837355810d1SShaohua Li 	    mb->version != R5LOG_VERSION ||
838355810d1SShaohua Li 	    le64_to_cpu(mb->position) != ctx->pos)
839355810d1SShaohua Li 		return -EINVAL;
840355810d1SShaohua Li 
8415cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
842355810d1SShaohua Li 	if (stored_crc != crc)
843355810d1SShaohua Li 		return -EINVAL;
844355810d1SShaohua Li 
845355810d1SShaohua Li 	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
846355810d1SShaohua Li 		return -EINVAL;
847355810d1SShaohua Li 
848355810d1SShaohua Li 	ctx->meta_total_blocks = BLOCK_SECTORS;
849355810d1SShaohua Li 
850355810d1SShaohua Li 	return 0;
851355810d1SShaohua Li }
852355810d1SShaohua Li 
853355810d1SShaohua Li static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
854355810d1SShaohua Li 					 struct r5l_recovery_ctx *ctx,
855355810d1SShaohua Li 					 sector_t stripe_sect,
856355810d1SShaohua Li 					 int *offset, sector_t *log_offset)
857355810d1SShaohua Li {
858355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
859355810d1SShaohua Li 	struct stripe_head *sh;
860355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
861355810d1SShaohua Li 	int disk_index;
862355810d1SShaohua Li 
863355810d1SShaohua Li 	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
864355810d1SShaohua Li 	while (1) {
865355810d1SShaohua Li 		payload = page_address(ctx->meta_page) + *offset;
866355810d1SShaohua Li 
867355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
868355810d1SShaohua Li 			raid5_compute_sector(conf,
869355810d1SShaohua Li 					     le64_to_cpu(payload->location), 0,
870355810d1SShaohua Li 					     &disk_index, sh);
871355810d1SShaohua Li 
872355810d1SShaohua Li 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
873355810d1SShaohua Li 				     sh->dev[disk_index].page, READ, false);
874355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
875355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
876355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
877355810d1SShaohua Li 			ctx->meta_total_blocks += BLOCK_SECTORS;
878355810d1SShaohua Li 		} else {
879355810d1SShaohua Li 			disk_index = sh->pd_idx;
880355810d1SShaohua Li 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
881355810d1SShaohua Li 				     sh->dev[disk_index].page, READ, false);
882355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
883355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
884355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
885355810d1SShaohua Li 
886355810d1SShaohua Li 			if (sh->qd_idx >= 0) {
887355810d1SShaohua Li 				disk_index = sh->qd_idx;
888355810d1SShaohua Li 				sync_page_io(log->rdev,
889355810d1SShaohua Li 					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
890355810d1SShaohua Li 					     PAGE_SIZE, sh->dev[disk_index].page,
891355810d1SShaohua Li 					     READ, false);
892355810d1SShaohua Li 				sh->dev[disk_index].log_checksum =
893355810d1SShaohua Li 					le32_to_cpu(payload->checksum[1]);
894355810d1SShaohua Li 				set_bit(R5_Wantwrite,
895355810d1SShaohua Li 					&sh->dev[disk_index].flags);
896355810d1SShaohua Li 			}
897355810d1SShaohua Li 			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
898355810d1SShaohua Li 		}
899355810d1SShaohua Li 
900355810d1SShaohua Li 		*log_offset = r5l_ring_add(log, *log_offset,
901355810d1SShaohua Li 					   le32_to_cpu(payload->size));
902355810d1SShaohua Li 		*offset += sizeof(struct r5l_payload_data_parity) +
903355810d1SShaohua Li 			sizeof(__le32) *
904355810d1SShaohua Li 			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
905355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
906355810d1SShaohua Li 			break;
907355810d1SShaohua Li 	}
908355810d1SShaohua Li 
909355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
910355810d1SShaohua Li 		void *addr;
911355810d1SShaohua Li 		u32 checksum;
912355810d1SShaohua Li 
913355810d1SShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
914355810d1SShaohua Li 			continue;
915355810d1SShaohua Li 		addr = kmap_atomic(sh->dev[disk_index].page);
9165cb2fbd6SShaohua Li 		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
917355810d1SShaohua Li 		kunmap_atomic(addr);
918355810d1SShaohua Li 		if (checksum != sh->dev[disk_index].log_checksum)
919355810d1SShaohua Li 			goto error;
920355810d1SShaohua Li 	}
921355810d1SShaohua Li 
922355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
923355810d1SShaohua Li 		struct md_rdev *rdev, *rrdev;
924355810d1SShaohua Li 
925355810d1SShaohua Li 		if (!test_and_clear_bit(R5_Wantwrite,
926355810d1SShaohua Li 					&sh->dev[disk_index].flags))
927355810d1SShaohua Li 			continue;
928355810d1SShaohua Li 
929355810d1SShaohua Li 		/* in case device is broken */
930355810d1SShaohua Li 		rdev = rcu_dereference(conf->disks[disk_index].rdev);
931355810d1SShaohua Li 		if (rdev)
932355810d1SShaohua Li 			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
933355810d1SShaohua Li 				     sh->dev[disk_index].page, WRITE, false);
934355810d1SShaohua Li 		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
935355810d1SShaohua Li 		if (rrdev)
936355810d1SShaohua Li 			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
937355810d1SShaohua Li 				     sh->dev[disk_index].page, WRITE, false);
938355810d1SShaohua Li 	}
939355810d1SShaohua Li 	raid5_release_stripe(sh);
940355810d1SShaohua Li 	return 0;
941355810d1SShaohua Li 
942355810d1SShaohua Li error:
943355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++)
944355810d1SShaohua Li 		sh->dev[disk_index].flags = 0;
945355810d1SShaohua Li 	raid5_release_stripe(sh);
946355810d1SShaohua Li 	return -EINVAL;
947355810d1SShaohua Li }
948355810d1SShaohua Li 
949355810d1SShaohua Li static int r5l_recovery_flush_one_meta(struct r5l_log *log,
950355810d1SShaohua Li 				       struct r5l_recovery_ctx *ctx)
951355810d1SShaohua Li {
952355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
953355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
954355810d1SShaohua Li 	struct r5l_meta_block *mb;
955355810d1SShaohua Li 	int offset;
956355810d1SShaohua Li 	sector_t log_offset;
957355810d1SShaohua Li 	sector_t stripe_sector;
958355810d1SShaohua Li 
959355810d1SShaohua Li 	mb = page_address(ctx->meta_page);
960355810d1SShaohua Li 	offset = sizeof(struct r5l_meta_block);
961355810d1SShaohua Li 	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
962355810d1SShaohua Li 
963355810d1SShaohua Li 	while (offset < le32_to_cpu(mb->meta_size)) {
964355810d1SShaohua Li 		int dd;
965355810d1SShaohua Li 
966355810d1SShaohua Li 		payload = (void *)mb + offset;
967355810d1SShaohua Li 		stripe_sector = raid5_compute_sector(conf,
968355810d1SShaohua Li 						     le64_to_cpu(payload->location), 0, &dd, NULL);
969355810d1SShaohua Li 		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
970355810d1SShaohua Li 						  &offset, &log_offset))
971355810d1SShaohua Li 			return -EINVAL;
972355810d1SShaohua Li 	}
973355810d1SShaohua Li 	return 0;
974355810d1SShaohua Li }
975355810d1SShaohua Li 
976355810d1SShaohua Li /* copy data/parity from log to raid disks */
977355810d1SShaohua Li static void r5l_recovery_flush_log(struct r5l_log *log,
978355810d1SShaohua Li 				   struct r5l_recovery_ctx *ctx)
979355810d1SShaohua Li {
980355810d1SShaohua Li 	while (1) {
981355810d1SShaohua Li 		if (r5l_read_meta_block(log, ctx))
982355810d1SShaohua Li 			return;
983355810d1SShaohua Li 		if (r5l_recovery_flush_one_meta(log, ctx))
984355810d1SShaohua Li 			return;
985355810d1SShaohua Li 		ctx->seq++;
986355810d1SShaohua Li 		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
987355810d1SShaohua Li 	}
988355810d1SShaohua Li }
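
/*
 * Recovery therefore walks the log from log->last_checkpoint: read one meta
 * block, replay every stripe it describes straight onto the raid disks (and
 * any replacement devices), advance pos/seq, and stop at the first block that
 * fails the magic/sequence/checksum checks.
 */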
989355810d1SShaohua Li 
990355810d1SShaohua Li static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
991355810d1SShaohua Li 					  u64 seq)
992355810d1SShaohua Li {
993355810d1SShaohua Li 	struct page *page;
994355810d1SShaohua Li 	struct r5l_meta_block *mb;
995355810d1SShaohua Li 	u32 crc;
996355810d1SShaohua Li 
997355810d1SShaohua Li 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
998355810d1SShaohua Li 	if (!page)
999355810d1SShaohua Li 		return -ENOMEM;
1000355810d1SShaohua Li 	mb = page_address(page);
1001355810d1SShaohua Li 	mb->magic = cpu_to_le32(R5LOG_MAGIC);
1002355810d1SShaohua Li 	mb->version = R5LOG_VERSION;
1003355810d1SShaohua Li 	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1004355810d1SShaohua Li 	mb->seq = cpu_to_le64(seq);
1005355810d1SShaohua Li 	mb->position = cpu_to_le64(pos);
10065cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1007355810d1SShaohua Li 	mb->checksum = cpu_to_le32(crc);
1008355810d1SShaohua Li 
1009355810d1SShaohua Li 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
1010355810d1SShaohua Li 		__free_page(page);
1011355810d1SShaohua Li 		return -EIO;
1012355810d1SShaohua Li 	}
1013355810d1SShaohua Li 	__free_page(page);
1014355810d1SShaohua Li 	return 0;
1015355810d1SShaohua Li }
1016355810d1SShaohua Li 
1017f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log)
1018f6bed0efSShaohua Li {
1019355810d1SShaohua Li 	struct r5l_recovery_ctx ctx;
1020355810d1SShaohua Li 
1021355810d1SShaohua Li 	ctx.pos = log->last_checkpoint;
1022355810d1SShaohua Li 	ctx.seq = log->last_cp_seq;
1023355810d1SShaohua Li 	ctx.meta_page = alloc_page(GFP_KERNEL);
1024355810d1SShaohua Li 	if (!ctx.meta_page)
1025355810d1SShaohua Li 		return -ENOMEM;
1026355810d1SShaohua Li 
1027355810d1SShaohua Li 	r5l_recovery_flush_log(log, &ctx);
1028355810d1SShaohua Li 	__free_page(ctx.meta_page);
1029355810d1SShaohua Li 
1030355810d1SShaohua Li 	/*
1031355810d1SShaohua Li 	 * we did a recovery. Now ctx.pos points to an invalid meta block. The
1032355810d1SShaohua Li 	 * new log will start here, but we can't let the superblock point to the
1033355810d1SShaohua Li 	 * last valid meta block. The log might look like:
1034355810d1SShaohua Li 	 * | meta 1| meta 2| meta 3|
1035355810d1SShaohua Li 	 * meta 1 is valid, meta 2 is invalid, meta 3 could be valid. If the
1036355810d1SShaohua Li 	 * superblock points to meta 1 and we write a new valid meta 2n, then if
1037355810d1SShaohua Li 	 * a crash happens again, the new recovery starts from meta 1. Since meta
1038355810d1SShaohua Li 	 * 2n is valid now, recovery will think meta 3 is valid, which is wrong.
1039355810d1SShaohua Li 	 * The solution is to create a new meta block at meta 2's position with
1040355810d1SShaohua Li 	 * seq == meta 1's seq + 10 and let the superblock point to it. The same
1041355810d1SShaohua Li 	 * recovery will not treat meta 3 as valid, because its seq doesn't match
1042355810d1SShaohua Li 	 */
1043355810d1SShaohua Li 	if (ctx.seq > log->last_cp_seq + 1) {
1044355810d1SShaohua Li 		int ret;
1045355810d1SShaohua Li 
1046355810d1SShaohua Li 		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1047355810d1SShaohua Li 		if (ret)
1048355810d1SShaohua Li 			return ret;
1049355810d1SShaohua Li 		log->seq = ctx.seq + 11;
1050355810d1SShaohua Li 		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1051355810d1SShaohua Li 		r5l_write_super(log, ctx.pos);
1052355810d1SShaohua Li 	} else {
1053355810d1SShaohua Li 		log->log_start = ctx.pos;
1054355810d1SShaohua Li 		log->seq = ctx.seq;
1055355810d1SShaohua Li 	}
1056f6bed0efSShaohua Li 	return 0;
1057f6bed0efSShaohua Li }
1058f6bed0efSShaohua Li 
1059f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp)
1060f6bed0efSShaohua Li {
1061f6bed0efSShaohua Li 	struct mddev *mddev = log->rdev->mddev;
1062f6bed0efSShaohua Li 
1063f6bed0efSShaohua Li 	log->rdev->journal_tail = cp;
1064f6bed0efSShaohua Li 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1065f6bed0efSShaohua Li }
1066f6bed0efSShaohua Li 
1067f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log)
1068f6bed0efSShaohua Li {
1069f6bed0efSShaohua Li 	struct md_rdev *rdev = log->rdev;
1070f6bed0efSShaohua Li 	struct page *page;
1071f6bed0efSShaohua Li 	struct r5l_meta_block *mb;
1072f6bed0efSShaohua Li 	sector_t cp = log->rdev->journal_tail;
1073f6bed0efSShaohua Li 	u32 stored_crc, expected_crc;
1074f6bed0efSShaohua Li 	bool create_super = false;
1075f6bed0efSShaohua Li 	int ret;
1076f6bed0efSShaohua Li 
1077f6bed0efSShaohua Li 	/* Make sure it's valid */
1078f6bed0efSShaohua Li 	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1079f6bed0efSShaohua Li 		cp = 0;
1080f6bed0efSShaohua Li 	page = alloc_page(GFP_KERNEL);
1081f6bed0efSShaohua Li 	if (!page)
1082f6bed0efSShaohua Li 		return -ENOMEM;
1083f6bed0efSShaohua Li 
1084f6bed0efSShaohua Li 	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1085f6bed0efSShaohua Li 		ret = -EIO;
1086f6bed0efSShaohua Li 		goto ioerr;
1087f6bed0efSShaohua Li 	}
1088f6bed0efSShaohua Li 	mb = page_address(page);
1089f6bed0efSShaohua Li 
1090f6bed0efSShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1091f6bed0efSShaohua Li 	    mb->version != R5LOG_VERSION) {
1092f6bed0efSShaohua Li 		create_super = true;
1093f6bed0efSShaohua Li 		goto create;
1094f6bed0efSShaohua Li 	}
1095f6bed0efSShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
1096f6bed0efSShaohua Li 	mb->checksum = 0;
10975cb2fbd6SShaohua Li 	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1098f6bed0efSShaohua Li 	if (stored_crc != expected_crc) {
1099f6bed0efSShaohua Li 		create_super = true;
1100f6bed0efSShaohua Li 		goto create;
1101f6bed0efSShaohua Li 	}
1102f6bed0efSShaohua Li 	if (le64_to_cpu(mb->position) != cp) {
1103f6bed0efSShaohua Li 		create_super = true;
1104f6bed0efSShaohua Li 		goto create;
1105f6bed0efSShaohua Li 	}
1106f6bed0efSShaohua Li create:
1107f6bed0efSShaohua Li 	if (create_super) {
1108f6bed0efSShaohua Li 		log->last_cp_seq = prandom_u32();
1109f6bed0efSShaohua Li 		cp = 0;
1110f6bed0efSShaohua Li 		/*
1111f6bed0efSShaohua Li 		 * Make sure the super points to the correct address. The log
1112f6bed0efSShaohua Li 		 * might have data very soon. If the super doesn't have the
1113f6bed0efSShaohua Li 		 * correct log tail address, recovery can't find the log
1114f6bed0efSShaohua Li 		 */
1115f6bed0efSShaohua Li 		r5l_write_super(log, cp);
1116f6bed0efSShaohua Li 	} else
1117f6bed0efSShaohua Li 		log->last_cp_seq = le64_to_cpu(mb->seq);
1118f6bed0efSShaohua Li 
1119f6bed0efSShaohua Li 	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
11200576b1c6SShaohua Li 	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
11210576b1c6SShaohua Li 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
11220576b1c6SShaohua Li 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1123f6bed0efSShaohua Li 	log->last_checkpoint = cp;
1124f6bed0efSShaohua Li 
1125f6bed0efSShaohua Li 	__free_page(page);
1126f6bed0efSShaohua Li 
1127f6bed0efSShaohua Li 	return r5l_recovery_log(log);
1128f6bed0efSShaohua Li ioerr:
1129f6bed0efSShaohua Li 	__free_page(page);
1130f6bed0efSShaohua Li 	return ret;
1131f6bed0efSShaohua Li }
1132f6bed0efSShaohua Li 
1133f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1134f6bed0efSShaohua Li {
1135f6bed0efSShaohua Li 	struct r5l_log *log;
1136f6bed0efSShaohua Li 
1137f6bed0efSShaohua Li 	if (PAGE_SIZE != 4096)
1138f6bed0efSShaohua Li 		return -EINVAL;
1139f6bed0efSShaohua Li 	log = kzalloc(sizeof(*log), GFP_KERNEL);
1140f6bed0efSShaohua Li 	if (!log)
1141f6bed0efSShaohua Li 		return -ENOMEM;
1142f6bed0efSShaohua Li 	log->rdev = rdev;
1143f6bed0efSShaohua Li 
114456fef7c6SChristoph Hellwig 	log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
114556fef7c6SChristoph Hellwig 
11465cb2fbd6SShaohua Li 	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1147f6bed0efSShaohua Li 				       sizeof(rdev->mddev->uuid));
1148f6bed0efSShaohua Li 
1149f6bed0efSShaohua Li 	mutex_init(&log->io_mutex);
1150f6bed0efSShaohua Li 
1151f6bed0efSShaohua Li 	spin_lock_init(&log->io_list_lock);
1152f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->running_ios);
11530576b1c6SShaohua Li 	INIT_LIST_HEAD(&log->io_end_ios);
1154a8c34f91SShaohua Li 	INIT_LIST_HEAD(&log->flushing_ios);
115504732f74SChristoph Hellwig 	INIT_LIST_HEAD(&log->finished_ios);
1156a8c34f91SShaohua Li 	bio_init(&log->flush_bio);
1157f6bed0efSShaohua Li 
1158f6bed0efSShaohua Li 	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1159f6bed0efSShaohua Li 	if (!log->io_kc)
1160f6bed0efSShaohua Li 		goto io_kc;
1161f6bed0efSShaohua Li 
11620576b1c6SShaohua Li 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
11630576b1c6SShaohua Li 						 log->rdev->mddev, "reclaim");
11640576b1c6SShaohua Li 	if (!log->reclaim_thread)
11650576b1c6SShaohua Li 		goto reclaim_thread;
11660fd22b45SShaohua Li 	init_waitqueue_head(&log->iounit_wait);
11670576b1c6SShaohua Li 
1168f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->no_space_stripes);
1169f6bed0efSShaohua Li 	spin_lock_init(&log->no_space_stripes_lock);
1170f6bed0efSShaohua Li 
1171f6bed0efSShaohua Li 	if (r5l_load_log(log))
1172f6bed0efSShaohua Li 		goto error;
1173f6bed0efSShaohua Li 
1174f6bed0efSShaohua Li 	conf->log = log;
1175f6bed0efSShaohua Li 	return 0;
1176f6bed0efSShaohua Li error:
11770576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
11780576b1c6SShaohua Li reclaim_thread:
1179f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1180f6bed0efSShaohua Li io_kc:
1181f6bed0efSShaohua Li 	kfree(log);
1182f6bed0efSShaohua Li 	return -EINVAL;
1183f6bed0efSShaohua Li }
1184f6bed0efSShaohua Li 
1185f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log)
1186f6bed0efSShaohua Li {
11870576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
1188f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1189f6bed0efSShaohua Li 	kfree(log);
1190f6bed0efSShaohua Li }
1191