xref: /linux/drivers/md/raid5-cache.c (revision 51039cd066553689bb82a588b25a6eba7d453837)
1f6bed0efSShaohua Li /*
2f6bed0efSShaohua Li  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3f6bed0efSShaohua Li  *
4f6bed0efSShaohua Li  * This program is free software; you can redistribute it and/or modify it
5f6bed0efSShaohua Li  * under the terms and conditions of the GNU General Public License,
6f6bed0efSShaohua Li  * version 2, as published by the Free Software Foundation.
7f6bed0efSShaohua Li  *
8f6bed0efSShaohua Li  * This program is distributed in the hope it will be useful, but WITHOUT
9f6bed0efSShaohua Li  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10f6bed0efSShaohua Li  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11f6bed0efSShaohua Li  * more details.
12f6bed0efSShaohua Li  *
13f6bed0efSShaohua Li  */
14f6bed0efSShaohua Li #include <linux/kernel.h>
15f6bed0efSShaohua Li #include <linux/wait.h>
16f6bed0efSShaohua Li #include <linux/blkdev.h>
17f6bed0efSShaohua Li #include <linux/slab.h>
18f6bed0efSShaohua Li #include <linux/raid/md_p.h>
195cb2fbd6SShaohua Li #include <linux/crc32c.h>
20f6bed0efSShaohua Li #include <linux/random.h>
21f6bed0efSShaohua Li #include "md.h"
22f6bed0efSShaohua Li #include "raid5.h"
23f6bed0efSShaohua Li 
24f6bed0efSShaohua Li /*
25f6bed0efSShaohua Li  * metadata/data are stored on disk in 4k units (blocks) regardless of the
26f6bed0efSShaohua Li  * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
27f6bed0efSShaohua Li  */
28f6bed0efSShaohua Li #define BLOCK_SECTORS (8)
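/* one block is PAGE_SIZE / 512 = 4096 / 512 = 8 sectors of 512 bytes */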
29f6bed0efSShaohua Li 
300576b1c6SShaohua Li /*
310576b1c6SShaohua Li  * reclaim runs when reclaimable space reaches 1/4 of the disk size or 10G,
320576b1c6SShaohua Li  * whichever is smaller. This prevents recovery from scanning a very long log.
330576b1c6SShaohua Li  */
340576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
350576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
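/*
 * Worked out: RECLAIM_MAX_FREE_SPACE is 10GiB expressed in 512-byte sectors
 * (10 * 1024 * 1024 KiB * 2 sectors per KiB), and the shift of 2 yields
 * device_size / 4 in r5l_load_log(), capped at that 10GiB value.
 */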
360576b1c6SShaohua Li 
37f6bed0efSShaohua Li struct r5l_log {
38f6bed0efSShaohua Li 	struct md_rdev *rdev;
39f6bed0efSShaohua Li 
40f6bed0efSShaohua Li 	u32 uuid_checksum;
41f6bed0efSShaohua Li 
42f6bed0efSShaohua Li 	sector_t device_size;		/* log device size, rounded down to
43f6bed0efSShaohua Li 					 * BLOCK_SECTORS */
440576b1c6SShaohua Li 	sector_t max_free_space;	/* reclaim runs once reclaimable space
450576b1c6SShaohua Li 					 * reaches this size */
46f6bed0efSShaohua Li 
47f6bed0efSShaohua Li 	sector_t last_checkpoint;	/* log tail: where the recovery scan
48f6bed0efSShaohua Li 					 * starts from */
49f6bed0efSShaohua Li 	u64 last_cp_seq;		/* log tail sequence */
50f6bed0efSShaohua Li 
51f6bed0efSShaohua Li 	sector_t log_start;		/* log head: where new data is appended */
52f6bed0efSShaohua Li 	u64 seq;			/* log head sequence */
53f6bed0efSShaohua Li 
5417036461SChristoph Hellwig 	sector_t next_checkpoint;
5517036461SChristoph Hellwig 	u64 next_cp_seq;
5617036461SChristoph Hellwig 
57f6bed0efSShaohua Li 	struct mutex io_mutex;
58f6bed0efSShaohua Li 	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
59f6bed0efSShaohua Li 
60f6bed0efSShaohua Li 	spinlock_t io_list_lock;
61f6bed0efSShaohua Li 	struct list_head running_ios;	/* io_units which are still running,
62f6bed0efSShaohua Li 					 * and have not yet been completely
63f6bed0efSShaohua Li 					 * written to the log */
64f6bed0efSShaohua Li 	struct list_head io_end_ios;	/* io_units which have been completely
65f6bed0efSShaohua Li 					 * written to the log but not yet written
66f6bed0efSShaohua Li 					 * to the RAID */
67a8c34f91SShaohua Li 	struct list_head flushing_ios;	/* io_units which are waiting for log
68a8c34f91SShaohua Li 					 * cache flush */
6904732f74SChristoph Hellwig 	struct list_head finished_ios;	/* io_units whose data has settled in the log disk */
70a8c34f91SShaohua Li 	struct bio flush_bio;
71f6bed0efSShaohua Li 
72f6bed0efSShaohua Li 	struct kmem_cache *io_kc;
73f6bed0efSShaohua Li 
740576b1c6SShaohua Li 	struct md_thread *reclaim_thread;
750576b1c6SShaohua Li 	unsigned long reclaim_target;	/* number of sectors that need to be
760576b1c6SShaohua Li 					 * reclaimed.  if it's 0, reclaim the
770576b1c6SShaohua Li 					 * space used by io_units which are
780576b1c6SShaohua Li 					 * already in IO_UNIT_STRIPE_END state
790576b1c6SShaohua Li 					 * (i.e. reclaim doesn't wait for a
800576b1c6SShaohua Li 					 * specific io_unit to switch to
810576b1c6SShaohua Li 					 * IO_UNIT_STRIPE_END state) */
820fd22b45SShaohua Li 	wait_queue_head_t iounit_wait;
830576b1c6SShaohua Li 
84f6bed0efSShaohua Li 	struct list_head no_space_stripes; /* pending stripes, log has no space */
85f6bed0efSShaohua Li 	spinlock_t no_space_stripes_lock;
8656fef7c6SChristoph Hellwig 
8756fef7c6SChristoph Hellwig 	bool need_cache_flush;
88f6bed0efSShaohua Li };
89f6bed0efSShaohua Li 
90f6bed0efSShaohua Li /*
91f6bed0efSShaohua Li  * an IO range starts at a meta data block and ends at the next meta data
92f6bed0efSShaohua Li  * block. The io_unit's meta data block tracks the data/parity that follows
93f6bed0efSShaohua Li  * it. An io_unit is written to the log disk with normal writes; since we
94f6bed0efSShaohua Li  * always flush the log disk before we start moving data to the raid disks,
95f6bed0efSShaohua Li  * there is no requirement to write the io_unit with FLUSH/FUA.
96f6bed0efSShaohua Li  */
97f6bed0efSShaohua Li struct r5l_io_unit {
98f6bed0efSShaohua Li 	struct r5l_log *log;
99f6bed0efSShaohua Li 
100f6bed0efSShaohua Li 	struct page *meta_page;	/* store meta block */
101f6bed0efSShaohua Li 	int meta_offset;	/* current offset in meta_page */
102f6bed0efSShaohua Li 
103f6bed0efSShaohua Li 	struct bio_list bios;
104f6bed0efSShaohua Li 	atomic_t pending_io;	/* pending bios not written to log yet */
105f6bed0efSShaohua Li 	struct bio *current_bio;/* current_bio accepting new data */
106f6bed0efSShaohua Li 
107f6bed0efSShaohua Li 	atomic_t pending_stripe;/* how many stripes not flushed to raid */
108f6bed0efSShaohua Li 	u64 seq;		/* seq number of the metablock */
109f6bed0efSShaohua Li 	sector_t log_start;	/* where the io_unit starts */
110f6bed0efSShaohua Li 	sector_t log_end;	/* where the io_unit ends */
111f6bed0efSShaohua Li 	struct list_head log_sibling; /* log->running_ios */
112f6bed0efSShaohua Li 	struct list_head stripe_list; /* stripes added to the io_unit */
113f6bed0efSShaohua Li 
114f6bed0efSShaohua Li 	int state;
115f6bed0efSShaohua Li };
116f6bed0efSShaohua Li 
117f6bed0efSShaohua Li /* r5l_io_unit state */
118f6bed0efSShaohua Li enum r5l_io_unit_state {
119f6bed0efSShaohua Li 	IO_UNIT_RUNNING = 0,	/* accepting new IO */
120f6bed0efSShaohua Li 	IO_UNIT_IO_START = 1,	/* io_unit's bios have started writing to
121f6bed0efSShaohua Li 				 * the log; no new bios are accepted */
122f6bed0efSShaohua Li 	IO_UNIT_IO_END = 2,	/* io_unit's bios have finished writing to the log */
123a8c34f91SShaohua Li 	IO_UNIT_STRIPE_END = 3,	/* stripe data has finished writing to the raid */
124f6bed0efSShaohua Li };
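/*
 * An io_unit only moves forward through these states (RUNNING -> IO_START ->
 * IO_END -> STRIPE_END); __r5l_set_io_unit_state() below warns if a
 * transition would go backwards.
 */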
125f6bed0efSShaohua Li 
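/*
 * The log is a ring: advancing past device_size wraps around. For example,
 * with device_size == 1024 sectors, r5l_ring_add(log, 1016, 16) returns
 * 1032 - 1024 = 8.
 */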
126f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
127f6bed0efSShaohua Li {
128f6bed0efSShaohua Li 	start += inc;
129f6bed0efSShaohua Li 	if (start >= log->device_size)
130f6bed0efSShaohua Li 		start = start - log->device_size;
131f6bed0efSShaohua Li 	return start;
132f6bed0efSShaohua Li }
133f6bed0efSShaohua Li 
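/*
 * Distance from start to end along the ring. Continuing the example above,
 * r5l_ring_distance(log, 1016, 8) is 8 + 1024 - 1016 = 16 sectors.
 */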
134f6bed0efSShaohua Li static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
135f6bed0efSShaohua Li 				  sector_t end)
136f6bed0efSShaohua Li {
137f6bed0efSShaohua Li 	if (end >= start)
138f6bed0efSShaohua Li 		return end - start;
139f6bed0efSShaohua Li 	else
140f6bed0efSShaohua Li 		return end + log->device_size - start;
141f6bed0efSShaohua Li }
142f6bed0efSShaohua Li 
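/*
 * Note the strict '>': at least one block is always kept free, presumably so
 * that log_start never catches up with last_checkpoint and a full log stays
 * distinguishable from an empty one.
 */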
143f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
144f6bed0efSShaohua Li {
145f6bed0efSShaohua Li 	sector_t used_size;
146f6bed0efSShaohua Li 
147f6bed0efSShaohua Li 	used_size = r5l_ring_distance(log, log->last_checkpoint,
148f6bed0efSShaohua Li 					log->log_start);
149f6bed0efSShaohua Li 
150f6bed0efSShaohua Li 	return log->device_size > used_size + size;
151f6bed0efSShaohua Li }
152f6bed0efSShaohua Li 
153f6bed0efSShaohua Li static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
154f6bed0efSShaohua Li {
155f6bed0efSShaohua Li 	__free_page(io->meta_page);
156f6bed0efSShaohua Li 	kmem_cache_free(log->io_kc, io);
157f6bed0efSShaohua Li }
158f6bed0efSShaohua Li 
159f6bed0efSShaohua Li static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
160f6bed0efSShaohua Li 				  enum r5l_io_unit_state state)
161f6bed0efSShaohua Li {
162f6bed0efSShaohua Li 	struct r5l_io_unit *io;
163f6bed0efSShaohua Li 
164f6bed0efSShaohua Li 	while (!list_empty(from)) {
165f6bed0efSShaohua Li 		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
166f6bed0efSShaohua Li 		/* don't change list order */
167f6bed0efSShaohua Li 		if (io->state >= state)
168f6bed0efSShaohua Li 			list_move_tail(&io->log_sibling, to);
169f6bed0efSShaohua Li 		else
170f6bed0efSShaohua Li 			break;
171f6bed0efSShaohua Li 	}
172f6bed0efSShaohua Li }
173f6bed0efSShaohua Li 
174f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
175f6bed0efSShaohua Li 				    enum r5l_io_unit_state state)
176f6bed0efSShaohua Li {
177f6bed0efSShaohua Li 	if (WARN_ON(io->state >= state))
178f6bed0efSShaohua Li 		return;
179f6bed0efSShaohua Li 	io->state = state;
180f6bed0efSShaohua Li }
181f6bed0efSShaohua Li 
182d8858f43SChristoph Hellwig static void r5l_io_run_stripes(struct r5l_io_unit *io)
183d8858f43SChristoph Hellwig {
184d8858f43SChristoph Hellwig 	struct stripe_head *sh, *next;
185d8858f43SChristoph Hellwig 
186d8858f43SChristoph Hellwig 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
187d8858f43SChristoph Hellwig 		list_del_init(&sh->log_list);
188d8858f43SChristoph Hellwig 		set_bit(STRIPE_HANDLE, &sh->state);
189d8858f43SChristoph Hellwig 		raid5_release_stripe(sh);
190d8858f43SChristoph Hellwig 	}
191d8858f43SChristoph Hellwig }
192d8858f43SChristoph Hellwig 
193f6bed0efSShaohua Li /* XXX: totally ignores I/O errors */
19456fef7c6SChristoph Hellwig static void r5l_log_run_stripes(struct r5l_log *log)
19556fef7c6SChristoph Hellwig {
19656fef7c6SChristoph Hellwig 	struct r5l_io_unit *io, *next;
19756fef7c6SChristoph Hellwig 
19856fef7c6SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
19956fef7c6SChristoph Hellwig 
20056fef7c6SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
20156fef7c6SChristoph Hellwig 		/* don't change list order */
20256fef7c6SChristoph Hellwig 		if (io->state < IO_UNIT_IO_END)
20356fef7c6SChristoph Hellwig 			break;
20456fef7c6SChristoph Hellwig 
20556fef7c6SChristoph Hellwig 		list_move_tail(&io->log_sibling, &log->finished_ios);
20656fef7c6SChristoph Hellwig 		r5l_io_run_stripes(io);
20756fef7c6SChristoph Hellwig 	}
20856fef7c6SChristoph Hellwig }
20956fef7c6SChristoph Hellwig 
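/*
 * Completion path for log writes. If the log device has a volatile write
 * cache (need_cache_flush), finished io_units are parked on io_end_ios and
 * the woken raid5d thread later issues the cache flush via
 * r5l_flush_stripe_to_raid() before the stripes go to the raid disks;
 * otherwise the stripes can be run right away.
 */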
210f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio)
211f6bed0efSShaohua Li {
212f6bed0efSShaohua Li 	struct r5l_io_unit *io = bio->bi_private;
213f6bed0efSShaohua Li 	struct r5l_log *log = io->log;
214509ffec7SChristoph Hellwig 	unsigned long flags;
215f6bed0efSShaohua Li 
216f6bed0efSShaohua Li 	bio_put(bio);
217f6bed0efSShaohua Li 
218f6bed0efSShaohua Li 	if (!atomic_dec_and_test(&io->pending_io))
219f6bed0efSShaohua Li 		return;
220f6bed0efSShaohua Li 
221509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
222509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
22356fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
224509ffec7SChristoph Hellwig 		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
225509ffec7SChristoph Hellwig 				      IO_UNIT_IO_END);
22656fef7c6SChristoph Hellwig 	else
22756fef7c6SChristoph Hellwig 		r5l_log_run_stripes(log);
228509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
229509ffec7SChristoph Hellwig 
23056fef7c6SChristoph Hellwig 	if (log->need_cache_flush)
231f6bed0efSShaohua Li 		md_wakeup_thread(log->rdev->mddev->thread);
232f6bed0efSShaohua Li }
233f6bed0efSShaohua Li 
234f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log)
235f6bed0efSShaohua Li {
236f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
237f6bed0efSShaohua Li 	struct r5l_meta_block *block;
238f6bed0efSShaohua Li 	struct bio *bio;
239509ffec7SChristoph Hellwig 	unsigned long flags;
240f6bed0efSShaohua Li 	u32 crc;
241f6bed0efSShaohua Li 
242f6bed0efSShaohua Li 	if (!io)
243f6bed0efSShaohua Li 		return;
244f6bed0efSShaohua Li 
245f6bed0efSShaohua Li 	block = page_address(io->meta_page);
246f6bed0efSShaohua Li 	block->meta_size = cpu_to_le32(io->meta_offset);
2475cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
248f6bed0efSShaohua Li 	block->checksum = cpu_to_le32(crc);
249f6bed0efSShaohua Li 
250f6bed0efSShaohua Li 	log->current_io = NULL;
251509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
252509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
253509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
254f6bed0efSShaohua Li 
2551e932a37SChristoph Hellwig 	while ((bio = bio_list_pop(&io->bios)))
256f6bed0efSShaohua Li 		submit_bio(WRITE, bio);
257f6bed0efSShaohua Li }
258f6bed0efSShaohua Li 
259b349feb3SChristoph Hellwig static struct bio *r5l_bio_alloc(struct r5l_log *log, struct r5l_io_unit *io)
260b349feb3SChristoph Hellwig {
261b349feb3SChristoph Hellwig 	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
262b349feb3SChristoph Hellwig 
263b349feb3SChristoph Hellwig 	bio->bi_rw = WRITE;
264b349feb3SChristoph Hellwig 	bio->bi_bdev = log->rdev->bdev;
2651e932a37SChristoph Hellwig 	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
266b349feb3SChristoph Hellwig 	bio->bi_end_io = r5l_log_endio;
267b349feb3SChristoph Hellwig 	bio->bi_private = io;
268b349feb3SChristoph Hellwig 
269b349feb3SChristoph Hellwig 	bio_list_add(&io->bios, bio);
270b349feb3SChristoph Hellwig 	atomic_inc(&io->pending_io);
271b349feb3SChristoph Hellwig 	return bio;
272b349feb3SChristoph Hellwig }
273b349feb3SChristoph Hellwig 
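/*
 * Start a new io_unit: allocate a zeroed 4k meta page, fill in the
 * r5l_meta_block header (magic, version, seq, position) and make it the
 * first page of a fresh log bio. The checksum is filled in later by
 * r5l_submit_current_io() once meta_size is final.
 */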
274f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
275f6bed0efSShaohua Li {
276f6bed0efSShaohua Li 	struct r5l_io_unit *io;
277f6bed0efSShaohua Li 	struct r5l_meta_block *block;
278f6bed0efSShaohua Li 
279*51039cd0SChristoph Hellwig 	/* We can't handle memory allocation failure so far */
280*51039cd0SChristoph Hellwig 	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
281*51039cd0SChristoph Hellwig 	io->log = log;
282*51039cd0SChristoph Hellwig 	bio_list_init(&io->bios);
283*51039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->log_sibling);
284*51039cd0SChristoph Hellwig 	INIT_LIST_HEAD(&io->stripe_list);
285*51039cd0SChristoph Hellwig 	io->state = IO_UNIT_RUNNING;
286f6bed0efSShaohua Li 
287*51039cd0SChristoph Hellwig 	io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
288f6bed0efSShaohua Li 	block = page_address(io->meta_page);
289f6bed0efSShaohua Li 	block->magic = cpu_to_le32(R5LOG_MAGIC);
290f6bed0efSShaohua Li 	block->version = R5LOG_VERSION;
291f6bed0efSShaohua Li 	block->seq = cpu_to_le64(log->seq);
292f6bed0efSShaohua Li 	block->position = cpu_to_le64(log->log_start);
293f6bed0efSShaohua Li 
294f6bed0efSShaohua Li 	io->log_start = log->log_start;
295f6bed0efSShaohua Li 	io->meta_offset = sizeof(struct r5l_meta_block);
296f6bed0efSShaohua Li 	io->seq = log->seq;
297f6bed0efSShaohua Li 
298b349feb3SChristoph Hellwig 	io->current_bio = r5l_bio_alloc(log, io);
299b349feb3SChristoph Hellwig 	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
300f6bed0efSShaohua Li 
301f6bed0efSShaohua Li 	log->seq++;
302f6bed0efSShaohua Li 	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
303f6bed0efSShaohua Li 	io->log_end = log->log_start;
304f6bed0efSShaohua Li 	/* current bio hit the disk end; a bio can't wrap, so start a new one */
305f6bed0efSShaohua Li 	if (log->log_start == 0)
306f6bed0efSShaohua Li 		io->current_bio = NULL;
307f6bed0efSShaohua Li 
308f6bed0efSShaohua Li 	spin_lock_irq(&log->io_list_lock);
309f6bed0efSShaohua Li 	list_add_tail(&io->log_sibling, &log->running_ios);
310f6bed0efSShaohua Li 	spin_unlock_irq(&log->io_list_lock);
311f6bed0efSShaohua Li 
312f6bed0efSShaohua Li 	return io;
313f6bed0efSShaohua Li }
314f6bed0efSShaohua Li 
315f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
316f6bed0efSShaohua Li {
31722581f58SChristoph Hellwig 	if (log->current_io &&
31822581f58SChristoph Hellwig 	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
319f6bed0efSShaohua Li 		r5l_submit_current_io(log);
320f6bed0efSShaohua Li 
32122581f58SChristoph Hellwig 	if (!log->current_io)
322f6bed0efSShaohua Li 		log->current_io = r5l_new_meta(log);
323f6bed0efSShaohua Li 	return 0;
324f6bed0efSShaohua Li }
325f6bed0efSShaohua Li 
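/*
 * Append one payload descriptor to the current meta block: type (data or
 * parity), size in sectors, logical location and one checksum per page.
 * checksum2_valid is set for RAID6 parity, where the payload covers both
 * the P and Q pages and thus carries two checksums.
 */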
326f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
327f6bed0efSShaohua Li 				    sector_t location,
328f6bed0efSShaohua Li 				    u32 checksum1, u32 checksum2,
329f6bed0efSShaohua Li 				    bool checksum2_valid)
330f6bed0efSShaohua Li {
331f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
332f6bed0efSShaohua Li 	struct r5l_payload_data_parity *payload;
333f6bed0efSShaohua Li 
334f6bed0efSShaohua Li 	payload = page_address(io->meta_page) + io->meta_offset;
335f6bed0efSShaohua Li 	payload->header.type = cpu_to_le16(type);
336f6bed0efSShaohua Li 	payload->header.flags = cpu_to_le16(0);
337f6bed0efSShaohua Li 	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
338f6bed0efSShaohua Li 				    (PAGE_SHIFT - 9));
339f6bed0efSShaohua Li 	payload->location = cpu_to_le64(location);
340f6bed0efSShaohua Li 	payload->checksum[0] = cpu_to_le32(checksum1);
341f6bed0efSShaohua Li 	if (checksum2_valid)
342f6bed0efSShaohua Li 		payload->checksum[1] = cpu_to_le32(checksum2);
343f6bed0efSShaohua Li 
344f6bed0efSShaohua Li 	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
345f6bed0efSShaohua Li 		sizeof(__le32) * (1 + !!checksum2_valid);
346f6bed0efSShaohua Li }
347f6bed0efSShaohua Li 
348f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
349f6bed0efSShaohua Li {
350f6bed0efSShaohua Li 	struct r5l_io_unit *io = log->current_io;
351f6bed0efSShaohua Li 
352f6bed0efSShaohua Li alloc_bio:
353b349feb3SChristoph Hellwig 	if (!io->current_bio)
354b349feb3SChristoph Hellwig 		io->current_bio = r5l_bio_alloc(log, io);
355f6bed0efSShaohua Li 
356f6bed0efSShaohua Li 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
357f6bed0efSShaohua Li 		io->current_bio = NULL;
358f6bed0efSShaohua Li 		goto alloc_bio;
359f6bed0efSShaohua Li 	}
360f6bed0efSShaohua Li 	log->log_start = r5l_ring_add(log, log->log_start,
361f6bed0efSShaohua Li 				      BLOCK_SECTORS);
362f6bed0efSShaohua Li 	/* current bio hit the disk end; a bio can't wrap, so start a new one */
363f6bed0efSShaohua Li 	if (log->log_start == 0)
364f6bed0efSShaohua Li 		io->current_bio = NULL;
365f6bed0efSShaohua Li 
366f6bed0efSShaohua Li 	io->log_end = log->log_start;
367f6bed0efSShaohua Li }
368f6bed0efSShaohua Li 
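/*
 * meta_size below mirrors the check in r5l_write_stripe(): one payload
 * descriptor plus one checksum per data page, and a single descriptor with
 * one checksum per parity page for the parity. r5l_get_meta() starts a new
 * io_unit if this doesn't fit in the current meta page.
 */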
369f6bed0efSShaohua Li static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
370f6bed0efSShaohua Li 			   int data_pages, int parity_pages)
371f6bed0efSShaohua Li {
372f6bed0efSShaohua Li 	int i;
373f6bed0efSShaohua Li 	int meta_size;
374f6bed0efSShaohua Li 	struct r5l_io_unit *io;
375f6bed0efSShaohua Li 
376f6bed0efSShaohua Li 	meta_size =
377f6bed0efSShaohua Li 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
378f6bed0efSShaohua Li 		 * data_pages) +
379f6bed0efSShaohua Li 		sizeof(struct r5l_payload_data_parity) +
380f6bed0efSShaohua Li 		sizeof(__le32) * parity_pages;
381f6bed0efSShaohua Li 
382f6bed0efSShaohua Li 	r5l_get_meta(log, meta_size);
383f6bed0efSShaohua Li 	io = log->current_io;
384f6bed0efSShaohua Li 
385f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
386f6bed0efSShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
387f6bed0efSShaohua Li 			continue;
388f6bed0efSShaohua Li 		if (i == sh->pd_idx || i == sh->qd_idx)
389f6bed0efSShaohua Li 			continue;
390f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
391f6bed0efSShaohua Li 					raid5_compute_blocknr(sh, i, 0),
392f6bed0efSShaohua Li 					sh->dev[i].log_checksum, 0, false);
393f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[i].page);
394f6bed0efSShaohua Li 	}
395f6bed0efSShaohua Li 
396f6bed0efSShaohua Li 	if (sh->qd_idx >= 0) {
397f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
398f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
399f6bed0efSShaohua Li 					sh->dev[sh->qd_idx].log_checksum, true);
400f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
401f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
402f6bed0efSShaohua Li 	} else {
403f6bed0efSShaohua Li 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
404f6bed0efSShaohua Li 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
405f6bed0efSShaohua Li 					0, false);
406f6bed0efSShaohua Li 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
407f6bed0efSShaohua Li 	}
408f6bed0efSShaohua Li 
409f6bed0efSShaohua Li 	list_add_tail(&sh->log_list, &io->stripe_list);
410f6bed0efSShaohua Li 	atomic_inc(&io->pending_stripe);
411f6bed0efSShaohua Li 	sh->log_io = io;
412f6bed0efSShaohua Li }
413f6bed0efSShaohua Li 
414509ffec7SChristoph Hellwig static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
415f6bed0efSShaohua Li /*
416f6bed0efSShaohua Li  * This runs in raid5d, and reclaim could wait for raid5d too (when it flushes
417f6bed0efSShaohua Li  * data from the log to the raid disks), so we shouldn't wait for reclaim here.
418f6bed0efSShaohua Li  */
419f6bed0efSShaohua Li int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
420f6bed0efSShaohua Li {
421f6bed0efSShaohua Li 	int write_disks = 0;
422f6bed0efSShaohua Li 	int data_pages, parity_pages;
423f6bed0efSShaohua Li 	int meta_size;
424f6bed0efSShaohua Li 	int reserve;
425f6bed0efSShaohua Li 	int i;
426f6bed0efSShaohua Li 
427f6bed0efSShaohua Li 	if (!log)
428f6bed0efSShaohua Li 		return -EAGAIN;
429f6bed0efSShaohua Li 	/* Don't support stripe batch */
430f6bed0efSShaohua Li 	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
431f6bed0efSShaohua Li 	    test_bit(STRIPE_SYNCING, &sh->state)) {
432f6bed0efSShaohua Li 		/* the stripe has been written to the log; start writing it to the raid */
433f6bed0efSShaohua Li 		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
434f6bed0efSShaohua Li 		return -EAGAIN;
435f6bed0efSShaohua Li 	}
436f6bed0efSShaohua Li 
437f6bed0efSShaohua Li 	for (i = 0; i < sh->disks; i++) {
438f6bed0efSShaohua Li 		void *addr;
439f6bed0efSShaohua Li 
440f6bed0efSShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
441f6bed0efSShaohua Li 			continue;
442f6bed0efSShaohua Li 		write_disks++;
443f6bed0efSShaohua Li 		/* checksum was already calculated in the last run */
444f6bed0efSShaohua Li 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
445f6bed0efSShaohua Li 			continue;
446f6bed0efSShaohua Li 		addr = kmap_atomic(sh->dev[i].page);
4475cb2fbd6SShaohua Li 		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
448f6bed0efSShaohua Li 						    addr, PAGE_SIZE);
449f6bed0efSShaohua Li 		kunmap_atomic(addr);
450f6bed0efSShaohua Li 	}
451f6bed0efSShaohua Li 	parity_pages = 1 + !!(sh->qd_idx >= 0);
452f6bed0efSShaohua Li 	data_pages = write_disks - parity_pages;
453f6bed0efSShaohua Li 
454f6bed0efSShaohua Li 	meta_size =
455f6bed0efSShaohua Li 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
456f6bed0efSShaohua Li 		 * data_pages) +
457f6bed0efSShaohua Li 		sizeof(struct r5l_payload_data_parity) +
458f6bed0efSShaohua Li 		sizeof(__le32) * parity_pages;
459f6bed0efSShaohua Li 	/* all metadata must fit in one page; doesn't work with a very big raid array */
460f6bed0efSShaohua Li 	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
461f6bed0efSShaohua Li 		return -EINVAL;
462f6bed0efSShaohua Li 
463f6bed0efSShaohua Li 	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
464253f9fd4SShaohua Li 	/*
465253f9fd4SShaohua Li 	 * The stripe must enter state machine again to finish the write, so
466253f9fd4SShaohua Li 	 * don't delay.
467253f9fd4SShaohua Li 	 */
468253f9fd4SShaohua Li 	clear_bit(STRIPE_DELAYED, &sh->state);
469f6bed0efSShaohua Li 	atomic_inc(&sh->count);
470f6bed0efSShaohua Li 
471f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
472f6bed0efSShaohua Li 	/* meta + data */
473f6bed0efSShaohua Li 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
474f6bed0efSShaohua Li 	if (r5l_has_free_space(log, reserve))
475f6bed0efSShaohua Li 		r5l_log_stripe(log, sh, data_pages, parity_pages);
476f6bed0efSShaohua Li 	else {
477f6bed0efSShaohua Li 		spin_lock(&log->no_space_stripes_lock);
478f6bed0efSShaohua Li 		list_add_tail(&sh->log_list, &log->no_space_stripes);
479f6bed0efSShaohua Li 		spin_unlock(&log->no_space_stripes_lock);
480f6bed0efSShaohua Li 
481f6bed0efSShaohua Li 		r5l_wake_reclaim(log, reserve);
482f6bed0efSShaohua Li 	}
483f6bed0efSShaohua Li 	mutex_unlock(&log->io_mutex);
484f6bed0efSShaohua Li 
485f6bed0efSShaohua Li 	return 0;
486f6bed0efSShaohua Li }
487f6bed0efSShaohua Li 
488f6bed0efSShaohua Li void r5l_write_stripe_run(struct r5l_log *log)
489f6bed0efSShaohua Li {
490f6bed0efSShaohua Li 	if (!log)
491f6bed0efSShaohua Li 		return;
492f6bed0efSShaohua Li 	mutex_lock(&log->io_mutex);
493f6bed0efSShaohua Li 	r5l_submit_current_io(log);
494f6bed0efSShaohua Li 	mutex_unlock(&log->io_mutex);
495f6bed0efSShaohua Li }
496f6bed0efSShaohua Li 
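/*
 * Return convention: 0 means the flush bio was completed here, -ENODEV means
 * no log is configured, and -EAGAIN means the caller should continue
 * processing the bio (with REQ_FLUSH stripped) through the normal write path.
 */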
497828cbe98SShaohua Li int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
498828cbe98SShaohua Li {
499828cbe98SShaohua Li 	if (!log)
500828cbe98SShaohua Li 		return -ENODEV;
501828cbe98SShaohua Li 	/*
502828cbe98SShaohua Li 	 * we flush the log disk cache first, then write stripe data to the raid
503828cbe98SShaohua Li 	 * disks. So if this bio is finished, the log disk cache has already been
504828cbe98SShaohua Li 	 * flushed. Recovery guarantees we can recover the bio's data from the
505828cbe98SShaohua Li 	 * log disk, so we don't need to flush again.
506828cbe98SShaohua Li 	 */
507828cbe98SShaohua Li 	if (bio->bi_iter.bi_size == 0) {
508828cbe98SShaohua Li 		bio_endio(bio);
509828cbe98SShaohua Li 		return 0;
510828cbe98SShaohua Li 	}
511828cbe98SShaohua Li 	bio->bi_rw &= ~REQ_FLUSH;
512828cbe98SShaohua Li 	return -EAGAIN;
513828cbe98SShaohua Li }
514828cbe98SShaohua Li 
515f6bed0efSShaohua Li /* This will run after log space is reclaimed */
516f6bed0efSShaohua Li static void r5l_run_no_space_stripes(struct r5l_log *log)
517f6bed0efSShaohua Li {
518f6bed0efSShaohua Li 	struct stripe_head *sh;
519f6bed0efSShaohua Li 
520f6bed0efSShaohua Li 	spin_lock(&log->no_space_stripes_lock);
521f6bed0efSShaohua Li 	while (!list_empty(&log->no_space_stripes)) {
522f6bed0efSShaohua Li 		sh = list_first_entry(&log->no_space_stripes,
523f6bed0efSShaohua Li 				      struct stripe_head, log_list);
524f6bed0efSShaohua Li 		list_del_init(&sh->log_list);
525f6bed0efSShaohua Li 		set_bit(STRIPE_HANDLE, &sh->state);
526f6bed0efSShaohua Li 		raid5_release_stripe(sh);
527f6bed0efSShaohua Li 	}
528f6bed0efSShaohua Li 	spin_unlock(&log->no_space_stripes_lock);
529f6bed0efSShaohua Li }
530f6bed0efSShaohua Li 
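/*
 * Space between the log tail (last_checkpoint) and the start of the oldest
 * io_unit whose stripes are not yet all on the raid disks (next_checkpoint);
 * everything in that range can be reused.
 */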
53117036461SChristoph Hellwig static sector_t r5l_reclaimable_space(struct r5l_log *log)
53217036461SChristoph Hellwig {
53317036461SChristoph Hellwig 	return r5l_ring_distance(log, log->last_checkpoint,
53417036461SChristoph Hellwig 				 log->next_checkpoint);
53517036461SChristoph Hellwig }
53617036461SChristoph Hellwig 
53704732f74SChristoph Hellwig static bool r5l_complete_finished_ios(struct r5l_log *log)
53817036461SChristoph Hellwig {
53917036461SChristoph Hellwig 	struct r5l_io_unit *io, *next;
54017036461SChristoph Hellwig 	bool found = false;
54117036461SChristoph Hellwig 
54217036461SChristoph Hellwig 	assert_spin_locked(&log->io_list_lock);
54317036461SChristoph Hellwig 
54404732f74SChristoph Hellwig 	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
54517036461SChristoph Hellwig 		/* don't change list order */
54617036461SChristoph Hellwig 		if (io->state < IO_UNIT_STRIPE_END)
54717036461SChristoph Hellwig 			break;
54817036461SChristoph Hellwig 
54917036461SChristoph Hellwig 		log->next_checkpoint = io->log_start;
55017036461SChristoph Hellwig 		log->next_cp_seq = io->seq;
55117036461SChristoph Hellwig 
55217036461SChristoph Hellwig 		list_del(&io->log_sibling);
55317036461SChristoph Hellwig 		r5l_free_io_unit(log, io);
55417036461SChristoph Hellwig 
55517036461SChristoph Hellwig 		found = true;
55617036461SChristoph Hellwig 	}
55717036461SChristoph Hellwig 
55817036461SChristoph Hellwig 	return found;
55917036461SChristoph Hellwig }
56017036461SChristoph Hellwig 
561509ffec7SChristoph Hellwig static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
562509ffec7SChristoph Hellwig {
563509ffec7SChristoph Hellwig 	struct r5l_log *log = io->log;
564509ffec7SChristoph Hellwig 	unsigned long flags;
565509ffec7SChristoph Hellwig 
566509ffec7SChristoph Hellwig 	spin_lock_irqsave(&log->io_list_lock, flags);
567509ffec7SChristoph Hellwig 	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
56817036461SChristoph Hellwig 
56904732f74SChristoph Hellwig 	if (!r5l_complete_finished_ios(log)) {
57085f2f9a4SShaohua Li 		spin_unlock_irqrestore(&log->io_list_lock, flags);
57185f2f9a4SShaohua Li 		return;
57285f2f9a4SShaohua Li 	}
573509ffec7SChristoph Hellwig 
57417036461SChristoph Hellwig 	if (r5l_reclaimable_space(log) > log->max_free_space)
575509ffec7SChristoph Hellwig 		r5l_wake_reclaim(log, 0);
576509ffec7SChristoph Hellwig 
577509ffec7SChristoph Hellwig 	spin_unlock_irqrestore(&log->io_list_lock, flags);
578509ffec7SChristoph Hellwig 	wake_up(&log->iounit_wait);
579509ffec7SChristoph Hellwig }
580509ffec7SChristoph Hellwig 
5810576b1c6SShaohua Li void r5l_stripe_write_finished(struct stripe_head *sh)
5820576b1c6SShaohua Li {
5830576b1c6SShaohua Li 	struct r5l_io_unit *io;
5840576b1c6SShaohua Li 
5850576b1c6SShaohua Li 	io = sh->log_io;
5860576b1c6SShaohua Li 	sh->log_io = NULL;
5870576b1c6SShaohua Li 
588509ffec7SChristoph Hellwig 	if (io && atomic_dec_and_test(&io->pending_stripe))
589509ffec7SChristoph Hellwig 		__r5l_stripe_write_finished(io);
5900576b1c6SShaohua Li }
5910576b1c6SShaohua Li 
592a8c34f91SShaohua Li static void r5l_log_flush_endio(struct bio *bio)
593a8c34f91SShaohua Li {
594a8c34f91SShaohua Li 	struct r5l_log *log = container_of(bio, struct r5l_log,
595a8c34f91SShaohua Li 		flush_bio);
596a8c34f91SShaohua Li 	unsigned long flags;
597a8c34f91SShaohua Li 	struct r5l_io_unit *io;
598a8c34f91SShaohua Li 
599a8c34f91SShaohua Li 	spin_lock_irqsave(&log->io_list_lock, flags);
600d8858f43SChristoph Hellwig 	list_for_each_entry(io, &log->flushing_ios, log_sibling)
601d8858f43SChristoph Hellwig 		r5l_io_run_stripes(io);
60204732f74SChristoph Hellwig 	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
603a8c34f91SShaohua Li 	spin_unlock_irqrestore(&log->io_list_lock, flags);
604a8c34f91SShaohua Li }
605a8c34f91SShaohua Li 
6060576b1c6SShaohua Li /*
6070576b1c6SShaohua Li  * Start dispatching IO to the raid.
6080576b1c6SShaohua Li  * The log consists of io_units, each led by a meta block. There is one
6090576b1c6SShaohua Li  * situation we want to avoid: a broken meta block in the middle of the log
6100576b1c6SShaohua Li  * prevents recovery from finding the meta blocks at the head of the log. So
6110576b1c6SShaohua Li  * if an operation requires a meta block at the head to be persistent in the
6120576b1c6SShaohua Li  * log, we must make sure every meta block before it is persistent too. A
6120576b1c6SShaohua Li  * case is:
6130576b1c6SShaohua Li  *
6140576b1c6SShaohua Li  * stripe data/parity is in the log and we start writing the stripe to the
6150576b1c6SShaohua Li  * raid disks. The stripe data/parity must be persistent in the log before we
6150576b1c6SShaohua Li  * do the write to the raid disks.
6160576b1c6SShaohua Li  *
6170576b1c6SShaohua Li  * The solution is to strictly maintain the io_unit list order: we only write
6180576b1c6SShaohua Li  * the stripes of an io_unit to the raid disks once it and all io_units
6190576b1c6SShaohua Li  * before it have their data/parity in the log.
6190576b1c6SShaohua Li  */
6200576b1c6SShaohua Li void r5l_flush_stripe_to_raid(struct r5l_log *log)
6210576b1c6SShaohua Li {
622a8c34f91SShaohua Li 	bool do_flush;
62356fef7c6SChristoph Hellwig 
62456fef7c6SChristoph Hellwig 	if (!log || !log->need_cache_flush)
6250576b1c6SShaohua Li 		return;
6260576b1c6SShaohua Li 
627a8c34f91SShaohua Li 	spin_lock_irq(&log->io_list_lock);
628a8c34f91SShaohua Li 	/* flush bio is running */
629a8c34f91SShaohua Li 	if (!list_empty(&log->flushing_ios)) {
630a8c34f91SShaohua Li 		spin_unlock_irq(&log->io_list_lock);
6310576b1c6SShaohua Li 		return;
6320576b1c6SShaohua Li 	}
633a8c34f91SShaohua Li 	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
634a8c34f91SShaohua Li 	do_flush = !list_empty(&log->flushing_ios);
6350576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
636a8c34f91SShaohua Li 
637a8c34f91SShaohua Li 	if (!do_flush)
638a8c34f91SShaohua Li 		return;
639a8c34f91SShaohua Li 	bio_reset(&log->flush_bio);
640a8c34f91SShaohua Li 	log->flush_bio.bi_bdev = log->rdev->bdev;
641a8c34f91SShaohua Li 	log->flush_bio.bi_end_io = r5l_log_flush_endio;
642a8c34f91SShaohua Li 	submit_bio(WRITE_FLUSH, &log->flush_bio);
6430576b1c6SShaohua Li }
6440576b1c6SShaohua Li 
6450576b1c6SShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp);
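/*
 * Reclaim sequence: wait until enough io_units reach IO_UNIT_STRIPE_END,
 * write the superblock with the new checkpoint so recovery never scans the
 * soon-to-be-reused area, advance last_checkpoint/last_cp_seq, then retry
 * stripes that previously failed for lack of log space.
 */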
6460576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log)
6470576b1c6SShaohua Li {
6480576b1c6SShaohua Li 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
64917036461SChristoph Hellwig 	sector_t reclaimable;
65017036461SChristoph Hellwig 	sector_t next_checkpoint;
65117036461SChristoph Hellwig 	u64 next_cp_seq;
6520576b1c6SShaohua Li 
6530576b1c6SShaohua Li 	spin_lock_irq(&log->io_list_lock);
6540576b1c6SShaohua Li 	/*
6550576b1c6SShaohua Li 	 * wait until enough space becomes reclaimable. We must not change the
6560576b1c6SShaohua Li 	 * order: reclaimable and unreclaimable io_units can be mixed in the
6570576b1c6SShaohua Li 	 * list, and we must not reuse the space of an unreclaimable io_unit.
6580576b1c6SShaohua Li 	 */
6590576b1c6SShaohua Li 	while (1) {
66017036461SChristoph Hellwig 		reclaimable = r5l_reclaimable_space(log);
66117036461SChristoph Hellwig 		if (reclaimable >= reclaim_target ||
6620576b1c6SShaohua Li 		    (list_empty(&log->running_ios) &&
6630576b1c6SShaohua Li 		     list_empty(&log->io_end_ios) &&
664a8c34f91SShaohua Li 		     list_empty(&log->flushing_ios) &&
66504732f74SChristoph Hellwig 		     list_empty(&log->finished_ios)))
6660576b1c6SShaohua Li 			break;
6670576b1c6SShaohua Li 
66817036461SChristoph Hellwig 		md_wakeup_thread(log->rdev->mddev->thread);
66917036461SChristoph Hellwig 		wait_event_lock_irq(log->iounit_wait,
67017036461SChristoph Hellwig 				    r5l_reclaimable_space(log) > reclaimable,
67117036461SChristoph Hellwig 				    log->io_list_lock);
6720576b1c6SShaohua Li 	}
67317036461SChristoph Hellwig 
67417036461SChristoph Hellwig 	next_checkpoint = log->next_checkpoint;
67517036461SChristoph Hellwig 	next_cp_seq = log->next_cp_seq;
6760576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
6770576b1c6SShaohua Li 
67817036461SChristoph Hellwig 	BUG_ON(reclaimable < 0);
67917036461SChristoph Hellwig 	if (reclaimable == 0)
6800576b1c6SShaohua Li 		return;
6810576b1c6SShaohua Li 
6820576b1c6SShaohua Li 	/*
6830576b1c6SShaohua Li 	 * write_super will flush the cache of each raid disk. We must write the
6840576b1c6SShaohua Li 	 * super here, because the log area might be reused soon and we don't
6850576b1c6SShaohua Li 	 * want to confuse recovery.
6860576b1c6SShaohua Li 	 */
68717036461SChristoph Hellwig 	r5l_write_super(log, next_checkpoint);
6880576b1c6SShaohua Li 
6890576b1c6SShaohua Li 	mutex_lock(&log->io_mutex);
69017036461SChristoph Hellwig 	log->last_checkpoint = next_checkpoint;
69117036461SChristoph Hellwig 	log->last_cp_seq = next_cp_seq;
6920576b1c6SShaohua Li 	mutex_unlock(&log->io_mutex);
6930576b1c6SShaohua Li 
69417036461SChristoph Hellwig 	r5l_run_no_space_stripes(log);
6950576b1c6SShaohua Li }
6960576b1c6SShaohua Li 
6970576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread)
6980576b1c6SShaohua Li {
6990576b1c6SShaohua Li 	struct mddev *mddev = thread->mddev;
7000576b1c6SShaohua Li 	struct r5conf *conf = mddev->private;
7010576b1c6SShaohua Li 	struct r5l_log *log = conf->log;
7020576b1c6SShaohua Li 
7030576b1c6SShaohua Li 	if (!log)
7040576b1c6SShaohua Li 		return;
7050576b1c6SShaohua Li 	r5l_do_reclaim(log);
7060576b1c6SShaohua Li }
7070576b1c6SShaohua Li 
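/*
 * Raise the reclaim target monotonically without a lock: concurrent callers
 * race via cmpxchg(), and a request smaller than the already-pending target
 * is simply dropped.
 */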
708f6bed0efSShaohua Li static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
709f6bed0efSShaohua Li {
7100576b1c6SShaohua Li 	unsigned long target;
7110576b1c6SShaohua Li 	unsigned long new = (unsigned long)space; /* overflow in theory */
7120576b1c6SShaohua Li 
7130576b1c6SShaohua Li 	do {
7140576b1c6SShaohua Li 		target = log->reclaim_target;
7150576b1c6SShaohua Li 		if (new < target)
7160576b1c6SShaohua Li 			return;
7170576b1c6SShaohua Li 	} while (cmpxchg(&log->reclaim_target, target, new) != target);
7180576b1c6SShaohua Li 	md_wakeup_thread(log->reclaim_thread);
719f6bed0efSShaohua Li }
720f6bed0efSShaohua Li 
721e6c033f7SShaohua Li void r5l_quiesce(struct r5l_log *log, int state)
722e6c033f7SShaohua Li {
723e6c033f7SShaohua Li 	if (!log || state == 2)
724e6c033f7SShaohua Li 		return;
725e6c033f7SShaohua Li 	if (state == 0) {
726e6c033f7SShaohua Li 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
727e6c033f7SShaohua Li 					log->rdev->mddev, "reclaim");
728e6c033f7SShaohua Li 	} else if (state == 1) {
729e6c033f7SShaohua Li 		/*
730e6c033f7SShaohua Li 		 * at this point all stripes are finished, so every io_unit is
731e6c033f7SShaohua Li 		 * at least in IO_UNIT_STRIPE_END state
732e6c033f7SShaohua Li 		 */
733e6c033f7SShaohua Li 		r5l_wake_reclaim(log, -1L);
734e6c033f7SShaohua Li 		md_unregister_thread(&log->reclaim_thread);
735e6c033f7SShaohua Li 		r5l_do_reclaim(log);
736e6c033f7SShaohua Li 	}
737e6c033f7SShaohua Li }
738e6c033f7SShaohua Li 
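/*
 * Recovery walks the log from the last checkpoint, validating each meta
 * block (magic, version, expected seq and position, crc32c) and replaying
 * the data/parity pages that follow it onto the raid disks.
 */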
739355810d1SShaohua Li struct r5l_recovery_ctx {
740355810d1SShaohua Li 	struct page *meta_page;		/* current meta */
741355810d1SShaohua Li 	sector_t meta_total_blocks;	/* total size of current meta and data */
742355810d1SShaohua Li 	sector_t pos;			/* recovery position */
743355810d1SShaohua Li 	u64 seq;			/* recovery position seq */
744355810d1SShaohua Li };
745355810d1SShaohua Li 
746355810d1SShaohua Li static int r5l_read_meta_block(struct r5l_log *log,
747355810d1SShaohua Li 			       struct r5l_recovery_ctx *ctx)
748355810d1SShaohua Li {
749355810d1SShaohua Li 	struct page *page = ctx->meta_page;
750355810d1SShaohua Li 	struct r5l_meta_block *mb;
751355810d1SShaohua Li 	u32 crc, stored_crc;
752355810d1SShaohua Li 
753355810d1SShaohua Li 	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
754355810d1SShaohua Li 		return -EIO;
755355810d1SShaohua Li 
756355810d1SShaohua Li 	mb = page_address(page);
757355810d1SShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
758355810d1SShaohua Li 	mb->checksum = 0;
759355810d1SShaohua Li 
760355810d1SShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
761355810d1SShaohua Li 	    le64_to_cpu(mb->seq) != ctx->seq ||
762355810d1SShaohua Li 	    mb->version != R5LOG_VERSION ||
763355810d1SShaohua Li 	    le64_to_cpu(mb->position) != ctx->pos)
764355810d1SShaohua Li 		return -EINVAL;
765355810d1SShaohua Li 
7665cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
767355810d1SShaohua Li 	if (stored_crc != crc)
768355810d1SShaohua Li 		return -EINVAL;
769355810d1SShaohua Li 
770355810d1SShaohua Li 	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
771355810d1SShaohua Li 		return -EINVAL;
772355810d1SShaohua Li 
773355810d1SShaohua Li 	ctx->meta_total_blocks = BLOCK_SECTORS;
774355810d1SShaohua Li 
775355810d1SShaohua Li 	return 0;
776355810d1SShaohua Li }
777355810d1SShaohua Li 
778355810d1SShaohua Li static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
779355810d1SShaohua Li 					 struct r5l_recovery_ctx *ctx,
780355810d1SShaohua Li 					 sector_t stripe_sect,
781355810d1SShaohua Li 					 int *offset, sector_t *log_offset)
782355810d1SShaohua Li {
783355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
784355810d1SShaohua Li 	struct stripe_head *sh;
785355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
786355810d1SShaohua Li 	int disk_index;
787355810d1SShaohua Li 
788355810d1SShaohua Li 	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
789355810d1SShaohua Li 	while (1) {
790355810d1SShaohua Li 		payload = page_address(ctx->meta_page) + *offset;
791355810d1SShaohua Li 
792355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
793355810d1SShaohua Li 			raid5_compute_sector(conf,
794355810d1SShaohua Li 					     le64_to_cpu(payload->location), 0,
795355810d1SShaohua Li 					     &disk_index, sh);
796355810d1SShaohua Li 
797355810d1SShaohua Li 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
798355810d1SShaohua Li 				     sh->dev[disk_index].page, READ, false);
799355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
800355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
801355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
802355810d1SShaohua Li 			ctx->meta_total_blocks += BLOCK_SECTORS;
803355810d1SShaohua Li 		} else {
804355810d1SShaohua Li 			disk_index = sh->pd_idx;
805355810d1SShaohua Li 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
806355810d1SShaohua Li 				     sh->dev[disk_index].page, READ, false);
807355810d1SShaohua Li 			sh->dev[disk_index].log_checksum =
808355810d1SShaohua Li 				le32_to_cpu(payload->checksum[0]);
809355810d1SShaohua Li 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
810355810d1SShaohua Li 
811355810d1SShaohua Li 			if (sh->qd_idx >= 0) {
812355810d1SShaohua Li 				disk_index = sh->qd_idx;
813355810d1SShaohua Li 				sync_page_io(log->rdev,
814355810d1SShaohua Li 					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
815355810d1SShaohua Li 					     PAGE_SIZE, sh->dev[disk_index].page,
816355810d1SShaohua Li 					     READ, false);
817355810d1SShaohua Li 				sh->dev[disk_index].log_checksum =
818355810d1SShaohua Li 					le32_to_cpu(payload->checksum[1]);
819355810d1SShaohua Li 				set_bit(R5_Wantwrite,
820355810d1SShaohua Li 					&sh->dev[disk_index].flags);
821355810d1SShaohua Li 			}
822355810d1SShaohua Li 			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
823355810d1SShaohua Li 		}
824355810d1SShaohua Li 
825355810d1SShaohua Li 		*log_offset = r5l_ring_add(log, *log_offset,
826355810d1SShaohua Li 					   le32_to_cpu(payload->size));
827355810d1SShaohua Li 		*offset += sizeof(struct r5l_payload_data_parity) +
828355810d1SShaohua Li 			sizeof(__le32) *
829355810d1SShaohua Li 			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
830355810d1SShaohua Li 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
831355810d1SShaohua Li 			break;
832355810d1SShaohua Li 	}
833355810d1SShaohua Li 
834355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
835355810d1SShaohua Li 		void *addr;
836355810d1SShaohua Li 		u32 checksum;
837355810d1SShaohua Li 
838355810d1SShaohua Li 		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
839355810d1SShaohua Li 			continue;
840355810d1SShaohua Li 		addr = kmap_atomic(sh->dev[disk_index].page);
8415cb2fbd6SShaohua Li 		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
842355810d1SShaohua Li 		kunmap_atomic(addr);
843355810d1SShaohua Li 		if (checksum != sh->dev[disk_index].log_checksum)
844355810d1SShaohua Li 			goto error;
845355810d1SShaohua Li 	}
846355810d1SShaohua Li 
847355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
848355810d1SShaohua Li 		struct md_rdev *rdev, *rrdev;
849355810d1SShaohua Li 
850355810d1SShaohua Li 		if (!test_and_clear_bit(R5_Wantwrite,
851355810d1SShaohua Li 					&sh->dev[disk_index].flags))
852355810d1SShaohua Li 			continue;
853355810d1SShaohua Li 
854355810d1SShaohua Li 		/* in case device is broken */
855355810d1SShaohua Li 		rdev = rcu_dereference(conf->disks[disk_index].rdev);
856355810d1SShaohua Li 		if (rdev)
857355810d1SShaohua Li 			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
858355810d1SShaohua Li 				     sh->dev[disk_index].page, WRITE, false);
859355810d1SShaohua Li 		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
860355810d1SShaohua Li 		if (rrdev)
861355810d1SShaohua Li 			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
862355810d1SShaohua Li 				     sh->dev[disk_index].page, WRITE, false);
863355810d1SShaohua Li 	}
864355810d1SShaohua Li 	raid5_release_stripe(sh);
865355810d1SShaohua Li 	return 0;
866355810d1SShaohua Li 
867355810d1SShaohua Li error:
868355810d1SShaohua Li 	for (disk_index = 0; disk_index < sh->disks; disk_index++)
869355810d1SShaohua Li 		sh->dev[disk_index].flags = 0;
870355810d1SShaohua Li 	raid5_release_stripe(sh);
871355810d1SShaohua Li 	return -EINVAL;
872355810d1SShaohua Li }
873355810d1SShaohua Li 
874355810d1SShaohua Li static int r5l_recovery_flush_one_meta(struct r5l_log *log,
875355810d1SShaohua Li 				       struct r5l_recovery_ctx *ctx)
876355810d1SShaohua Li {
877355810d1SShaohua Li 	struct r5conf *conf = log->rdev->mddev->private;
878355810d1SShaohua Li 	struct r5l_payload_data_parity *payload;
879355810d1SShaohua Li 	struct r5l_meta_block *mb;
880355810d1SShaohua Li 	int offset;
881355810d1SShaohua Li 	sector_t log_offset;
882355810d1SShaohua Li 	sector_t stripe_sector;
883355810d1SShaohua Li 
884355810d1SShaohua Li 	mb = page_address(ctx->meta_page);
885355810d1SShaohua Li 	offset = sizeof(struct r5l_meta_block);
886355810d1SShaohua Li 	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
887355810d1SShaohua Li 
888355810d1SShaohua Li 	while (offset < le32_to_cpu(mb->meta_size)) {
889355810d1SShaohua Li 		int dd;
890355810d1SShaohua Li 
891355810d1SShaohua Li 		payload = (void *)mb + offset;
892355810d1SShaohua Li 		stripe_sector = raid5_compute_sector(conf,
893355810d1SShaohua Li 						     le64_to_cpu(payload->location), 0, &dd, NULL);
894355810d1SShaohua Li 		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
895355810d1SShaohua Li 						  &offset, &log_offset))
896355810d1SShaohua Li 			return -EINVAL;
897355810d1SShaohua Li 	}
898355810d1SShaohua Li 	return 0;
899355810d1SShaohua Li }
900355810d1SShaohua Li 
901355810d1SShaohua Li /* copy data/parity from log to raid disks */
902355810d1SShaohua Li static void r5l_recovery_flush_log(struct r5l_log *log,
903355810d1SShaohua Li 				   struct r5l_recovery_ctx *ctx)
904355810d1SShaohua Li {
905355810d1SShaohua Li 	while (1) {
906355810d1SShaohua Li 		if (r5l_read_meta_block(log, ctx))
907355810d1SShaohua Li 			return;
908355810d1SShaohua Li 		if (r5l_recovery_flush_one_meta(log, ctx))
909355810d1SShaohua Li 			return;
910355810d1SShaohua Li 		ctx->seq++;
911355810d1SShaohua Li 		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
912355810d1SShaohua Li 	}
913355810d1SShaohua Li }
914355810d1SShaohua Li 
915355810d1SShaohua Li static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
916355810d1SShaohua Li 					  u64 seq)
917355810d1SShaohua Li {
918355810d1SShaohua Li 	struct page *page;
919355810d1SShaohua Li 	struct r5l_meta_block *mb;
920355810d1SShaohua Li 	u32 crc;
921355810d1SShaohua Li 
922355810d1SShaohua Li 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
923355810d1SShaohua Li 	if (!page)
924355810d1SShaohua Li 		return -ENOMEM;
925355810d1SShaohua Li 	mb = page_address(page);
926355810d1SShaohua Li 	mb->magic = cpu_to_le32(R5LOG_MAGIC);
927355810d1SShaohua Li 	mb->version = R5LOG_VERSION;
928355810d1SShaohua Li 	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
929355810d1SShaohua Li 	mb->seq = cpu_to_le64(seq);
930355810d1SShaohua Li 	mb->position = cpu_to_le64(pos);
9315cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
932355810d1SShaohua Li 	mb->checksum = cpu_to_le32(crc);
933355810d1SShaohua Li 
934355810d1SShaohua Li 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
935355810d1SShaohua Li 		__free_page(page);
936355810d1SShaohua Li 		return -EIO;
937355810d1SShaohua Li 	}
938355810d1SShaohua Li 	__free_page(page);
939355810d1SShaohua Li 	return 0;
940355810d1SShaohua Li }
941355810d1SShaohua Li 
942f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log)
943f6bed0efSShaohua Li {
944355810d1SShaohua Li 	struct r5l_recovery_ctx ctx;
945355810d1SShaohua Li 
946355810d1SShaohua Li 	ctx.pos = log->last_checkpoint;
947355810d1SShaohua Li 	ctx.seq = log->last_cp_seq;
948355810d1SShaohua Li 	ctx.meta_page = alloc_page(GFP_KERNEL);
949355810d1SShaohua Li 	if (!ctx.meta_page)
950355810d1SShaohua Li 		return -ENOMEM;
951355810d1SShaohua Li 
952355810d1SShaohua Li 	r5l_recovery_flush_log(log, &ctx);
953355810d1SShaohua Li 	__free_page(ctx.meta_page);
954355810d1SShaohua Li 
955355810d1SShaohua Li 	/*
956355810d1SShaohua Li 	 * we did a recovery. Now ctx.pos points to an invalid meta block, and
957355810d1SShaohua Li 	 * the new log will start there. But we can't let the superblock point
958355810d1SShaohua Li 	 * to the last valid meta block. The log might look like:
959355810d1SShaohua Li 	 * | meta 1 | meta 2 | meta 3 |
960355810d1SShaohua Li 	 * meta 1 is valid, meta 2 is invalid and meta 3 could be valid. If the
961355810d1SShaohua Li 	 * superblock points to meta 1 and we write a new valid meta 2n, then
962355810d1SShaohua Li 	 * if a crash happens again, the new recovery will start from meta 1.
963355810d1SShaohua Li 	 * Since meta 2n is valid now, recovery will think meta 3 is valid too,
964355810d1SShaohua Li 	 * which is wrong. The solution is to create a new meta block at meta
965355810d1SShaohua Li 	 * 2's position with seq == meta 1's seq + 10 and let the superblock
966355810d1SShaohua Li 	 * point to it. The same recovery will then not treat meta 3 as a valid
967355810d1SShaohua Li 	 * meta block, because its seq doesn't match.
967355810d1SShaohua Li 	 */
968355810d1SShaohua Li 	if (ctx.seq > log->last_cp_seq + 1) {
969355810d1SShaohua Li 		int ret;
970355810d1SShaohua Li 
971355810d1SShaohua Li 		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
972355810d1SShaohua Li 		if (ret)
973355810d1SShaohua Li 			return ret;
974355810d1SShaohua Li 		log->seq = ctx.seq + 11;
975355810d1SShaohua Li 		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
976355810d1SShaohua Li 		r5l_write_super(log, ctx.pos);
977355810d1SShaohua Li 	} else {
978355810d1SShaohua Li 		log->log_start = ctx.pos;
979355810d1SShaohua Li 		log->seq = ctx.seq;
980355810d1SShaohua Li 	}
981f6bed0efSShaohua Li 	return 0;
982f6bed0efSShaohua Li }
983f6bed0efSShaohua Li 
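/*
 * Record the new log tail in the rdev and mark the superblock dirty; the
 * actual superblock writes (and the disk cache flushes they entail) are
 * performed later by md's superblock update path.
 */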
984f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp)
985f6bed0efSShaohua Li {
986f6bed0efSShaohua Li 	struct mddev *mddev = log->rdev->mddev;
987f6bed0efSShaohua Li 
988f6bed0efSShaohua Li 	log->rdev->journal_tail = cp;
989f6bed0efSShaohua Li 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
990f6bed0efSShaohua Li }
991f6bed0efSShaohua Li 
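/*
 * Read the meta block at journal_tail and decide whether to resume the
 * existing log or, if the block fails any validity check, to create a fresh
 * log at sector 0 with a random starting seq.
 */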
992f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log)
993f6bed0efSShaohua Li {
994f6bed0efSShaohua Li 	struct md_rdev *rdev = log->rdev;
995f6bed0efSShaohua Li 	struct page *page;
996f6bed0efSShaohua Li 	struct r5l_meta_block *mb;
997f6bed0efSShaohua Li 	sector_t cp = log->rdev->journal_tail;
998f6bed0efSShaohua Li 	u32 stored_crc, expected_crc;
999f6bed0efSShaohua Li 	bool create_super = false;
1000f6bed0efSShaohua Li 	int ret;
1001f6bed0efSShaohua Li 
1002f6bed0efSShaohua Li 	/* Make sure it's valid */
1003f6bed0efSShaohua Li 	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1004f6bed0efSShaohua Li 		cp = 0;
1005f6bed0efSShaohua Li 	page = alloc_page(GFP_KERNEL);
1006f6bed0efSShaohua Li 	if (!page)
1007f6bed0efSShaohua Li 		return -ENOMEM;
1008f6bed0efSShaohua Li 
1009f6bed0efSShaohua Li 	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1010f6bed0efSShaohua Li 		ret = -EIO;
1011f6bed0efSShaohua Li 		goto ioerr;
1012f6bed0efSShaohua Li 	}
1013f6bed0efSShaohua Li 	mb = page_address(page);
1014f6bed0efSShaohua Li 
1015f6bed0efSShaohua Li 	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1016f6bed0efSShaohua Li 	    mb->version != R5LOG_VERSION) {
1017f6bed0efSShaohua Li 		create_super = true;
1018f6bed0efSShaohua Li 		goto create;
1019f6bed0efSShaohua Li 	}
1020f6bed0efSShaohua Li 	stored_crc = le32_to_cpu(mb->checksum);
1021f6bed0efSShaohua Li 	mb->checksum = 0;
10225cb2fbd6SShaohua Li 	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1023f6bed0efSShaohua Li 	if (stored_crc != expected_crc) {
1024f6bed0efSShaohua Li 		create_super = true;
1025f6bed0efSShaohua Li 		goto create;
1026f6bed0efSShaohua Li 	}
1027f6bed0efSShaohua Li 	if (le64_to_cpu(mb->position) != cp) {
1028f6bed0efSShaohua Li 		create_super = true;
1029f6bed0efSShaohua Li 		goto create;
1030f6bed0efSShaohua Li 	}
1031f6bed0efSShaohua Li create:
1032f6bed0efSShaohua Li 	if (create_super) {
1033f6bed0efSShaohua Li 		log->last_cp_seq = prandom_u32();
1034f6bed0efSShaohua Li 		cp = 0;
1035f6bed0efSShaohua Li 		/*
1036f6bed0efSShaohua Li 		 * Make sure the super points to the correct address. The log
1037f6bed0efSShaohua Li 		 * might get data very soon. If the super doesn't have the
1038f6bed0efSShaohua Li 		 * correct log tail address, recovery can't find the log.
1039f6bed0efSShaohua Li 		 */
1040f6bed0efSShaohua Li 		r5l_write_super(log, cp);
1041f6bed0efSShaohua Li 	} else
1042f6bed0efSShaohua Li 		log->last_cp_seq = le64_to_cpu(mb->seq);
1043f6bed0efSShaohua Li 
1044f6bed0efSShaohua Li 	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
10450576b1c6SShaohua Li 	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
10460576b1c6SShaohua Li 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
10470576b1c6SShaohua Li 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1048f6bed0efSShaohua Li 	log->last_checkpoint = cp;
1049f6bed0efSShaohua Li 
1050f6bed0efSShaohua Li 	__free_page(page);
1051f6bed0efSShaohua Li 
1052f6bed0efSShaohua Li 	return r5l_recovery_log(log);
1053f6bed0efSShaohua Li ioerr:
1054f6bed0efSShaohua Li 	__free_page(page);
1055f6bed0efSShaohua Li 	return ret;
1056f6bed0efSShaohua Li }
1057f6bed0efSShaohua Li 
1058f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1059f6bed0efSShaohua Li {
1060f6bed0efSShaohua Li 	struct r5l_log *log;
1061f6bed0efSShaohua Li 
1062f6bed0efSShaohua Li 	if (PAGE_SIZE != 4096)
1063f6bed0efSShaohua Li 		return -EINVAL;
1064f6bed0efSShaohua Li 	log = kzalloc(sizeof(*log), GFP_KERNEL);
1065f6bed0efSShaohua Li 	if (!log)
1066f6bed0efSShaohua Li 		return -ENOMEM;
1067f6bed0efSShaohua Li 	log->rdev = rdev;
1068f6bed0efSShaohua Li 
106956fef7c6SChristoph Hellwig 	log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
107056fef7c6SChristoph Hellwig 
10715cb2fbd6SShaohua Li 	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1072f6bed0efSShaohua Li 				       sizeof(rdev->mddev->uuid));
1073f6bed0efSShaohua Li 
1074f6bed0efSShaohua Li 	mutex_init(&log->io_mutex);
1075f6bed0efSShaohua Li 
1076f6bed0efSShaohua Li 	spin_lock_init(&log->io_list_lock);
1077f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->running_ios);
10780576b1c6SShaohua Li 	INIT_LIST_HEAD(&log->io_end_ios);
1079a8c34f91SShaohua Li 	INIT_LIST_HEAD(&log->flushing_ios);
108004732f74SChristoph Hellwig 	INIT_LIST_HEAD(&log->finished_ios);
1081a8c34f91SShaohua Li 	bio_init(&log->flush_bio);
1082f6bed0efSShaohua Li 
1083f6bed0efSShaohua Li 	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1084f6bed0efSShaohua Li 	if (!log->io_kc)
1085f6bed0efSShaohua Li 		goto io_kc;
1086f6bed0efSShaohua Li 
10870576b1c6SShaohua Li 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
10880576b1c6SShaohua Li 						 log->rdev->mddev, "reclaim");
10890576b1c6SShaohua Li 	if (!log->reclaim_thread)
10900576b1c6SShaohua Li 		goto reclaim_thread;
10910fd22b45SShaohua Li 	init_waitqueue_head(&log->iounit_wait);
10920576b1c6SShaohua Li 
1093f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->no_space_stripes);
1094f6bed0efSShaohua Li 	spin_lock_init(&log->no_space_stripes_lock);
1095f6bed0efSShaohua Li 
1096f6bed0efSShaohua Li 	if (r5l_load_log(log))
1097f6bed0efSShaohua Li 		goto error;
1098f6bed0efSShaohua Li 
1099f6bed0efSShaohua Li 	conf->log = log;
1100f6bed0efSShaohua Li 	return 0;
1101f6bed0efSShaohua Li error:
11020576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
11030576b1c6SShaohua Li reclaim_thread:
1104f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1105f6bed0efSShaohua Li io_kc:
1106f6bed0efSShaohua Li 	kfree(log);
1107f6bed0efSShaohua Li 	return -EINVAL;
1108f6bed0efSShaohua Li }
1109f6bed0efSShaohua Li 
1110f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log)
1111f6bed0efSShaohua Li {
11120576b1c6SShaohua Li 	md_unregister_thread(&log->reclaim_thread);
1113f6bed0efSShaohua Li 	kmem_cache_destroy(log->io_kc);
1114f6bed0efSShaohua Li 	kfree(log);
1115f6bed0efSShaohua Li }
1116