/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

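/*
 * Worked example of the unit conversions used throughout: one block is
 * 4096 bytes == BLOCK_SECTORS (8) 512-byte sectors, so n pages convert to
 * sectors as n << (PAGE_SHIFT - 9).
 */
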
/*
 * reclaim runs every 1/4 disk size or 10G reclaimable space, whichever is
 * smaller. This prevents recovery from having to scan a very long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

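/*
 * Worked example: 10G is 10 * 1024 * 1024 * 2 512-byte sectors. For a 32G
 * log device, device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT is 8G, below the
 * cap, so max_free_space ends up 8G; for a 100G device the 10G cap applies.
 * (See r5l_load_log().)
 */
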
struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs if reclaimable space
					 * reaches this size */

	sector_t last_checkpoint;	/* log tail, where recovery scanning
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head, where new data appends */
	u64 seq;			/* log head sequence */

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head flushed_ios;	/* io_units settled down in the log disk */
	struct bio flush_bio;
	struct list_head stripe_end_ios;/* io_units which have been completely
					 * written to the RAID but have not yet
					 * been considered for updating super */

	struct kmem_cache *io_kc;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed.  if it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e., reclaim
					 * doesn't wait for a specific io_unit
					 * switching to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;
};

/*
 * an IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. An io_unit is written to the log disk with normal writes; as we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no requirement to write an io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio_list bios;
	atomic_t pending_io;	/* pending bios not written to log yet */
	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bios have started writing to the
				 * log, not accepting new bios */
	IO_UNIT_IO_END = 2,	/* io_unit bios have finished writing to the log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data has finished writing to the raid */
};

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

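/*
 * Example of the ring arithmetic, using a toy device_size of 96 sectors:
 * r5l_ring_add(log, 88, 16) wraps past the end and returns 8, and
 * r5l_ring_distance(log, 88, 8) gives 8 + 96 - 88 == 16, the distance
 * that was added.
 */
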
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
					log->log_start);

	return log->device_size > used_size + size;
}

static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	/* We can't handle memory allocation failure so far */
	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;

	io = kmem_cache_zalloc(log->io_kc, gfp);
	io->log = log;
	io->meta_page = alloc_page(gfp | __GFP_ZERO);

	bio_list_init(&io->bios);
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;
	return io;
}

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
	__free_page(io->meta_page);
	kmem_cache_free(log->io_kc, io);
}

static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
				  enum r5l_io_unit_state state)
{
	struct r5l_io_unit *io;

	while (!list_empty(from)) {
		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
		/* don't change list order */
		if (io->state >= state)
			list_move_tail(&io->log_sibling, to);
		else
			break;
	}
}

/*
 * We don't want too many io_units residing in the stripe_end_ios list,
 * which would waste a lot of memory, so we try to remove some. But we must
 * keep at least 2 io_units: the superblock must point to a valid meta, and
 * if that is the last meta, recovery can scan less.
 */
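/*
 * Example: with stripe_end_ios == [A, B, C, D, E], the loop below frees
 * B, C and D, folding their ranges into A (A->log_end ends up as D's
 * log_end), and leaves [A, E]: the two io_units required above.
 */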
static void r5l_compress_stripe_end_list(struct r5l_log *log)
{
	struct r5l_io_unit *first, *last, *io;

	first = list_first_entry(&log->stripe_end_ios,
				 struct r5l_io_unit, log_sibling);
	last = list_last_entry(&log->stripe_end_ios,
			       struct r5l_io_unit, log_sibling);
	if (first == last)
		return;
	list_del(&first->log_sibling);
	list_del(&last->log_sibling);
	while (!list_empty(&log->stripe_end_ios)) {
		io = list_first_entry(&log->stripe_end_ios,
				      struct r5l_io_unit, log_sibling);
		list_del(&io->log_sibling);
		first->log_end = io->log_end;
		r5l_free_io_unit(log, io);
	}
	list_add_tail(&first->log_sibling, &log->stripe_end_ios);
	list_add_tail(&last->log_sibling, &log->stripe_end_ios);
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

/* XXX: totally ignores I/O errors */
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;
	unsigned long flags;

	bio_put(bio);

	if (!atomic_dec_and_test(&io->pending_io))
		return;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
			IO_UNIT_IO_END);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	struct bio *bio;
	unsigned long flags;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	while ((bio = bio_list_pop(&io->bios))) {
		/* all IO must start from rdev->data_offset */
		bio->bi_iter.bi_sector += log->rdev->data_offset;
		submit_bio(WRITE, bio);
	}
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;
	struct bio *bio;

	io = r5l_alloc_io_unit(log);

	block = page_address(io->meta_page);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq;

	bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
	io->current_bio = bio;
	bio->bi_rw = WRITE;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->log_start;
	bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
	bio->bi_end_io = r5l_log_endio;
	bio->bi_private = io;

	bio_list_add(&io->bios, bio);
	atomic_inc(&io->pending_io);

	log->seq++;
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
	io->log_end = log->log_start;
	/* current bio hit disk end */
	if (log->log_start == 0)
		io->current_bio = NULL;

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

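/*
 * Note the meta block itself occupies the io_unit's first log block:
 * r5l_new_meta() adds meta_page as the first page of the first bio and
 * advances log_start by BLOCK_SECTORS before any payload page is
 * appended.
 */
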
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	struct r5l_io_unit *io;

	io = log->current_io;
	if (io && io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);
	io = log->current_io;
	if (io)
		return 0;

	log->current_io = r5l_new_meta(log);
	return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

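/*
 * Size sketch of the accounting above (assuming the md_p.h layout): each
 * payload record costs sizeof(struct r5l_payload_data_parity) plus one
 * __le32 checksum per page it covers -- one page for data, one or two
 * (RAID6) for parity -- matching the meta_size estimates in
 * r5l_log_stripe() and r5l_write_stripe().
 */
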
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

alloc_bio:
	if (!io->current_bio) {
		struct bio *bio;

		bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
		bio->bi_rw = WRITE;
		bio->bi_bdev = log->rdev->bdev;
		bio->bi_iter.bi_sector = log->log_start;
		bio->bi_end_io = r5l_log_endio;
		bio->bi_private = io;
		bio_list_add(&io->bios, bio);
		atomic_inc(&io->pending_io);
		io->current_bio = bio;
	}
	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
		io->current_bio = NULL;
		goto alloc_bio;
	}
	log->log_start = r5l_ring_add(log, log->log_start,
				      BLOCK_SECTORS);
	/* current bio hit disk end */
	if (log->log_start == 0)
		io->current_bio = NULL;

	io->log_end = log->log_start;
}

static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			   int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	r5l_get_meta(log, meta_size);
	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * this runs in raid5d, where reclaim could wait for raid5d too (when it
 * flushes data from the log to the raid disks), so we mustn't wait for
 * reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksums were already calculated in the last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with very big raid arrays */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
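	/*
	 * Reservation example: a full-stripe write on a 4-disk RAID5 has
	 * write_disks == 4 (3 data + 1 parity), so reserve is
	 * (1 + 4) << (PAGE_SHIFT - 9) == 40 sectors: one meta block plus
	 * four data/parity blocks.
	 */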
	if (r5l_has_free_space(log, reserve))
		r5l_log_stripe(log, sh, data_pages, parity_pages);
	else {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	}
	mutex_unlock(&log->io_mutex);

	return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;
	/*
	 * we flush the log disk cache first, then write stripe data to the
	 * raid disks. So if bio is finished, the log disk cache is flushed
	 * already. Recovery guarantees we can recover the bio from the log
	 * disk, so we don't need to flush again
	 */
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_rw &= ~REQ_FLUSH;
	return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	struct r5l_io_unit *last;
	sector_t reclaimable_space;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
	/* might move zero entries */
	r5l_move_io_unit_list(&log->flushed_ios, &log->stripe_end_ios,
			      IO_UNIT_STRIPE_END);
	if (list_empty(&log->stripe_end_ios)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	last = list_last_entry(&log->stripe_end_ios,
			       struct r5l_io_unit, log_sibling);
	reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
					      last->log_end);
	if (reclaimable_space >= log->max_free_space)
		r5l_wake_reclaim(log, 0);

	r5l_compress_stripe_end_list(log);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;
	struct stripe_head *sh;

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling) {
		while (!list_empty(&io->stripe_list)) {
			sh = list_first_entry(&io->stripe_list,
				struct stripe_head, log_list);
			list_del_init(&sh->log_list);
			set_bit(STRIPE_HANDLE, &sh->state);
			raid5_release_stripe(sh);
		}
	}
	list_splice_tail_init(&log->flushing_ios, &log->flushed_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting dispatch of IO to the raid.
 * The log consists of io_units, each headed by a meta block. There is one
 * situation we want to avoid: a broken meta in the middle of the log
 * keeps recovery from finding any meta after it, toward the head of the
 * log. So if an operation requires the meta at the head to be persistent
 * in the log, we must make sure every meta before it is persistent in the
 * log too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log
 * before we do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. In this
 * case, we only write the stripes of an io_unit to the raid disks once
 * that io_unit is the first one whose data/parity is in the log.
 */
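/*
 * Example: if io_units A, B and C were appended in that order and only A
 * and C have finished writing to the log, only A's stripes may be
 * dispatched to the raid; C (and B) must wait until B is persistent, so
 * the log never depends on a meta that sits behind a broken one.
 */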
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;
	if (!log)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	submit_bio(WRITE_FLUSH, &log->flush_bio);
}

static void r5l_kick_io_unit(struct r5l_log *log)
{
	md_wakeup_thread(log->rdev->mddev->thread);
	wait_event_lock_irq(log->iounit_wait, !list_empty(&log->stripe_end_ios),
			    log->io_list_lock);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_do_reclaim(struct r5l_log *log)
{
	struct r5l_io_unit *io, *last;
	LIST_HEAD(list);
	sector_t free = 0;
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);

	spin_lock_irq(&log->io_list_lock);
	/*
	 * move the proper io_units to the reclaim list. We should not change
	 * the order. reclaimable/unreclaimable io_units can be mixed in the
	 * list, and we shouldn't reuse the space of an unreclaimable io_unit
	 */
	while (1) {
		struct list_head *target_list = NULL;

		while (!list_empty(&log->stripe_end_ios)) {
			io = list_first_entry(&log->stripe_end_ios,
					      struct r5l_io_unit, log_sibling);
			list_move_tail(&io->log_sibling, &list);
			free += r5l_ring_distance(log, io->log_start,
						  io->log_end);
		}

		if (free >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->flushed_ios)))
			break;

		/* The waiting below mostly happens when we shut down the raid */
		if (!list_empty(&log->flushed_ios))
			target_list = &log->flushed_ios;
		else if (!list_empty(&log->flushing_ios))
			target_list = &log->flushing_ios;
		else if (!list_empty(&log->io_end_ios))
			target_list = &log->io_end_ios;
		else if (!list_empty(&log->running_ios))
			target_list = &log->running_ios;

		r5l_kick_io_unit(log);
	}
	spin_unlock_irq(&log->io_list_lock);

	if (list_empty(&list))
		return;

	/* the super always points to the last valid meta */
	last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * super here, because the log area might be reused soon and we don't
	 * want to confuse recovery
	 */
	r5l_write_super(log, last->log_start);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = last->log_start;
	log->last_cp_seq = last->seq;
	mutex_unlock(&log->io_mutex);
	r5l_run_no_space_stripes(log);

	while (!list_empty(&list)) {
		io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
		list_del(&io->log_sibling);
		r5l_free_io_unit(log, io);
	}
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}

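/*
 * The reclaim target only ever grows below: the cmpxchg loop keeps the
 * maximum of the current target and the new request, and r5l_do_reclaim()
 * xchg()s it back to 0 once it runs. r5l_exit_log() passes -1L (i.e.
 * ULONG_MAX) to force everything reclaimable to be flushed out.
 */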
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}

struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}

static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset, sector_t *log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
			ctx->meta_total_blocks += BLOCK_SECTORS;
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     READ, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
		}

		*log_offset = r5l_ring_add(log, *log_offset,
					   le32_to_cpu(payload->size));
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case the device is broken */
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev)
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev)
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t log_offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
						     le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset, &log_offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from the log to the raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}

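/*
 * Note on the walk above: it starts at log->last_checkpoint with
 * log->last_cp_seq and continues while each meta block is self-consistent
 * (magic, version, seq, position, checksum). The first block that fails
 * r5l_read_meta_block() marks the head of the log, leaving ctx->pos and
 * ctx->seq pointing just past the last valid io_unit.
 */
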
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * we did a recovery. Now ctx.pos points to an invalid meta block. The
	 * new log will start here. But we can't let the superblock point to
	 * the last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
	 * superblock points to meta 1, we write a new valid meta 2n. If a
	 * crash happens again, the new recovery will start from meta 1. Since
	 * meta 2n is valid now, recovery will think meta 3 is valid, which is
	 * wrong. The solution is to create a new meta in meta 2's place with
	 * its seq == meta 1's seq + 10 and let the superblock point to meta 2.
	 * The same recovery will then not treat meta 3 as a valid meta,
	 * because its seq doesn't match
	 */
	if (ctx.seq > log->last_cp_seq + 1) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		/*
		 * Make sure the super points to the correct address. The log
		 * might have data very soon. If the super doesn't have the
		 * correct log tail address, recovery can't find the log
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->stripe_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->flushed_ios);
	bio_init(&log->flush_bio);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	conf->log = log;
	return 0;
error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	/*
	 * at this point all stripes are finished, so each io_unit is at least
	 * in STRIPE_END state
	 */
	r5l_wake_reclaim(log, -1L);
	md_unregister_thread(&log->reclaim_thread);
	r5l_do_reclaim(log);
	/*
	 * force a super update; r5l_do_reclaim might have updated the super.
	 * mddev->thread is already stopped
	 */
	md_update_sb(log->rdev->mddev, 1);

	kmem_cache_destroy(log->io_kc);
	kfree(log);
}