/* xref: /linux/drivers/md/raid5-cache.c (revision 5cb2fbd6ea0d151dcb12d98c06c8761eedfed2ee) */
/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * metadata/data are stored on disk in 4k units (blocks), regardless of
 * the underlying hardware sector size (4096 bytes = 8 * 512-byte sectors,
 * hence BLOCK_SECTORS). This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * reclaim runs when reclaimable space reaches 1/4 of the disk size or
 * 10G, whichever is smaller. This prevents recovery from having to scan
 * a very long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
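/*
 * Worked example of the thresholds above: with 512-byte sectors,
 * RECLAIM_MAX_FREE_SPACE is 10 * 1024 * 1024 * 2 sectors = 10GiB. In
 * r5l_load_log(), max_free_space = device_size >> 2, capped at this
 * value: a 16GiB log gets a 4GiB threshold, while anything above
 * 40GiB is capped at 10GiB.
 */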

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded down
					 * to BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs once reclaimable
					 * space reaches this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head stripe_end_ios;/* io_units which have been completely
					 * written to the RAID but have not yet
					 * been considered for updating super */

	struct kmem_cache *io_kc;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. if it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e. reclaim
					 * doesn't wait for a specific io_unit
					 * to switch to IO_UNIT_STRIPE_END
					 * state) */

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;
};

/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block describes the data/parity that
 * follows it. io_units are written to the log disk with normal writes;
 * as we always flush the log disk before starting to move data to the
 * raid disks, there is no need to write an io_unit with FLUSH/FUA.
 */
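/*
 * On-disk layout sketch (each cell is one 4k block):
 *
 *   | meta | data | data | parity | meta | data | parity | ...
 *   \------- io_unit A ----------/ \------ io_unit B -----/
 *
 * Each meta block describes the data/parity blocks between it and the
 * next meta block.
 */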
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio_list bios;
	atomic_t pending_io;	/* pending bios not written to log yet */
	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	wait_queue_head_t wait_state;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bios have started writing to the
				 * log, no new bio is accepted */
	IO_UNIT_IO_END = 2,	/* io_unit bios finished writing to the log */
	IO_UNIT_STRIPE_START = 3, /* stripes of io_unit are flushing to raid */
	IO_UNIT_STRIPE_END = 4,	/* stripe data finished writing to raid */
};
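/*
 * Sketch of the io_unit lifecycle and the list it lives on at each
 * stage (see __r5l_set_io_unit_state()):
 *
 *   RUNNING -> IO_START          stays on log->running_ios
 *   IO_START -> IO_END           moved to log->io_end_ios
 *                                (log writes done, r5l_log_endio())
 *   IO_END -> STRIPE_START       stays on log->io_end_ios
 *                                (r5l_flush_stripe_to_raid())
 *   STRIPE_START -> STRIPE_END   moved to log->stripe_end_ios
 *                                (r5l_stripe_write_finished())
 */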

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}
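/*
 * Worked example of the ring arithmetic above, for a log with
 * device_size = 1000 sectors:
 *
 *   r5l_ring_add(log, 990, 16)       = 6   (wraps past the end)
 *   r5l_ring_distance(log, 990, 6)   = 16  (end < start, wrapped case)
 *   r5l_ring_distance(log, 100, 300) = 200 (plain case)
 */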

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
					log->log_start);

	return log->device_size > used_size + size;
}
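/*
 * Continuing the example: with device_size = 1000, last_checkpoint = 990
 * and log_start = 6, used_size = 16, so a request for 984 sectors fails
 * (1000 > 16 + 984 is false) while 983 still fits. The strict inequality
 * means the log is never allowed to fill up completely.
 */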

static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	/* We can't handle memory allocation failures so far */
	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;

	io = kmem_cache_zalloc(log->io_kc, gfp);
	io->log = log;
	io->meta_page = alloc_page(gfp | __GFP_ZERO);

	bio_list_init(&io->bios);
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;
	init_waitqueue_head(&io->wait_state);
	return io;
}

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
	__free_page(io->meta_page);
	kmem_cache_free(log->io_kc, io);
}

static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
				  enum r5l_io_unit_state state)
{
	struct r5l_io_unit *io;

	while (!list_empty(from)) {
		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
		/* don't change list order */
		if (io->state >= state)
			list_move_tail(&io->log_sibling, to);
		else
			break;
	}
}

/*
 * We don't want too many io_units to reside in the stripe_end_ios list,
 * which would waste a lot of memory, so we try to remove some. But we
 * must keep at least two io_units: the superblock must point to a valid
 * meta, and if that is the last meta, recovery can scan less.
 */
static void r5l_compress_stripe_end_list(struct r5l_log *log)
{
	struct r5l_io_unit *first, *last, *io;

	first = list_first_entry(&log->stripe_end_ios,
				 struct r5l_io_unit, log_sibling);
	last = list_last_entry(&log->stripe_end_ios,
			       struct r5l_io_unit, log_sibling);
	if (first == last)
		return;
	list_del(&first->log_sibling);
	list_del(&last->log_sibling);
	while (!list_empty(&log->stripe_end_ios)) {
		io = list_first_entry(&log->stripe_end_ios,
				      struct r5l_io_unit, log_sibling);
		list_del(&io->log_sibling);
		first->log_end = io->log_end;
		r5l_free_io_unit(log, io);
	}
	list_add_tail(&first->log_sibling, &log->stripe_end_ios);
	list_add_tail(&last->log_sibling, &log->stripe_end_ios);
}
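/*
 * Example: if stripe_end_ios holds io_units [A, B, C, D, E] (oldest
 * first), compression frees B, C and D, extends A->log_end to D's
 * log_end, and leaves [A, E]: the reclaimable range is unchanged but
 * only two io_unit structures stay allocated.
 */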

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;

	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
	if (state == IO_UNIT_IO_END)
		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
				      IO_UNIT_IO_END);
	if (state == IO_UNIT_STRIPE_END) {
		struct r5l_io_unit *last;
		sector_t reclaimable_space;

		r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
				      IO_UNIT_STRIPE_END);

		last = list_last_entry(&log->stripe_end_ios,
				       struct r5l_io_unit, log_sibling);
		reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
						      last->log_end);
		if (reclaimable_space >= log->max_free_space)
			r5l_wake_reclaim(log, 0);

		r5l_compress_stripe_end_list(log);
	}
	wake_up(&io->wait_state);
}

static void r5l_set_io_unit_state(struct r5l_io_unit *io,
				  enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, state);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/* XXX: totally ignores I/O errors */
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;

	bio_put(bio);

	if (!atomic_dec_and_test(&io->pending_io))
		return;

	r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	struct bio *bio;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	r5l_set_io_unit_state(io, IO_UNIT_IO_START);

	while ((bio = bio_list_pop(&io->bios))) {
		/* all IO must start from rdev->data_offset */
		bio->bi_iter.bi_sector += log->rdev->data_offset;
		submit_bio(WRITE, bio);
	}
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;
	struct bio *bio;

	io = r5l_alloc_io_unit(log);

	block = page_address(io->meta_page);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq;

	bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
	io->current_bio = bio;
	bio->bi_rw = WRITE;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->log_start;
	bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
	bio->bi_end_io = r5l_log_endio;
	bio->bi_private = io;

	bio_list_add(&io->bios, bio);
	atomic_inc(&io->pending_io);

	log->seq++;
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
	io->log_end = log->log_start;
	/* current bio hit the disk end; the next bio must start at sector 0 */
	if (log->log_start == 0)
		io->current_bio = NULL;

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	struct r5l_io_unit *io;

	io = log->current_io;
	if (io && io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);
	io = log->current_io;
	if (io)
		return 0;

	log->current_io = r5l_new_meta(log);
	return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}
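/*
 * Example of payload->size: a data payload covers one 4k page, so
 * size = 1 << (PAGE_SHIFT - 9) = 8 sectors; a RAID6 parity payload
 * carries both P and Q (checksum2_valid == true), so size = 16
 * sectors. meta_offset advances by the payload struct plus one or
 * two __le32 checksums accordingly.
 */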

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

alloc_bio:
	if (!io->current_bio) {
		struct bio *bio;

		bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
		bio->bi_rw = WRITE;
		bio->bi_bdev = log->rdev->bdev;
		bio->bi_iter.bi_sector = log->log_start;
		bio->bi_end_io = r5l_log_endio;
		bio->bi_private = io;
		bio_list_add(&io->bios, bio);
		atomic_inc(&io->pending_io);
		io->current_bio = bio;
	}
	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
		io->current_bio = NULL;
		goto alloc_bio;
	}
	log->log_start = r5l_ring_add(log, log->log_start,
				      BLOCK_SECTORS);
	/* current bio hit the disk end; the next bio must start at sector 0 */
	if (log->log_start == 0)
		io->current_bio = NULL;

	io->log_end = log->log_start;
}

static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			   int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	r5l_get_meta(log, meta_size);
	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;
}

/*
 * This runs in raid5d, and reclaim could wait for raid5d too (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here to avoid a deadlock.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe has been written to the log; start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum was already calculated in the last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with very big raid arrays: the meta for one stripe
	 * must fit in a single 4k block */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;
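	/*
	 * Worked example for the size check above (assuming a 16-byte
	 * struct r5l_payload_data_parity): a full-stripe write on a
	 * 10-disk RAID6 has write_disks = 10, parity_pages = 2 and
	 * data_pages = 8, so meta_size = 8 * (16 + 4) + 16 + 2 * 4 =
	 * 184 bytes, comfortably within one 4k meta block.
	 */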

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (r5l_has_free_space(log, reserve))
		r5l_log_stripe(log, sh, data_pages, parity_pages);
	else {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	}
	mutex_unlock(&log->io_mutex);

	return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	/* Don't support stripe batch */
	io = sh->log_io;
	if (!io)
		return;
	sh->log_io = NULL;

	if (atomic_dec_and_test(&io->pending_stripe))
		r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
}

/*
 * Start dispatching IO to the raid disks. The log consists of a sequence
 * of io_units, each headed by a meta block. There is one situation we
 * want to avoid: a broken meta block in the middle of the log stops
 * recovery from finding any meta blocks after it. So if an operation
 * requires a meta block to be persistent in the log, every meta block
 * before it must be persistent in the log too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is to strictly maintain the io_unit list order. We only
 * write the stripes of an io_unit to the raid disks once every io_unit
 * up to and including it has its data/parity in the log.
 */
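/*
 * Example of the ordering hazard: io_units 1..3 are submitted in order
 * and unit 3's log writes complete before unit 2's. If we flushed unit
 * 3's stripes to the raid disks now and crashed, recovery would stop at
 * unit 2's incomplete meta block and never replay unit 3, even though
 * its in-place raid writes may be half done. Moving io_units to
 * io_end_ios strictly in list order avoids this.
 */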
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct stripe_head *sh;
	bool run_stripe;

	if (!log)
		return;
	spin_lock_irq(&log->io_list_lock);
	run_stripe = !list_empty(&log->io_end_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!run_stripe)
		return;

	blkdev_issue_flush(log->rdev->bdev, GFP_NOIO, NULL);

	spin_lock_irq(&log->io_list_lock);
	list_for_each_entry(io, &log->io_end_ios, log_sibling) {
		if (io->state >= IO_UNIT_STRIPE_START)
			continue;
		__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START);

		while (!list_empty(&io->stripe_list)) {
			sh = list_first_entry(&io->stripe_list,
					      struct stripe_head, log_list);
			list_del_init(&sh->log_list);
			set_bit(STRIPE_HANDLE, &sh->state);
			raid5_release_stripe(sh);
		}
	}
	spin_unlock_irq(&log->io_list_lock);
}

static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
	/* the log thread will write the io unit */
	wait_event(io->wait_state, io->state >= IO_UNIT_IO_END);
	if (io->state < IO_UNIT_STRIPE_START)
		r5l_flush_stripe_to_raid(log);
	wait_event(io->wait_state, io->state >= IO_UNIT_STRIPE_END);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_do_reclaim(struct r5l_log *log)
{
	struct r5l_io_unit *io, *last;
	LIST_HEAD(list);
	sector_t free = 0;
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);

	spin_lock_irq(&log->io_list_lock);
	/*
	 * Move the proper io_units to the reclaim list. We should not
	 * change the order: reclaimable and unreclaimable io_units can be
	 * mixed in the list, and we must not reuse the space of an
	 * unreclaimable io_unit.
	 */
	while (1) {
		while (!list_empty(&log->stripe_end_ios)) {
			io = list_first_entry(&log->stripe_end_ios,
					      struct r5l_io_unit, log_sibling);
			list_move_tail(&io->log_sibling, &list);
			free += r5l_ring_distance(log, io->log_start,
						  io->log_end);
		}

		if (free >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->stripe_end_ios)))
			break;

		/* The waiting below mostly happens when we shut down the raid */
		if (!list_empty(&log->io_end_ios)) {
			io = list_first_entry(&log->io_end_ios,
					      struct r5l_io_unit, log_sibling);
			spin_unlock_irq(&log->io_list_lock);
			/* nobody else can delete the io, we are safe */
			r5l_kick_io_unit(log, io);
			spin_lock_irq(&log->io_list_lock);
			continue;
		}

		if (!list_empty(&log->running_ios)) {
			io = list_first_entry(&log->running_ios,
					      struct r5l_io_unit, log_sibling);
			spin_unlock_irq(&log->io_list_lock);
			/* nobody else can delete the io, we are safe */
			r5l_kick_io_unit(log, io);
			spin_lock_irq(&log->io_list_lock);
			continue;
		}
	}
	spin_unlock_irq(&log->io_list_lock);

	if (list_empty(&list))
		return;

	/* the super always points to the last valid meta */
	last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * the super here, because the log area might be reused soon and we
	 * don't want to confuse recovery.
	 */
	r5l_write_super(log, last->log_start);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = last->log_start;
	log->last_cp_seq = last->seq;
	mutex_unlock(&log->io_mutex);
	r5l_run_no_space_stripes(log);

	while (!list_empty(&list)) {
		io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
		list_del(&io->log_sibling);
		r5l_free_io_unit(log, io);
	}
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}
7060576b1c6SShaohua Li 
707f6bed0efSShaohua Li static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
708f6bed0efSShaohua Li {
7090576b1c6SShaohua Li 	unsigned long target;
7100576b1c6SShaohua Li 	unsigned long new = (unsigned long)space; /* overflow in theory */
7110576b1c6SShaohua Li 
7120576b1c6SShaohua Li 	do {
7130576b1c6SShaohua Li 		target = log->reclaim_target;
7140576b1c6SShaohua Li 		if (new < target)
7150576b1c6SShaohua Li 			return;
7160576b1c6SShaohua Li 	} while (cmpxchg(&log->reclaim_target, target, new) != target);
7170576b1c6SShaohua Li 	md_wakeup_thread(log->reclaim_thread);
718f6bed0efSShaohua Li }
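/*
 * The cmpxchg loop above makes reclaim_target a monotonic maximum
 * across concurrent callers: e.g. if one CPU requests 100 sectors
 * while another concurrently requests 50, the target settles at 100.
 * Only r5l_do_reclaim() resets it to 0 (via xchg) when it picks the
 * work up.
 */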

struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}

static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset, sector_t *log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
			ctx->meta_total_blocks += BLOCK_SECTORS;
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     READ, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
		}

		*log_offset = r5l_ring_add(log, *log_offset,
					   le32_to_cpu(payload->size));
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* write the replacement too, in case a device is broken */
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev)
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev)
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t log_offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
						     le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset, &log_offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from the log to the raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * We did a recovery. Now ctx.pos points to an invalid meta block,
	 * and the new log will start there. But we can't simply let the
	 * superblock point to the last valid meta block. The log might
	 * look like:
	 * | meta 1 | meta 2 | meta 3 |
	 * meta 1 is valid, meta 2 is invalid, and meta 3 could still look
	 * valid. If the superblock pointed to meta 1 and we later wrote a
	 * new valid meta 2', then after another crash recovery would start
	 * from meta 1, accept meta 2' (now valid), and go on to treat the
	 * stale meta 3 as valid too, which is wrong.
	 * The solution is to create a new meta block at meta 2's position
	 * with seq == meta 1's seq + 10 and point the superblock at it.
	 * Recovery will then reject the stale meta 3, because its seq
	 * doesn't match.
	 */
	if (ctx.seq > log->last_cp_seq + 1) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		/*
		 * Make sure the super points to the correct address. The log
		 * might get data very soon; if the super doesn't have the
		 * correct log tail address, recovery can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->stripe_end_ios);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	conf->log = log;
	return 0;
error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	/*
	 * at this point all stripes are finished, so every io_unit is at
	 * least in STRIPE_END state
	 */
	r5l_wake_reclaim(log, -1L);
	md_unregister_thread(&log->reclaim_thread);
	r5l_do_reclaim(log);
	/*
	 * force a super update, as r5l_do_reclaim might have updated the
	 * super. mddev->thread is already stopped
	 */
	md_update_sb(log->rdev->mddev, 1);

	kmem_cache_destroy(log->io_kc);
	kfree(log);
}