xref: /linux/drivers/md/dm-delay.c (revision f7d7ccf92f2b9398781f791b4af1a74a9f65b5c3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2005-2007 Red Hat GmbH
4  *
5  * A target that delays reads and/or writes and can send
6  * them to different devices.
7  *
8  * This file is released under the GPL.
9  */
10 
11 #include <linux/module.h>
12 #include <linux/init.h>
13 #include <linux/blkdev.h>
14 #include <linux/bio.h>
15 #include <linux/slab.h>
16 #include <linux/kthread.h>
17 
18 #include <linux/device-mapper.h>
19 
20 #define DM_MSG_PREFIX "delay"
21 
22 struct delay_class {
23 	struct dm_dev *dev;
24 	sector_t start;
25 	unsigned int delay;
26 	unsigned int ops;
27 };
28 
29 struct delay_c {
30 	struct timer_list delay_timer;
31 	struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
32 	spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
33 	struct workqueue_struct *kdelayd_wq;
34 	struct work_struct flush_expired_bios;
35 	struct list_head delayed_bios;
36 	struct task_struct *worker;
37 	bool may_delay;
38 
39 	struct delay_class read;
40 	struct delay_class write;
41 	struct delay_class flush;
42 
43 	int argc;
44 };
45 
46 struct dm_delay_info {
47 	struct delay_c *context;
48 	struct delay_class *class;
49 	struct list_head list;
50 	unsigned long expires;
51 };
52 
53 static void handle_delayed_timer(struct timer_list *t)
54 {
55 	struct delay_c *dc = from_timer(dc, t, delay_timer);
56 
57 	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
58 }
59 
60 static void queue_timeout(struct delay_c *dc, unsigned long expires)
61 {
62 	timer_reduce(&dc->delay_timer, expires);
63 }
64 
65 static inline bool delay_is_fast(struct delay_c *dc)
66 {
67 	return !!dc->worker;
68 }
69 
70 static void flush_bios(struct bio *bio)
71 {
72 	struct bio *n;
73 
74 	while (bio) {
75 		n = bio->bi_next;
76 		bio->bi_next = NULL;
77 		dm_submit_bio_remap(bio, NULL);
78 		bio = n;
79 	}
80 }
81 
82 static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
83 {
84 	struct dm_delay_info *delayed, *next;
85 	struct bio_list flush_bio_list;
86 	LIST_HEAD(local_list);
87 	unsigned long next_expires = 0;
88 	bool start_timer = false;
89 	bio_list_init(&flush_bio_list);
90 
91 	mutex_lock(&dc->process_bios_lock);
92 	spin_lock(&dc->delayed_bios_lock);
93 	list_replace_init(&dc->delayed_bios, &local_list);
94 	spin_unlock(&dc->delayed_bios_lock);
95 	list_for_each_entry_safe(delayed, next, &local_list, list) {
96 		cond_resched();
97 		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
98 			struct bio *bio = dm_bio_from_per_bio_data(delayed,
99 						sizeof(struct dm_delay_info));
100 			list_del(&delayed->list);
101 			bio_list_add(&flush_bio_list, bio);
102 			delayed->class->ops--;
103 			continue;
104 		}
105 
106 		if (!delay_is_fast(dc)) {
107 			if (!start_timer) {
108 				start_timer = true;
109 				next_expires = delayed->expires;
110 			} else {
111 				next_expires = min(next_expires, delayed->expires);
112 			}
113 		}
114 	}
115 	spin_lock(&dc->delayed_bios_lock);
116 	list_splice(&local_list, &dc->delayed_bios);
117 	spin_unlock(&dc->delayed_bios_lock);
118 	mutex_unlock(&dc->process_bios_lock);
119 
120 	if (start_timer)
121 		queue_timeout(dc, next_expires);
122 
123 	flush_bios(bio_list_get(&flush_bio_list));
124 }
125 
126 static int flush_worker_fn(void *data)
127 {
128 	struct delay_c *dc = data;
129 
130 	while (!kthread_should_stop()) {
131 		flush_delayed_bios(dc, false);
132 		spin_lock(&dc->delayed_bios_lock);
133 		if (unlikely(list_empty(&dc->delayed_bios))) {
134 			set_current_state(TASK_INTERRUPTIBLE);
135 			spin_unlock(&dc->delayed_bios_lock);
136 			schedule();
137 		} else {
138 			spin_unlock(&dc->delayed_bios_lock);
139 			cond_resched();
140 		}
141 	}
142 
143 	return 0;
144 }
145 
146 static void flush_expired_bios(struct work_struct *work)
147 {
148 	struct delay_c *dc;
149 
150 	dc = container_of(work, struct delay_c, flush_expired_bios);
151 	flush_delayed_bios(dc, false);
152 }
153 
154 static void delay_dtr(struct dm_target *ti)
155 {
156 	struct delay_c *dc = ti->private;
157 
158 	if (dc->kdelayd_wq) {
159 		timer_shutdown_sync(&dc->delay_timer);
160 		destroy_workqueue(dc->kdelayd_wq);
161 	}
162 
163 	if (dc->read.dev)
164 		dm_put_device(ti, dc->read.dev);
165 	if (dc->write.dev)
166 		dm_put_device(ti, dc->write.dev);
167 	if (dc->flush.dev)
168 		dm_put_device(ti, dc->flush.dev);
169 	if (dc->worker)
170 		kthread_stop(dc->worker);
171 
172 	mutex_destroy(&dc->process_bios_lock);
173 
174 	kfree(dc);
175 }
176 
177 static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
178 {
179 	int ret;
180 	unsigned long long tmpll;
181 	char dummy;
182 
183 	if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
184 		ti->error = "Invalid device sector";
185 		return -EINVAL;
186 	}
187 	c->start = tmpll;
188 
189 	if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
190 		ti->error = "Invalid delay";
191 		return -EINVAL;
192 	}
193 
194 	ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
195 	if (ret) {
196 		ti->error = "Device lookup failed";
197 		return ret;
198 	}
199 
200 	return 0;
201 }
202 
203 /*
204  * Mapping parameters:
205  *    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
206  *
207  * With separate write parameters, the first set is only used for reads.
208  * Offsets are specified in sectors.
209  * Delays are specified in milliseconds.
210  */
211 static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
212 {
213 	struct delay_c *dc;
214 	int ret;
215 	unsigned int max_delay;
216 
217 	if (argc != 3 && argc != 6 && argc != 9) {
218 		ti->error = "Requires exactly 3, 6 or 9 arguments";
219 		return -EINVAL;
220 	}
221 
222 	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
223 	if (!dc) {
224 		ti->error = "Cannot allocate context";
225 		return -ENOMEM;
226 	}
227 
228 	ti->private = dc;
229 	INIT_LIST_HEAD(&dc->delayed_bios);
230 	mutex_init(&dc->process_bios_lock);
231 	spin_lock_init(&dc->delayed_bios_lock);
232 	dc->may_delay = true;
233 	dc->argc = argc;
234 
235 	ret = delay_class_ctr(ti, &dc->read, argv);
236 	if (ret)
237 		goto bad;
238 	max_delay = dc->read.delay;
239 
240 	if (argc == 3) {
241 		ret = delay_class_ctr(ti, &dc->write, argv);
242 		if (ret)
243 			goto bad;
244 		ret = delay_class_ctr(ti, &dc->flush, argv);
245 		if (ret)
246 			goto bad;
247 		goto out;
248 	}
249 
250 	ret = delay_class_ctr(ti, &dc->write, argv + 3);
251 	if (ret)
252 		goto bad;
253 	max_delay = max(max_delay, dc->write.delay);
254 
255 	if (argc == 6) {
256 		ret = delay_class_ctr(ti, &dc->flush, argv + 3);
257 		if (ret)
258 			goto bad;
259 		goto out;
260 	}
261 
262 	ret = delay_class_ctr(ti, &dc->flush, argv + 6);
263 	if (ret)
264 		goto bad;
265 	max_delay = max(max_delay, dc->flush.delay);
266 
267 out:
268 	if (max_delay < 50) {
269 		/*
270 		 * In case of small requested delays, use kthread instead of
271 		 * timers and workqueue to achieve better latency.
272 		 */
273 		dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker");
274 		if (IS_ERR(dc->worker)) {
275 			ret = PTR_ERR(dc->worker);
276 			dc->worker = NULL;
277 			goto bad;
278 		}
279 	} else {
280 		timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
281 		INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
282 		dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
283 		if (!dc->kdelayd_wq) {
284 			ret = -EINVAL;
285 			DMERR("Couldn't start kdelayd");
286 			goto bad;
287 		}
288 	}
289 
290 	ti->num_flush_bios = 1;
291 	ti->num_discard_bios = 1;
292 	ti->accounts_remapped_io = true;
293 	ti->per_io_data_size = sizeof(struct dm_delay_info);
294 	return 0;
295 
296 bad:
297 	delay_dtr(ti);
298 	return ret;
299 }
300 
301 static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
302 {
303 	struct dm_delay_info *delayed;
304 	unsigned long expires = 0;
305 
306 	if (!c->delay)
307 		return DM_MAPIO_REMAPPED;
308 
309 	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
310 
311 	delayed->context = dc;
312 	delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
313 
314 	spin_lock(&dc->delayed_bios_lock);
315 	if (unlikely(!dc->may_delay)) {
316 		spin_unlock(&dc->delayed_bios_lock);
317 		return DM_MAPIO_REMAPPED;
318 	}
319 	c->ops++;
320 	list_add_tail(&delayed->list, &dc->delayed_bios);
321 	spin_unlock(&dc->delayed_bios_lock);
322 
323 	if (delay_is_fast(dc))
324 		wake_up_process(dc->worker);
325 	else
326 		queue_timeout(dc, expires);
327 
328 	return DM_MAPIO_SUBMITTED;
329 }
330 
331 static void delay_presuspend(struct dm_target *ti)
332 {
333 	struct delay_c *dc = ti->private;
334 
335 	spin_lock(&dc->delayed_bios_lock);
336 	dc->may_delay = false;
337 	spin_unlock(&dc->delayed_bios_lock);
338 
339 	if (!delay_is_fast(dc))
340 		timer_delete(&dc->delay_timer);
341 	flush_delayed_bios(dc, true);
342 }
343 
344 static void delay_resume(struct dm_target *ti)
345 {
346 	struct delay_c *dc = ti->private;
347 
348 	dc->may_delay = true;
349 }
350 
351 static int delay_map(struct dm_target *ti, struct bio *bio)
352 {
353 	struct delay_c *dc = ti->private;
354 	struct delay_class *c;
355 	struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
356 
357 	if (bio_data_dir(bio) == WRITE) {
358 		if (unlikely(bio->bi_opf & REQ_PREFLUSH))
359 			c = &dc->flush;
360 		else
361 			c = &dc->write;
362 	} else {
363 		c = &dc->read;
364 	}
365 	delayed->class = c;
366 	bio_set_dev(bio, c->dev->bdev);
367 	bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
368 
369 	return delay_bio(dc, c, bio);
370 }
371 
372 #define DMEMIT_DELAY_CLASS(c) \
373 	DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
374 
375 static void delay_status(struct dm_target *ti, status_type_t type,
376 			 unsigned int status_flags, char *result, unsigned int maxlen)
377 {
378 	struct delay_c *dc = ti->private;
379 	int sz = 0;
380 
381 	switch (type) {
382 	case STATUSTYPE_INFO:
383 		DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
384 		break;
385 
386 	case STATUSTYPE_TABLE:
387 		DMEMIT_DELAY_CLASS(&dc->read);
388 		if (dc->argc >= 6) {
389 			DMEMIT(" ");
390 			DMEMIT_DELAY_CLASS(&dc->write);
391 		}
392 		if (dc->argc >= 9) {
393 			DMEMIT(" ");
394 			DMEMIT_DELAY_CLASS(&dc->flush);
395 		}
396 		break;
397 
398 	case STATUSTYPE_IMA:
399 		*result = '\0';
400 		break;
401 	}
402 }
403 
404 static int delay_iterate_devices(struct dm_target *ti,
405 				 iterate_devices_callout_fn fn, void *data)
406 {
407 	struct delay_c *dc = ti->private;
408 	int ret = 0;
409 
410 	ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
411 	if (ret)
412 		goto out;
413 	ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
414 	if (ret)
415 		goto out;
416 	ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
417 	if (ret)
418 		goto out;
419 
420 out:
421 	return ret;
422 }
423 
424 static struct target_type delay_target = {
425 	.name	     = "delay",
426 	.version     = {1, 4, 0},
427 	.features    = DM_TARGET_PASSES_INTEGRITY,
428 	.module      = THIS_MODULE,
429 	.ctr	     = delay_ctr,
430 	.dtr	     = delay_dtr,
431 	.map	     = delay_map,
432 	.presuspend  = delay_presuspend,
433 	.resume	     = delay_resume,
434 	.status	     = delay_status,
435 	.iterate_devices = delay_iterate_devices,
436 };
437 module_dm(delay);
438 
439 MODULE_DESCRIPTION(DM_NAME " delay target");
440 MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
441 MODULE_LICENSE("GPL");
442