// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2007 Red Hat GmbH
 *
 * A target that delays reads and/or writes and can send
 * them to different devices.
 *
 * This file is released under the GPL.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/delay.h>

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "delay"

#define SLEEP_SHIFT 3

struct delay_class {
	struct dm_dev *dev;
	sector_t start;
	unsigned int delay;
	unsigned int ops;
};

struct delay_c {
	struct timer_list delay_timer;
	struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
	spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
	struct workqueue_struct *kdelayd_wq;
	struct work_struct flush_expired_bios;
	struct list_head delayed_bios;
	struct task_struct *worker;
	unsigned int worker_sleep_us;
	bool may_delay;

	struct delay_class read;
	struct delay_class write;
	struct delay_class flush;

	int argc;
};

struct dm_delay_info {
	struct delay_c *context;
	struct delay_class *class;
	struct list_head list;
	unsigned long expires;
};

static void handle_delayed_timer(struct timer_list *t)
{
	struct delay_c *dc = timer_container_of(dc, t, delay_timer);

	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}

static void queue_timeout(struct delay_c *dc, unsigned long expires)
{
	timer_reduce(&dc->delay_timer, expires);
}

static inline bool delay_is_fast(struct delay_c *dc)
{
	return !!dc->worker;
}

static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		dm_submit_bio_remap(bio, NULL);
		bio = n;
	}
}

static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
{
	struct dm_delay_info *delayed, *next;
	struct bio_list flush_bio_list;
	LIST_HEAD(local_list);
	unsigned long next_expires = 0;
	bool start_timer = false;

	bio_list_init(&flush_bio_list);

	mutex_lock(&dc->process_bios_lock);
	spin_lock(&dc->delayed_bios_lock);
	list_replace_init(&dc->delayed_bios, &local_list);
	spin_unlock(&dc->delayed_bios_lock);
	list_for_each_entry_safe(delayed, next, &local_list, list) {
		cond_resched();
		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
			struct bio *bio = dm_bio_from_per_bio_data(delayed,
						sizeof(struct dm_delay_info));
			list_del(&delayed->list);
			bio_list_add(&flush_bio_list, bio);
			delayed->class->ops--;
			continue;
		}

		if (!delay_is_fast(dc)) {
			if (!start_timer) {
				start_timer = true;
				next_expires = delayed->expires;
			} else {
				next_expires = min(next_expires, delayed->expires);
			}
		}
	}
	spin_lock(&dc->delayed_bios_lock);
	list_splice(&local_list, &dc->delayed_bios);
	spin_unlock(&dc->delayed_bios_lock);
	mutex_unlock(&dc->process_bios_lock);

	if (start_timer)
		queue_timeout(dc, next_expires);

	flush_bios(bio_list_get(&flush_bio_list));
}

static int flush_worker_fn(void *data)
{
	struct delay_c *dc = data;

	while (!kthread_should_stop()) {
		flush_delayed_bios(dc, false);
		spin_lock(&dc->delayed_bios_lock);
		if (unlikely(list_empty(&dc->delayed_bios))) {
			set_current_state(TASK_INTERRUPTIBLE);
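			/*
			 * The task is marked sleeping before the lock is
			 * dropped, so a wake_up_process() from delay_bio()
			 * racing with the unlock below sets us back to
			 * TASK_RUNNING and schedule() returns promptly
			 * instead of missing the wakeup.
			 */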
			spin_unlock(&dc->delayed_bios_lock);
			schedule();
		} else {
			spin_unlock(&dc->delayed_bios_lock);
			fsleep(dc->worker_sleep_us);
			cond_resched();
		}
	}

	return 0;
}

static void flush_expired_bios(struct work_struct *work)
{
	struct delay_c *dc;

	dc = container_of(work, struct delay_c, flush_expired_bios);
	flush_delayed_bios(dc, false);
}

static void delay_dtr(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	if (dc->kdelayd_wq) {
		timer_shutdown_sync(&dc->delay_timer);
		destroy_workqueue(dc->kdelayd_wq);
	}

	if (dc->read.dev)
		dm_put_device(ti, dc->read.dev);
	if (dc->write.dev)
		dm_put_device(ti, dc->write.dev);
	if (dc->flush.dev)
		dm_put_device(ti, dc->flush.dev);
	if (dc->worker)
		kthread_stop(dc->worker);

	mutex_destroy(&dc->process_bios_lock);

	kfree(dc);
}

static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
{
	int ret;
	unsigned long long tmpll;
	char dummy;

	if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
		ti->error = "Invalid device sector";
		return -EINVAL;
	}
	c->start = tmpll;

	if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
		ti->error = "Invalid delay";
		return -EINVAL;
	}

	ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		return ret;
	}

	return 0;
}

/*
 * Mapping parameters:
 *    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
 *
 * With separate write parameters, the first set is only used for reads.
 * Offsets are specified in sectors.
 * Delays are specified in milliseconds.
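 *
 * For example (placeholder device and size), a table line that passes
 * reads through undelayed while delaying writes and flushes by 500 ms:
 *    0 <num_sectors> delay /dev/sdX 0 0 /dev/sdX 0 500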
 */
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct delay_c *dc;
	int ret;
	unsigned int max_delay, min_delay;

	if (argc != 3 && argc != 6 && argc != 9) {
		ti->error = "Requires exactly 3, 6 or 9 arguments";
		return -EINVAL;
	}

	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
	if (!dc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}

	ti->private = dc;
	INIT_LIST_HEAD(&dc->delayed_bios);
	mutex_init(&dc->process_bios_lock);
	spin_lock_init(&dc->delayed_bios_lock);
	dc->may_delay = true;
	dc->argc = argc;

	ret = delay_class_ctr(ti, &dc->read, argv);
	if (ret)
		goto bad;
	min_delay = max_delay = dc->read.delay;

	if (argc == 3) {
		ret = delay_class_ctr(ti, &dc->write, argv);
		if (ret)
			goto bad;
		ret = delay_class_ctr(ti, &dc->flush, argv);
		if (ret)
			goto bad;
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->write, argv + 3);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->write.delay);
	min_delay = min_not_zero(min_delay, dc->write.delay);

	if (argc == 6) {
		ret = delay_class_ctr(ti, &dc->flush, argv + 3);
		if (ret)
			goto bad;
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->flush, argv + 6);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->flush.delay);
	min_delay = min_not_zero(min_delay, dc->flush.delay);

out:
	if (max_delay < 50) {
		if (min_delay >> SLEEP_SHIFT)
			dc->worker_sleep_us = 1000;
		else
			dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT;
		/*
		 * In case of small requested delays, use kthread instead of
		 * timers and workqueue to achieve better latency.
		 */
		dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker");
		if (IS_ERR(dc->worker)) {
			ret = PTR_ERR(dc->worker);
			dc->worker = NULL;
			goto bad;
		}
	} else {
		timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
		INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
		dc->kdelayd_wq = alloc_workqueue("kdelayd",
						 WQ_MEM_RECLAIM | WQ_PERCPU, 0);
		if (!dc->kdelayd_wq) {
			ret = -EINVAL;
			DMERR("Couldn't start kdelayd");
			goto bad;
		}
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->accounts_remapped_io = true;
	ti->per_io_data_size = sizeof(struct dm_delay_info);
	return 0;

bad:
	delay_dtr(ti);
	return ret;
}

static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
{
	struct dm_delay_info *delayed;
	unsigned long expires = 0;

	if (!c->delay)
		return DM_MAPIO_REMAPPED;

	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	delayed->context = dc;
	delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);

	spin_lock(&dc->delayed_bios_lock);
	if (unlikely(!dc->may_delay)) {
		spin_unlock(&dc->delayed_bios_lock);
		return DM_MAPIO_REMAPPED;
	}
	c->ops++;
	list_add_tail(&delayed->list, &dc->delayed_bios);
	spin_unlock(&dc->delayed_bios_lock);

	if (delay_is_fast(dc))
		wake_up_process(dc->worker);
	else
		queue_timeout(dc, expires);

	return DM_MAPIO_SUBMITTED;
}

static void delay_presuspend(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	spin_lock(&dc->delayed_bios_lock);
	dc->may_delay = false;
	spin_unlock(&dc->delayed_bios_lock);

	if (!delay_is_fast(dc))
		timer_delete(&dc->delay_timer);
	flush_delayed_bios(dc, true);
}

static void delay_resume(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	dc->may_delay = true;
}

static int delay_map(struct dm_target *ti, struct bio *bio)
{
	struct delay_c *dc = ti->private;
	struct delay_class *c;
	struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	if (bio_data_dir(bio) == WRITE) {
		if (unlikely(bio->bi_opf & REQ_PREFLUSH))
			c = &dc->flush;
		else
			c = &dc->write;
	} else {
		c = &dc->read;
	}
	delayed->class = c;
	bio_set_dev(bio, c->dev->bdev);
	bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	return delay_bio(dc, c, bio);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int delay_report_zones(struct dm_target *ti,
			      struct dm_report_zones_args *args, unsigned int nr_zones)
{
	struct delay_c *dc = ti->private;
	struct delay_class *c = &dc->read;

	return dm_report_zones(c->dev->bdev, c->start,
			       c->start + dm_target_offset(ti, args->next_sector),
			       args, nr_zones);
}
#else
#define delay_report_zones	NULL
#endif

#define DMEMIT_DELAY_CLASS(c) \
	DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)

static void delay_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct delay_c *dc = ti->private;
	int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
		break;

	case STATUSTYPE_TABLE:
		DMEMIT_DELAY_CLASS(&dc->read);
		if (dc->argc >= 6) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->write);
		}
		if (dc->argc >= 9) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->flush);
		}
		break;

	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

static int delay_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	struct delay_c *dc = ti->private;
	int ret = 0;

	ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
	if (ret)
		goto out;

out:
	return ret;
}

static struct target_type delay_target = {
	.name		 = "delay",
	.version	 = {1, 5, 0},
	.features	 = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
	.module		 = THIS_MODULE,
	.ctr		 = delay_ctr,
	.dtr		 = delay_dtr,
	.map		 = delay_map,
	.report_zones	 = delay_report_zones,
	.presuspend	 = delay_presuspend,
	.resume		 = delay_resume,
	.status		 = delay_status,
	.iterate_devices = delay_iterate_devices,
};
module_dm(delay);

MODULE_DESCRIPTION(DM_NAME " delay target");
MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
MODULE_LICENSE("GPL");