1 /* 2 * Copyright (C) 2020 Red Hat GmbH 3 * 4 * This file is released under the GPL. 5 * 6 * Device-mapper target to emulate smaller logical block 7 * size on backing devices exposing (natively) larger ones. 8 * 9 * E.g. 512 byte sector emulation on 4K native disks. 10 */ 11 12 #include "dm.h" 13 #include <linux/module.h> 14 #include <linux/workqueue.h> 15 #include <linux/dm-bufio.h> 16 17 #define DM_MSG_PREFIX "ebs" 18 19 static void ebs_dtr(struct dm_target *ti); 20 21 /* Emulated block size context. */ 22 struct ebs_c { 23 struct dm_dev *dev; /* Underlying device to emulate block size on. */ 24 struct dm_bufio_client *bufio; /* Use dm-bufio for read and read-modify-write processing. */ 25 struct workqueue_struct *wq; /* Workqueue for ^ processing of bios. */ 26 struct work_struct ws; /* Work item used for ^. */ 27 struct bio_list bios_in; /* Worker bios input list. */ 28 spinlock_t lock; /* Guard bios input list above. */ 29 sector_t start; /* <start> table line argument, see ebs_ctr below. */ 30 unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */ 31 unsigned int u_bs; /* Underlying block size in sectors retrieved from/set on lower layer device. */ 32 unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */ 33 bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */ 34 }; 35 36 static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector) 37 { 38 return sector >> ec->block_shift; 39 } 40 41 static inline sector_t __block_mod(sector_t sector, unsigned int bs) 42 { 43 return sector & (bs - 1); 44 } 45 46 /* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */ 47 static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio) 48 { 49 sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio); 50 51 return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0); 52 } 53 54 static inline bool __ebs_check_bs(unsigned int bs) 55 { 56 return bs && is_power_of_2(bs); 57 } 58 59 /* 60 * READ/WRITE: 61 * 62 * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. 63 */ 64 static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv, 65 struct bvec_iter *iter) 66 { 67 int r = 0; 68 unsigned char *ba, *pa; 69 unsigned int cur_len; 70 unsigned int bv_len = bv->bv_len; 71 unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs)); 72 sector_t block = __sector_to_block(ec, iter->bi_sector); 73 struct dm_buffer *b; 74 75 if (unlikely(!bv->bv_page || !bv_len)) 76 return -EIO; 77 78 pa = bvec_virt(bv); 79 80 /* Handle overlapping page <-> blocks */ 81 while (bv_len) { 82 cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len); 83 84 /* Avoid reading for writes in case bio vector's page overwrites block completely. */ 85 if (op == REQ_OP_READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) 86 ba = dm_bufio_read(ec->bufio, block, &b); 87 else 88 ba = dm_bufio_new(ec->bufio, block, &b); 89 90 if (IS_ERR(ba)) { 91 /* 92 * Carry on with next buffer, if any, to issue all possible 93 * data but return error. 94 */ 95 r = PTR_ERR(ba); 96 } else { 97 /* Copy data to/from bio to buffer if read/new was successful above. */ 98 ba += buf_off; 99 if (op == REQ_OP_READ) { 100 memcpy(pa, ba, cur_len); 101 flush_dcache_page(bv->bv_page); 102 } else { 103 flush_dcache_page(bv->bv_page); 104 memcpy(ba, pa, cur_len); 105 dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); 106 } 107 108 dm_bufio_release(b); 109 } 110 111 pa += cur_len; 112 bv_len -= cur_len; 113 buf_off = 0; 114 block++; 115 } 116 117 return r; 118 } 119 120 /* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */ 121 static int __ebs_rw_bio(struct ebs_c *ec, enum req_op op, struct bio *bio) 122 { 123 int r = 0, rr; 124 struct bio_vec bv; 125 struct bvec_iter iter; 126 127 bio_for_each_bvec(bv, bio, iter) { 128 rr = __ebs_rw_bvec(ec, op, &bv, &iter); 129 if (rr) 130 r = rr; 131 } 132 133 return r; 134 } 135 136 /* 137 * Discard bio's blocks, i.e. pass discards down. 138 * 139 * Avoid discarding partial blocks at beginning and end; 140 * return 0 in case no blocks can be discarded as a result. 141 */ 142 static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio) 143 { 144 sector_t block, blocks, sector = bio->bi_iter.bi_sector; 145 146 block = __sector_to_block(ec, sector); 147 blocks = __nr_blocks(ec, bio); 148 149 /* 150 * Partial first underlying block (__nr_blocks() may have 151 * resulted in one block). 152 */ 153 if (__block_mod(sector, ec->u_bs)) { 154 block++; 155 blocks--; 156 } 157 158 /* Partial last underlying block if any. */ 159 if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs)) 160 blocks--; 161 162 return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0; 163 } 164 165 /* Release blocks them from the bufio cache. */ 166 static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) 167 { 168 sector_t blocks, sector = bio->bi_iter.bi_sector; 169 170 blocks = __nr_blocks(ec, bio); 171 172 dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks); 173 } 174 175 /* Worker function to process incoming bios. */ 176 static void __ebs_process_bios(struct work_struct *ws) 177 { 178 int r; 179 bool write = false; 180 sector_t block1, block2; 181 struct ebs_c *ec = container_of(ws, struct ebs_c, ws); 182 struct bio *bio; 183 struct bio_list bios; 184 185 bio_list_init(&bios); 186 187 spin_lock_irq(&ec->lock); 188 bios = ec->bios_in; 189 bio_list_init(&ec->bios_in); 190 spin_unlock_irq(&ec->lock); 191 192 /* Prefetch all read and any mis-aligned write buffers */ 193 bio_list_for_each(bio, &bios) { 194 block1 = __sector_to_block(ec, bio->bi_iter.bi_sector); 195 if (bio_op(bio) == REQ_OP_READ) 196 dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio)); 197 else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) { 198 block2 = __sector_to_block(ec, bio_end_sector(bio)); 199 if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs)) 200 dm_bufio_prefetch(ec->bufio, block1, 1); 201 if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1) 202 dm_bufio_prefetch(ec->bufio, block2, 1); 203 } 204 } 205 206 bio_list_for_each(bio, &bios) { 207 r = -EIO; 208 if (bio_op(bio) == REQ_OP_READ) 209 r = __ebs_rw_bio(ec, REQ_OP_READ, bio); 210 else if (bio_op(bio) == REQ_OP_WRITE) { 211 write = true; 212 r = __ebs_rw_bio(ec, REQ_OP_WRITE, bio); 213 } else if (bio_op(bio) == REQ_OP_DISCARD) { 214 __ebs_forget_bio(ec, bio); 215 r = __ebs_discard_bio(ec, bio); 216 } 217 218 if (r < 0) 219 bio->bi_status = errno_to_blk_status(r); 220 } 221 222 /* 223 * We write dirty buffers after processing I/O on them 224 * but before we endio thus addressing REQ_FUA/REQ_SYNC. 225 */ 226 r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0; 227 228 while ((bio = bio_list_pop(&bios))) { 229 /* Any other request is endioed. */ 230 if (unlikely(r && bio_op(bio) == REQ_OP_WRITE)) 231 bio_io_error(bio); 232 else 233 bio_endio(bio); 234 } 235 } 236 237 /* 238 * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>] 239 * 240 * <dev_path>: path of the underlying device 241 * <offset>: offset in 512 bytes sectors into <dev_path> 242 * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer 243 * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer; 244 * optional, if not supplied, retrieve logical block size from underlying device 245 */ 246 static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) 247 { 248 int r; 249 unsigned short tmp1; 250 unsigned long long tmp; 251 char dummy; 252 struct ebs_c *ec; 253 254 if (argc < 3 || argc > 4) { 255 ti->error = "Invalid argument count"; 256 return -EINVAL; 257 } 258 259 ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL); 260 if (!ec) { 261 ti->error = "Cannot allocate ebs context"; 262 return -ENOMEM; 263 } 264 265 r = -EINVAL; 266 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || 267 tmp != (sector_t)tmp || 268 (sector_t)tmp >= ti->len) { 269 ti->error = "Invalid device offset sector"; 270 goto bad; 271 } 272 ec->start = tmp; 273 274 if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 || 275 !__ebs_check_bs(tmp1) || 276 to_bytes(tmp1) > PAGE_SIZE) { 277 ti->error = "Invalid emulated block size"; 278 goto bad; 279 } 280 ec->e_bs = tmp1; 281 282 if (argc > 3) { 283 if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) { 284 ti->error = "Invalid underlying block size"; 285 goto bad; 286 } 287 ec->u_bs = tmp1; 288 ec->u_bs_set = true; 289 } else 290 ec->u_bs_set = false; 291 292 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev); 293 if (r) { 294 ti->error = "Device lookup failed"; 295 ec->dev = NULL; 296 goto bad; 297 } 298 299 r = -EINVAL; 300 if (!ec->u_bs_set) { 301 ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev)); 302 if (!__ebs_check_bs(ec->u_bs)) { 303 ti->error = "Invalid retrieved underlying block size"; 304 goto bad; 305 } 306 } 307 308 if (!ec->u_bs_set && ec->e_bs == ec->u_bs) 309 DMINFO("Emulation superfluous: emulated equal to underlying block size"); 310 311 if (__block_mod(ec->start, ec->u_bs)) { 312 ti->error = "Device offset must be multiple of underlying block size"; 313 goto bad; 314 } 315 316 ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 317 0, NULL, NULL, 0); 318 if (IS_ERR(ec->bufio)) { 319 ti->error = "Cannot create dm bufio client"; 320 r = PTR_ERR(ec->bufio); 321 ec->bufio = NULL; 322 goto bad; 323 } 324 325 ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 326 if (!ec->wq) { 327 ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue"; 328 r = -ENOMEM; 329 goto bad; 330 } 331 332 ec->block_shift = __ffs(ec->u_bs); 333 INIT_WORK(&ec->ws, &__ebs_process_bios); 334 bio_list_init(&ec->bios_in); 335 spin_lock_init(&ec->lock); 336 337 ti->num_flush_bios = 1; 338 ti->num_discard_bios = 1; 339 ti->num_secure_erase_bios = 0; 340 ti->num_write_zeroes_bios = 0; 341 return 0; 342 bad: 343 ebs_dtr(ti); 344 return r; 345 } 346 347 static void ebs_dtr(struct dm_target *ti) 348 { 349 struct ebs_c *ec = ti->private; 350 351 if (ec->wq) 352 destroy_workqueue(ec->wq); 353 if (ec->bufio) 354 dm_bufio_client_destroy(ec->bufio); 355 if (ec->dev) 356 dm_put_device(ti, ec->dev); 357 kfree(ec); 358 } 359 360 static int ebs_map(struct dm_target *ti, struct bio *bio) 361 { 362 struct ebs_c *ec = ti->private; 363 364 bio_set_dev(bio, ec->dev->bdev); 365 bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); 366 367 if (unlikely(bio_op(bio) == REQ_OP_FLUSH)) 368 return DM_MAPIO_REMAPPED; 369 /* 370 * Only queue for bufio processing in case of partial or overlapping buffers 371 * -or- 372 * emulation with ebs == ubs aiming for tests of dm-bufio overhead. 373 */ 374 if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) || 375 __block_mod(bio_end_sector(bio), ec->u_bs) || 376 ec->e_bs == ec->u_bs)) { 377 spin_lock_irq(&ec->lock); 378 bio_list_add(&ec->bios_in, bio); 379 spin_unlock_irq(&ec->lock); 380 381 queue_work(ec->wq, &ec->ws); 382 383 return DM_MAPIO_SUBMITTED; 384 } 385 386 /* Forget any buffer content relative to this direct backing device I/O. */ 387 __ebs_forget_bio(ec, bio); 388 389 return DM_MAPIO_REMAPPED; 390 } 391 392 static void ebs_status(struct dm_target *ti, status_type_t type, 393 unsigned status_flags, char *result, unsigned maxlen) 394 { 395 struct ebs_c *ec = ti->private; 396 397 switch (type) { 398 case STATUSTYPE_INFO: 399 *result = '\0'; 400 break; 401 case STATUSTYPE_TABLE: 402 snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u", 403 ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs); 404 break; 405 case STATUSTYPE_IMA: 406 *result = '\0'; 407 break; 408 } 409 } 410 411 static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 412 { 413 struct ebs_c *ec = ti->private; 414 struct dm_dev *dev = ec->dev; 415 416 /* 417 * Only pass ioctls through if the device sizes match exactly. 418 */ 419 *bdev = dev->bdev; 420 return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev)); 421 } 422 423 static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) 424 { 425 struct ebs_c *ec = ti->private; 426 427 limits->logical_block_size = to_bytes(ec->e_bs); 428 limits->physical_block_size = to_bytes(ec->u_bs); 429 limits->alignment_offset = limits->physical_block_size; 430 blk_limits_io_min(limits, limits->logical_block_size); 431 } 432 433 static int ebs_iterate_devices(struct dm_target *ti, 434 iterate_devices_callout_fn fn, void *data) 435 { 436 struct ebs_c *ec = ti->private; 437 438 return fn(ti, ec->dev, ec->start, ti->len, data); 439 } 440 441 static struct target_type ebs_target = { 442 .name = "ebs", 443 .version = {1, 0, 1}, 444 .features = DM_TARGET_PASSES_INTEGRITY, 445 .module = THIS_MODULE, 446 .ctr = ebs_ctr, 447 .dtr = ebs_dtr, 448 .map = ebs_map, 449 .status = ebs_status, 450 .io_hints = ebs_io_hints, 451 .prepare_ioctl = ebs_prepare_ioctl, 452 .iterate_devices = ebs_iterate_devices, 453 }; 454 455 static int __init dm_ebs_init(void) 456 { 457 int r = dm_register_target(&ebs_target); 458 459 if (r < 0) 460 DMERR("register failed %d", r); 461 462 return r; 463 } 464 465 static void dm_ebs_exit(void) 466 { 467 dm_unregister_target(&ebs_target); 468 } 469 470 module_init(dm_ebs_init); 471 module_exit(dm_ebs_exit); 472 473 MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); 474 MODULE_DESCRIPTION(DM_NAME " emulated block size target"); 475 MODULE_LICENSE("GPL"); 476