// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - a block device whose IO is handled from userspace
 *
 * Make full use of the io_uring passthrough command for communicating with
 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
 *
 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
 *
 * (part of code stolen from loop.c)
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
#include <linux/io_uring.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <linux/task_work.h>
#include <uapi/linux/ublk_cmd.h>

#define UBLK_MINORS		(1U << MINORBITS)

/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL	(UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK)

struct ublk_rq_data {
	struct callback_head work;
};

struct ublk_uring_cmd_pdu {
	struct request *req;
};

/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by the ublk driver and is
 * waiting for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command has been completed and is owned
 * by the ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used for
 * cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * IO command is aborted, so this flag is set in case of
 * !UBLK_IO_FLAG_ACTIVE.
 *
 * After this flag is observed, any pending or new incoming request
 * associated with this io command will be failed immediately
 */
#define UBLK_IO_FLAG_ABORTED 0x04

struct ublk_io {
	/* userspace buffer address from io cmd */
	__u64	addr;
	unsigned int	flags;
	int	res;

	struct io_uring_cmd *cmd;
};

struct ublk_queue {
	int q_id;
	int q_depth;

	unsigned long flags;
	struct task_struct	*ubq_daemon;
	char *io_cmd_buf;

	unsigned long io_addr;	/* mapped vm address */
	unsigned int max_io_sz;
	bool abort_work_pending;
	unsigned short nr_io_ready;	/* how many ios setup */
	struct ublk_device *dev;
	struct ublk_io ios[0];
};

#define UBLK_DAEMON_MONITOR_PERIOD	(5 * HZ)

struct ublk_device {
	struct gendisk		*ub_disk;

	char	*__queues;

	unsigned short  queue_size;
	unsigned short  bs_shift;
	struct ublksrv_ctrl_dev_info	dev_info;

	struct blk_mq_tag_set	tag_set;

	struct cdev		cdev;
	struct device		cdev_dev;

#define UB_STATE_OPEN		0
#define UB_STATE_USED		1
	unsigned long		state;
	int			ub_number;

	struct mutex		mutex;

	spinlock_t		mm_lock;
	struct mm_struct	*mm;

	struct completion	completion;
	unsigned int		nr_queues_ready;
	atomic_t		nr_aborted_queues;

	/*
	 * Our ubq->daemon may be killed without any notification, so
	 * monitor each queue's daemon periodically
	 */
	struct delayed_work	monitor_work;
	struct work_struct	stop_work;
};

static dev_t ublk_chr_devt;
static struct class *ublk_chr_class;

static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

static DEFINE_MUTEX(ublk_ctl_mutex);

static struct miscdevice ublk_misc;

static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
{
	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
			!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
		return true;
	return false;
}

static struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
		return ub;
	return NULL;
}

static void ublk_put_device(struct ublk_device *ub)
{
	put_device(&ub->cdev_dev);
}

static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
		int qid)
{
	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
}

static inline bool ublk_rq_has_data(const struct request *rq)
{
	return rq->bio && bio_has_data(rq->bio);
}

static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
		int tag)
{
	return (struct ublksrv_io_desc *)
		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
}

static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
	return ublk_get_queue(ub, q_id)->io_cmd_buf;
}

static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
			PAGE_SIZE);
}

static void ublk_free_disk(struct gendisk *disk)
{
	struct ublk_device *ub = disk->private_data;

	clear_bit(UB_STATE_USED, &ub->state);
	put_device(&ub->cdev_dev);
}

static const struct block_device_operations ub_fops = {
	.owner =	THIS_MODULE,
	.free_disk =	ublk_free_disk,
};

#define UBLK_MAX_PIN_PAGES	32

struct ublk_map_data {
	const struct ublk_queue *ubq;
	const struct request *rq;
	const struct ublk_io *io;
	unsigned max_bytes;
};

struct ublk_io_iter {
	struct page *pages[UBLK_MAX_PIN_PAGES];
	unsigned pg_off;	/* offset in the 1st page in pages */
	int nr_pages;		/* how many page pointers in pages */
	struct bio *bio;
	struct bvec_iter iter;
};

static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
		unsigned max_bytes, bool to_vm)
{
	const unsigned total = min_t(unsigned, max_bytes,
			PAGE_SIZE - data->pg_off +
			((data->nr_pages - 1) << PAGE_SHIFT));
	unsigned done = 0;
	unsigned pg_idx = 0;

	while (done < total) {
		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
		const unsigned int bytes = min3(bv.bv_len, total - done,
				(unsigned)(PAGE_SIZE - data->pg_off));
		void *bv_buf = bvec_kmap_local(&bv);
		void *pg_buf = kmap_local_page(data->pages[pg_idx]);

		if (to_vm)
			memcpy(pg_buf + data->pg_off, bv_buf, bytes);
		else
			memcpy(bv_buf, pg_buf + data->pg_off, bytes);

		kunmap_local(pg_buf);
		kunmap_local(bv_buf);

		/* advance page array */
		data->pg_off += bytes;
		if (data->pg_off == PAGE_SIZE) {
			pg_idx += 1;
			data->pg_off = 0;
		}

		done += bytes;

		/* advance bio */
		bio_advance_iter_single(data->bio, &data->iter, bytes);
		if (!data->iter.bi_size) {
			data->bio = data->bio->bi_next;
			if (data->bio == NULL)
				break;
			data->iter = data->bio->bi_iter;
		}
	}

	return done;
}
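
/*
 * Pin the userspace buffer in batches of up to UBLK_MAX_PIN_PAGES pages and
 * copy between the pinned pages and the request's bio segments, in the
 * direction selected by @to_vm.
 */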
static inline int ublk_copy_user_pages(struct ublk_map_data *data,
		bool to_vm)
{
	const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
	const unsigned long start_vm = data->io->addr;
	unsigned int done = 0;
	struct ublk_io_iter iter = {
		.pg_off	= start_vm & (PAGE_SIZE - 1),
		.bio	= data->rq->bio,
		.iter	= data->rq->bio->bi_iter,
	};
	const unsigned int nr_pages = round_up(data->max_bytes +
			(start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;

	while (done < nr_pages) {
		const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
				nr_pages - done);
		unsigned i, len;

		iter.nr_pages = get_user_pages_fast(start_vm +
				(done << PAGE_SHIFT), to_pin, gup_flags,
				iter.pages);
		if (iter.nr_pages <= 0)
			return done == 0 ? iter.nr_pages : done;
		len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
		for (i = 0; i < iter.nr_pages; i++) {
			if (to_vm)
				set_page_dirty(iter.pages[i]);
			put_page(iter.pages[i]);
		}
		data->max_bytes -= len;
		done += iter.nr_pages;
	}

	return done;
}

static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);
	/*
	 * no zero copy, we delay copy WRITE request data into ublksrv
	 * context and the big benefit is that pinning pages in current
	 * context is pretty fast, see ublk_copy_user_pages
	 */
	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
		return rq_bytes;

	if (ublk_rq_has_data(req)) {
		struct ublk_map_data data = {
			.ubq	=	ubq,
			.rq	=	req,
			.io	=	io,
			.max_bytes =	rq_bytes,
		};

		ublk_copy_user_pages(&data, true);

		return rq_bytes - data.max_bytes;
	}
	return rq_bytes;
}

static int ublk_unmap_io(const struct ublk_queue *ubq,
		const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
		struct ublk_map_data data = {
			.ubq	=	ubq,
			.rq	=	req,
			.io	=	io,
			.max_bytes =	io->res,
		};

		WARN_ON_ONCE(io->res > rq_bytes);

		ublk_copy_user_pages(&data, false);

		return io->res - data.max_bytes;
	}
	return rq_bytes;
}

static inline unsigned int ublk_req_build_flags(struct request *req)
{
	unsigned flags = 0;

	if (req->cmd_flags & REQ_FAILFAST_DEV)
		flags |= UBLK_IO_F_FAILFAST_DEV;

	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;

	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
		flags |= UBLK_IO_F_FAILFAST_DRIVER;

	if (req->cmd_flags & REQ_META)
		flags |= UBLK_IO_F_META;

	if (req->cmd_flags & REQ_FUA)
		flags |= UBLK_IO_F_FUA;

	if (req->cmd_flags & REQ_NOUNMAP)
		flags |= UBLK_IO_F_NOUNMAP;

	if (req->cmd_flags & REQ_SWAP)
		flags |= UBLK_IO_F_SWAP;

	return flags;
}

static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_READ:
		ublk_op = UBLK_IO_OP_READ;
		break;
	case REQ_OP_WRITE:
		ublk_op = UBLK_IO_OP_WRITE;
		break;
	case REQ_OP_FLUSH:
		ublk_op = UBLK_IO_OP_FLUSH;
		break;
	case REQ_OP_DISCARD:
		ublk_op = UBLK_IO_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
		break;
	default:
		return BLK_STS_IOERR;
	}

	/* need to translate since kernel may change */
	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->addr;

	return BLK_STS_OK;
}

static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
}

static bool ubq_daemon_is_dying(struct ublk_queue *ubq)
{
	return ubq->ubq_daemon->flags & PF_EXITING;
}
/* todo: handle partial completion */
static void ublk_complete_rq(struct request *req)
{
	struct ublk_queue *ubq = req->mq_hctx->driver_data;
	struct ublk_io *io = &ubq->ios[req->tag];
	unsigned int unmapped_bytes;

	/* failed read IO if nothing is read */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		blk_mq_end_request(req, errno_to_blk_status(io->res));
		return;
	}

	/*
	 * FLUSH or DISCARD usually won't return valid bytes, so end them
	 * directly.
	 *
	 * Neither of the two needs unmapping.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
		blk_mq_end_request(req, BLK_STS_OK);
		return;
	}

	/* for READ request, writing data in iod->addr to rq buffers */
	unmapped_bytes = ublk_unmap_io(ubq, req, io);

	/*
	 * Extremely unlikely since we just filled in the data.
	 *
	 * Re-read simply for this unlikely case.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	if (blk_update_request(req, BLK_STS_OK, io->res))
		blk_mq_requeue_request(req, true);
	else
		__blk_mq_end_request(req, BLK_STS_OK);
}

/*
 * __ublk_fail_req() may be called from abort context or ->ubq_daemon
 * context during exiting, so lock is required.
 *
 * Also aborting may not be started yet, keep in mind that one failed
 * request may be issued by block layer again.
 */
static void __ublk_fail_req(struct ublk_io *io, struct request *req)
{
	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);

	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
		io->flags |= UBLK_IO_FLAG_ABORTED;
		blk_mq_end_request(req, BLK_STS_IOERR);
	}
}

#define UBLK_REQUEUE_DELAY_MS	3

static inline void __ublk_rq_task_work(struct request *req)
{
	struct ublk_queue *ubq = req->mq_hctx->driver_data;
	struct ublk_device *ub = ubq->dev;
	int tag = req->tag;
	struct ublk_io *io = &ubq->ios[tag];
	bool task_exiting = current != ubq->ubq_daemon ||
		(current->flags & PF_EXITING);
	unsigned int mapped_bytes;

	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
			ublk_get_iod(ubq, req->tag)->addr);

	if (unlikely(task_exiting)) {
		blk_mq_end_request(req, BLK_STS_IOERR);
		mod_delayed_work(system_wq, &ub->monitor_work, 0);
		return;
	}

	mapped_bytes = ublk_map_io(ubq, req, io);

	/* partially mapped, update io descriptor */
	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
		/*
		 * Nothing mapped, retry until we succeed.
		 *
		 * We may never succeed in mapping any bytes here because
		 * of OOM. TODO: reserve one buffer with single page pinned
		 * for providing forward progress guarantee.
		 */
		if (unlikely(!mapped_bytes)) {
			blk_mq_requeue_request(req, false);
			blk_mq_delay_kick_requeue_list(req->q,
					UBLK_REQUEUE_DELAY_MS);
			return;
		}

		ublk_get_iod(ubq, req->tag)->nr_sectors =
			mapped_bytes >> 9;
	}

	/* mark this cmd owned by ublksrv */
	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;

	/*
	 * clear ACTIVE since we are done with this sqe/cmd slot
	 * We can only accept io cmd in case of being not active.
	 */
	io->flags &= ~UBLK_IO_FLAG_ACTIVE;

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
}

static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

	__ublk_rq_task_work(pdu->req);
}

static void ublk_rq_task_work_fn(struct callback_head *work)
{
	struct ublk_rq_data *data = container_of(work,
			struct ublk_rq_data, work);
	struct request *req = blk_mq_rq_from_pdu(data);

	__ublk_rq_task_work(req);
}

static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct ublk_queue *ubq = hctx->driver_data;
	struct request *rq = bd->rq;
	blk_status_t res;

	/* fill iod to slot in io cmd buffer */
	res = ublk_setup_iod(ubq, rq);
	if (unlikely(res != BLK_STS_OK))
		return BLK_STS_IOERR;

	blk_mq_start_request(bd->rq);

	if (unlikely(ubq_daemon_is_dying(ubq))) {
 fail:
		mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
		return BLK_STS_IOERR;
	}

	if (ublk_can_use_task_work(ubq)) {
		struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
		enum task_work_notify_mode notify_mode = bd->last ?
			TWA_SIGNAL_NO_IPI : TWA_NONE;

		if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
			goto fail;
	} else {
		struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

		pdu->req = rq;
		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
	}

	return BLK_STS_OK;
}

static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct ublk_queue *ubq = hctx->driver_data;

	if (ublk_can_use_task_work(ubq))
		__set_notify_signal(ubq->ubq_daemon);
}

static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
		unsigned int hctx_idx)
{
	struct ublk_device *ub = driver_data;
	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);

	hctx->driver_data = ubq;
	return 0;
}

static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);

	init_task_work(&data->work, ublk_rq_task_work_fn);
	return 0;
}

static const struct blk_mq_ops ublk_mq_ops = {
	.queue_rq       = ublk_queue_rq,
	.commit_rqs     = ublk_commit_rqs,
	.init_hctx	= ublk_init_hctx,
	.init_request   = ublk_init_rq,
};

static int ublk_ch_open(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = container_of(inode->i_cdev,
			struct ublk_device, cdev);

	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
		return -EBUSY;
	filp->private_data = ub;
	return 0;
}

static int ublk_ch_release(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = filp->private_data;

	clear_bit(UB_STATE_OPEN, &ub->state);
	return 0;
}

/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct ublk_device *ub = filp->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
	int q_id, ret = 0;

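	/*
	 * Only one process may map the command buffers: record the mm of the
	 * first caller and reject mmap attempts from any other mm.
	 */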
	spin_lock(&ub->mm_lock);
	if (!ub->mm)
		ub->mm = current->mm;
	if (current->mm != ub->mm)
		ret = -EINVAL;
	spin_unlock(&ub->mm_lock);

	if (ret)
		return ret;

	if (vma->vm_flags & VM_WRITE)
		return -EPERM;

	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
		return -EINVAL;

	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
			__func__, q_id, current->pid, vma->vm_start,
			phys_off, (unsigned long)sz);

	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
		return -EINVAL;

	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

static void ublk_commit_completion(struct ublk_device *ub,
		struct ublksrv_io_cmd *ub_cmd)
{
	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
	struct ublk_io *io = &ubq->ios[tag];
	struct request *req;

	/* now this cmd slot is owned by ublk driver */
	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
	io->res = ub_cmd->result;

	/* find the io request and complete */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);

	if (req && likely(!blk_should_fake_timeout(req->q)))
		ublk_complete_rq(req);
}

/*
 * When ->ubq_daemon is exiting, either new request is ended immediately,
 * or any queued io command is drained, so it is safe to abort queue
 * lockless
 */
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int i;

	if (!ublk_get_device(ub))
		return;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
			struct request *rq;

			/*
			 * Either we fail the request or ublk_rq_task_work_fn
			 * will do it
			 */
			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
			if (rq)
				__ublk_fail_req(io, rq);
		}
	}
	ublk_put_device(ub);
}

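/*
 * Check each queue's daemon periodically; once a daemon starts exiting,
 * schedule stop_work and abort its queue so outstanding requests are
 * failed instead of hanging forever.
 */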
static void ublk_daemon_monitor_work(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, monitor_work.work);
	int i;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
		struct ublk_queue *ubq = ublk_get_queue(ub, i);

		if (ubq_daemon_is_dying(ubq)) {
			schedule_work(&ub->stop_work);

			/* abort queue is for making forward progress */
			ublk_abort_queue(ub, ubq);
		}
	}

	/*
	 * We can't schedule monitor work after ublk_remove() is started.
	 *
	 * No need ub->mutex, monitor work is canceled after state is marked
	 * as DEAD, so DEAD state is observed reliably.
	 */
	if (ub->dev_info.state != UBLK_S_DEV_DEAD)
		schedule_delayed_work(&ub->monitor_work,
				UBLK_DAEMON_MONITOR_PERIOD);
}

static void ublk_cancel_queue(struct ublk_queue *ubq)
{
	int i;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		if (io->flags & UBLK_IO_FLAG_ACTIVE)
			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
	}
}

/* Cancel all pending commands, must be called after del_gendisk() returns */
static void ublk_cancel_dev(struct ublk_device *ub)
{
	int i;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_cancel_queue(ublk_get_queue(ub, i));
}

static void ublk_stop_dev(struct ublk_device *ub)
{
	mutex_lock(&ub->mutex);
	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
		goto unlock;

	del_gendisk(ub->ub_disk);
	ub->dev_info.state = UBLK_S_DEV_DEAD;
	ub->dev_info.ublksrv_pid = -1;
	ublk_cancel_dev(ub);
	put_disk(ub->ub_disk);
	ub->ub_disk = NULL;
 unlock:
	mutex_unlock(&ub->mutex);
	cancel_delayed_work_sync(&ub->monitor_work);
}

static inline bool ublk_queue_ready(struct ublk_queue *ubq)
{
	return ubq->nr_io_ready == ubq->q_depth;
}

/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
{
	mutex_lock(&ub->mutex);
	ubq->nr_io_ready++;
	if (ublk_queue_ready(ubq)) {
		ubq->ubq_daemon = current;
		get_task_struct(ubq->ubq_daemon);
		ub->nr_queues_ready++;
	}
	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
		complete_all(&ub->completion);
	mutex_unlock(&ub->mutex);
}

static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
	struct ublk_device *ub = cmd->file->private_data;
	struct ublk_queue *ubq;
	struct ublk_io *io;
	u32 cmd_op = cmd->cmd_op;
	unsigned tag = ub_cmd->tag;
	int ret = -EINVAL;

	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
			ub_cmd->result);

	if (!(issue_flags & IO_URING_F_SQE128))
		goto out;

	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
		goto out;

	ubq = ublk_get_queue(ub, ub_cmd->q_id);
	if (!ubq || ub_cmd->q_id != ubq->q_id)
		goto out;

	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
		goto out;

	if (tag >= ubq->q_depth)
		goto out;

	io = &ubq->ios[tag];

	/* there is pending io cmd, something must be wrong */
	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
		ret = -EBUSY;
		goto out;
	}

	switch (cmd_op) {
	case UBLK_IO_FETCH_REQ:
		/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
		if (ublk_queue_ready(ubq)) {
			ret = -EBUSY;
			goto out;
		}
		/*
		 * The io is being handled by server, so COMMIT_RQ is expected
		 * instead of FETCH_REQ
		 */
		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
			goto out;
		/* FETCH_RQ has to provide IO buffer */
		if (!ub_cmd->addr)
			goto out;
		io->cmd = cmd;
		io->flags |= UBLK_IO_FLAG_ACTIVE;
		io->addr = ub_cmd->addr;

		ublk_mark_io_ready(ub, ubq);
		break;
	case UBLK_IO_COMMIT_AND_FETCH_REQ:
		/* FETCH_RQ has to provide IO buffer */
		if (!ub_cmd->addr)
			goto out;
		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			goto out;
		io->addr = ub_cmd->addr;
		io->flags |= UBLK_IO_FLAG_ACTIVE;
		io->cmd = cmd;
		ublk_commit_completion(ub, ub_cmd);
		break;
	default:
		goto out;
	}
	return -EIOCBQUEUED;

 out:
	io_uring_cmd_done(cmd, ret, 0);
	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
			__func__, cmd_op, tag, ret, io->flags);
	return -EIOCBQUEUED;
}

static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.llseek = no_llseek,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
	int size = ublk_queue_cmd_buf_size(ub, q_id);
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	if (ubq->ubq_daemon)
		put_task_struct(ubq->ubq_daemon);
	if (ubq->io_cmd_buf)
		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
}

static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
	void *ptr;
	int size;

	ubq->flags = ub->dev_info.flags;
	ubq->q_id = q_id;
	ubq->q_depth = ub->dev_info.queue_depth;
	size = ublk_queue_cmd_buf_size(ub, q_id);

	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
	if (!ptr)
		return -ENOMEM;

	ubq->io_cmd_buf = ptr;
	ubq->dev = ub;
	return 0;
}

static void ublk_deinit_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int i;

	if (!ub->__queues)
		return;

	for (i = 0; i < nr_queues; i++)
		ublk_deinit_queue(ub, i);
	kfree(ub->__queues);
}

static int ublk_init_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int depth = ub->dev_info.queue_depth;
	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
	int i, ret = -ENOMEM;

	ub->queue_size = ubq_size;
	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
	if (!ub->__queues)
		return ret;

	for (i = 0; i < nr_queues; i++) {
		if (ublk_init_queue(ub, i))
			goto fail;
	}

	init_completion(&ub->completion);
	return 0;

 fail:
	ublk_deinit_queues(ub);
	return ret;
}

static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
{
	int i = idx;
	int err;

	spin_lock(&ublk_idr_lock);
	/* allocate id, if @id >= 0, we're requesting that specific id */
	if (i >= 0) {
		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
	}
	spin_unlock(&ublk_idr_lock);

	if (err >= 0)
		ub->ub_number = err;

	return err;
}

static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}

static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	kfree(ub);
}

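/*
 * ub->cdev_dev's release handler is ublk_cdev_rel() above, so dropping the
 * final reference on the char device frees the tag set, the queues, the
 * device number and the ublk_device itself.
 */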
static int ublk_add_chdev(struct ublk_device *ub)
{
	struct device *dev = &ub->cdev_dev;
	int minor = ub->ub_number;
	int ret;

	dev->parent = ublk_misc.this_device;
	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
	dev->class = ublk_chr_class;
	dev->release = ublk_cdev_rel;
	device_initialize(dev);

	ret = dev_set_name(dev, "ublkc%d", minor);
	if (ret)
		goto fail;

	cdev_init(&ub->cdev, &ublk_ch_fops);
	ret = cdev_device_add(&ub->cdev, dev);
	if (ret)
		goto fail;
	return 0;
 fail:
	put_device(dev);
	return ret;
}

static void ublk_stop_work_fn(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, stop_work);

	ublk_stop_dev(ub);
}

/* align maximum I/O size to PAGE_SIZE */
static void ublk_align_max_io_size(struct ublk_device *ub)
{
	unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift;

	ub->dev_info.rq_max_blocks =
		round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift;
}

static int ublk_add_tag_set(struct ublk_device *ub)
{
	ub->tag_set.ops = &ublk_mq_ops;
	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
	ub->tag_set.numa_node = NUMA_NO_NODE;
	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	ub->tag_set.driver_data = ub;
	return blk_mq_alloc_tag_set(&ub->tag_set);
}

static void ublk_remove(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
	cancel_work_sync(&ub->stop_work);
	cdev_device_del(&ub->cdev, &ub->cdev_dev);
	put_device(&ub->cdev_dev);
}

static struct ublk_device *ublk_get_device_from_id(int idx)
{
	struct ublk_device *ub = NULL;

	if (idx < 0)
		return NULL;

	spin_lock(&ublk_idr_lock);
	ub = idr_find(&ublk_index_idr, idx);
	if (ub)
		ub = ublk_get_device(ub);
	spin_unlock(&ublk_idr_lock);

	return ub;
}

static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	int ublksrv_pid = (int)header->data[0];
	unsigned long dev_blocks = header->data[1];
	struct ublk_device *ub;
	struct gendisk *disk;
	int ret = -EINVAL;

	if (ublksrv_pid <= 0)
		return -EINVAL;

	ub = ublk_get_device_from_id(header->dev_id);
	if (!ub)
		return -EINVAL;

	wait_for_completion_interruptible(&ub->completion);

	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);

	mutex_lock(&ub->mutex);
	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
	    test_bit(UB_STATE_USED, &ub->state)) {
		ret = -EEXIST;
		goto out_unlock;
	}

	/* We may get disk size updated */
	if (dev_blocks)
		ub->dev_info.dev_blocks = dev_blocks;

	disk = blk_mq_alloc_disk(&ub->tag_set, ub);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_unlock;
	}
	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
	disk->fops = &ub_fops;
	disk->private_data = ub;

	blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size);
	blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size);
	blk_queue_io_min(disk->queue, ub->dev_info.block_size);
	blk_queue_max_hw_sectors(disk->queue,
		ub->dev_info.rq_max_blocks << (ub->bs_shift - 9));
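	/*
	 * Discard and write zeroes are forwarded to the daemon like any
	 * other IO, so advertise generous limits here.
	 */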
	disk->queue->limits.discard_granularity = PAGE_SIZE;
	blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9);
	blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9);

	set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));

	ub->dev_info.ublksrv_pid = ublksrv_pid;
	ub->ub_disk = disk;
	get_device(&ub->cdev_dev);
	ret = add_disk(disk);
	if (ret) {
		put_disk(disk);
		goto out_unlock;
	}
	set_bit(UB_STATE_USED, &ub->state);
	ub->dev_info.state = UBLK_S_DEV_LIVE;
out_unlock:
	mutex_unlock(&ub->mutex);
	ublk_put_device(ub);
	return ret;
}

static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_device *ub;
	cpumask_var_t cpumask;
	unsigned long queue;
	unsigned int retlen;
	unsigned int i;
	int ret = -EINVAL;

	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
		return -EINVAL;
	if (header->len & (sizeof(unsigned long)-1))
		return -EINVAL;
	if (!header->addr)
		return -EINVAL;

	ub = ublk_get_device_from_id(header->dev_id);
	if (!ub)
		return -EINVAL;

	queue = header->data[0];
	if (queue >= ub->dev_info.nr_hw_queues)
		goto out_put_device;

	ret = -ENOMEM;
	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
		goto out_put_device;

	for_each_possible_cpu(i) {
		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
			cpumask_set_cpu(i, cpumask);
	}

	ret = -EFAULT;
	retlen = min_t(unsigned short, header->len, cpumask_size());
	if (copy_to_user(argp, cpumask, retlen))
		goto out_free_cpumask;
	if (retlen != header->len &&
	    clear_user(argp + retlen, header->len - retlen))
		goto out_free_cpumask;

	ret = 0;
out_free_cpumask:
	free_cpumask_var(cpumask);
out_put_device:
	ublk_put_device(ub);
	return ret;
}

static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags);
	pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->nr_hw_queues, info->queue_depth,
			info->block_size, info->dev_blocks);
}

static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
			__func__, header->queue_id);
		return -EINVAL;
	}
	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;
	ublk_dump_dev_info(&info);
	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
			__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ret = -ENOMEM;
	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->mm_lock);
	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);

	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * 64bit flags will be copied back to userspace as feature
	 * negotiation result, so have to clear flags which driver
	 * doesn't support yet, then userspace can get correct flags
	 * (features) to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	/* We are not ready to support zero copy */
	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;

	ub->bs_shift = ilog2(ub->dev_info.block_size);
	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_deinit_queues;

	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_free_tag_set;

	/*
	 * Add the char dev so that the ublksrv daemon can be set up.
	 * ublk_add_chdev() will cleanup everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}

static inline bool ublk_idr_freed(int id)
{
	void *ptr;

	spin_lock(&ublk_idr_lock);
	ptr = idr_find(&ublk_index_idr, id);
	spin_unlock(&ublk_idr_lock);

	return ptr == NULL;
}

static int ublk_ctrl_del_dev(int idx)
{
	struct ublk_device *ub;
	int ret;

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ub = ublk_get_device_from_id(idx);
	if (ub) {
		ublk_remove(ub);
		ublk_put_device(ub);
		ret = 0;
	} else {
		ret = -ENODEV;
	}

	/*
	 * Wait until the idr is removed, then it can be reused after
	 * DEL_DEV command is returned.
	 */
	if (!ret)
		wait_event(ublk_idr_wq, ublk_idr_freed(idx));
	mutex_unlock(&ublk_ctl_mutex);

	return ret;
}

static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;

	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}

static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	struct ublk_device *ub;

	ub = ublk_get_device_from_id(header->dev_id);
	if (!ub)
		return -EINVAL;

	ublk_stop_dev(ub);
	cancel_work_sync(&ub->stop_work);

	ublk_put_device(ub);
	return 0;
}

static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_device *ub;
	int ret = 0;

	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
		return -EINVAL;

	ub = ublk_get_device_from_id(header->dev_id);
	if (!ub)
		return -EINVAL;

	if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
		ret = -EFAULT;
	ublk_put_device(ub);

	return ret;
}

static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	int ret = -EINVAL;

	ublk_ctrl_cmd_dump(cmd);

	if (!(issue_flags & IO_URING_F_SQE128))
		goto out;

	ret = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto out;

	ret = -ENODEV;
	switch (cmd->cmd_op) {
	case UBLK_CMD_START_DEV:
		ret = ublk_ctrl_start_dev(cmd);
		break;
	case UBLK_CMD_STOP_DEV:
		ret = ublk_ctrl_stop_dev(cmd);
		break;
	case UBLK_CMD_GET_DEV_INFO:
		ret = ublk_ctrl_get_dev_info(cmd);
		break;
	case UBLK_CMD_ADD_DEV:
		ret = ublk_ctrl_add_dev(cmd);
		break;
	case UBLK_CMD_DEL_DEV:
		ret = ublk_ctrl_del_dev(header->dev_id);
		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
		ret = ublk_ctrl_get_queue_affinity(cmd);
		break;
	default:
		break;
	}
 out:
	io_uring_cmd_done(cmd, ret, 0);
	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
	return -EIOCBQUEUED;
}

static const struct file_operations ublk_ctl_fops = {
	.open		= nonseekable_open,
	.uring_cmd      = ublk_ctrl_uring_cmd,
	.owner		= THIS_MODULE,
	.llseek		= noop_llseek,
};

static struct miscdevice ublk_misc = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "ublk-control",
	.fops		= &ublk_ctl_fops,
};

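/*
 * Module init: register the "ublk-control" misc device first, then allocate
 * the char major region and device class used by the per-device ublkc%d
 * char devices.
 */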
static int __init ublk_init(void)
{
	int ret;

	init_waitqueue_head(&ublk_idr_wq);

	ret = misc_register(&ublk_misc);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
	if (ret)
		goto unregister_mis;

	ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
	if (IS_ERR(ublk_chr_class)) {
		ret = PTR_ERR(ublk_chr_class);
		goto free_chrdev_region;
	}
	return 0;

free_chrdev_region:
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
unregister_mis:
	misc_deregister(&ublk_misc);
	return ret;
}

static void __exit ublk_exit(void)
{
	struct ublk_device *ub;
	int id;

	class_destroy(ublk_chr_class);

	misc_deregister(&ublk_misc);

	idr_for_each_entry(&ublk_index_idr, ub, id)
		ublk_remove(ub);

	idr_destroy(&ublk_index_idr);
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
}

module_init(ublk_init);
module_exit(ublk_exit);

MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
MODULE_LICENSE("GPL");