1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Userspace block device - block device which IO is handled from userspace 4 * 5 * Take full use of io_uring passthrough command for communicating with 6 * ublk userspace daemon(ublksrvd) for handling basic IO request. 7 * 8 * Copyright 2022 Ming Lei <ming.lei@redhat.com> 9 * 10 * (part of code stolen from loop.c) 11 */ 12 #include <linux/module.h> 13 #include <linux/moduleparam.h> 14 #include <linux/sched.h> 15 #include <linux/fs.h> 16 #include <linux/pagemap.h> 17 #include <linux/file.h> 18 #include <linux/stat.h> 19 #include <linux/errno.h> 20 #include <linux/major.h> 21 #include <linux/wait.h> 22 #include <linux/blkdev.h> 23 #include <linux/init.h> 24 #include <linux/swap.h> 25 #include <linux/slab.h> 26 #include <linux/compat.h> 27 #include <linux/mutex.h> 28 #include <linux/writeback.h> 29 #include <linux/completion.h> 30 #include <linux/highmem.h> 31 #include <linux/sysfs.h> 32 #include <linux/miscdevice.h> 33 #include <linux/falloc.h> 34 #include <linux/uio.h> 35 #include <linux/ioprio.h> 36 #include <linux/sched/mm.h> 37 #include <linux/uaccess.h> 38 #include <linux/cdev.h> 39 #include <linux/io_uring/cmd.h> 40 #include <linux/blk-mq.h> 41 #include <linux/delay.h> 42 #include <linux/mm.h> 43 #include <asm/page.h> 44 #include <linux/task_work.h> 45 #include <linux/namei.h> 46 #include <linux/kref.h> 47 #include <uapi/linux/ublk_cmd.h> 48 49 #define UBLK_MINORS (1U << MINORBITS) 50 51 /* private ioctl command mirror */ 52 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) 53 54 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) 55 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) 56 57 /* All UBLK_F_* have to be included into UBLK_F_ALL */ 58 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ 59 | UBLK_F_URING_CMD_COMP_IN_TASK \ 60 | UBLK_F_NEED_GET_DATA \ 61 | UBLK_F_USER_RECOVERY \ 62 | UBLK_F_USER_RECOVERY_REISSUE \ 63 | UBLK_F_UNPRIVILEGED_DEV \ 64 | UBLK_F_CMD_IOCTL_ENCODE \ 65 | UBLK_F_USER_COPY \ 66 | UBLK_F_ZONED \ 67 | UBLK_F_USER_RECOVERY_FAIL_IO) 68 69 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ 70 | UBLK_F_USER_RECOVERY_REISSUE \ 71 | UBLK_F_USER_RECOVERY_FAIL_IO) 72 73 /* All UBLK_PARAM_TYPE_* should be included here */ 74 #define UBLK_PARAM_TYPE_ALL \ 75 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ 76 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ 77 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) 78 79 struct ublk_rq_data { 80 struct kref ref; 81 }; 82 83 struct ublk_uring_cmd_pdu { 84 /* 85 * Store requests in same batch temporarily for queuing them to 86 * daemon context. 87 * 88 * It should have been stored to request payload, but we do want 89 * to avoid extra pre-allocation, and uring_cmd payload is always 90 * free for us 91 */ 92 union { 93 struct request *req; 94 struct request *req_list; 95 }; 96 97 /* 98 * The following two are valid in this cmd whole lifetime, and 99 * setup in ublk uring_cmd handler 100 */ 101 struct ublk_queue *ubq; 102 u16 tag; 103 }; 104 105 /* 106 * io command is active: sqe cmd is received, and its cqe isn't done 107 * 108 * If the flag is set, the io command is owned by ublk driver, and waited 109 * for incoming blk-mq request from the ublk block device. 110 * 111 * If the flag is cleared, the io command will be completed, and owned by 112 * ublk server. 
113 */ 114 #define UBLK_IO_FLAG_ACTIVE 0x01 115 116 /* 117 * IO command is completed via cqe, and it is being handled by ublksrv, and 118 * not committed yet 119 * 120 * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for 121 * cross verification 122 */ 123 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02 124 125 /* 126 * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires 127 * get data buffer address from ublksrv. 128 * 129 * Then, bio data could be copied into this data buffer for a WRITE request 130 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset. 131 */ 132 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 133 134 /* atomic RW with ubq->cancel_lock */ 135 #define UBLK_IO_FLAG_CANCELED 0x80000000 136 137 struct ublk_io { 138 /* userspace buffer address from io cmd */ 139 __u64 addr; 140 unsigned int flags; 141 int res; 142 143 struct io_uring_cmd *cmd; 144 }; 145 146 struct ublk_queue { 147 int q_id; 148 int q_depth; 149 150 unsigned long flags; 151 struct task_struct *ubq_daemon; 152 struct ublksrv_io_desc *io_cmd_buf; 153 154 bool force_abort; 155 bool timeout; 156 bool canceling; 157 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ 158 unsigned short nr_io_ready; /* how many ios setup */ 159 spinlock_t cancel_lock; 160 struct ublk_device *dev; 161 struct ublk_io ios[]; 162 }; 163 164 struct ublk_device { 165 struct gendisk *ub_disk; 166 167 char *__queues; 168 169 unsigned int queue_size; 170 struct ublksrv_ctrl_dev_info dev_info; 171 172 struct blk_mq_tag_set tag_set; 173 174 struct cdev cdev; 175 struct device cdev_dev; 176 177 #define UB_STATE_OPEN 0 178 #define UB_STATE_USED 1 179 #define UB_STATE_DELETED 2 180 unsigned long state; 181 int ub_number; 182 183 struct mutex mutex; 184 185 spinlock_t lock; 186 struct mm_struct *mm; 187 188 struct ublk_params params; 189 190 struct completion completion; 191 unsigned int nr_queues_ready; 192 unsigned int nr_privileged_daemon; 193 }; 194 195 /* header of ublk_params */ 196 struct ublk_params_header { 197 __u32 len; 198 __u32 types; 199 }; 200 201 static void ublk_stop_dev_unlocked(struct ublk_device *ub); 202 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); 203 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 204 struct ublk_queue *ubq, int tag, size_t offset); 205 static inline unsigned int ublk_req_build_flags(struct request *req); 206 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, 207 int tag); 208 static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) 209 { 210 return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); 211 } 212 213 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) 214 { 215 return ub->dev_info.flags & UBLK_F_ZONED; 216 } 217 218 static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) 219 { 220 return ubq->flags & UBLK_F_ZONED; 221 } 222 223 #ifdef CONFIG_BLK_DEV_ZONED 224 225 struct ublk_zoned_report_desc { 226 __u64 sector; 227 __u32 operation; 228 __u32 nr_zones; 229 }; 230 231 static DEFINE_XARRAY(ublk_zoned_report_descs); 232 233 static int ublk_zoned_insert_report_desc(const struct request *req, 234 struct ublk_zoned_report_desc *desc) 235 { 236 return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, 237 desc, GFP_KERNEL); 238 } 239 240 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( 241 const struct request *req) 242 { 243 return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); 244 } 
245 246 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( 247 const struct request *req) 248 { 249 return xa_load(&ublk_zoned_report_descs, (unsigned long)req); 250 } 251 252 static int ublk_get_nr_zones(const struct ublk_device *ub) 253 { 254 const struct ublk_param_basic *p = &ub->params.basic; 255 256 /* Zone size is a power of 2 */ 257 return p->dev_sectors >> ilog2(p->chunk_sectors); 258 } 259 260 static int ublk_revalidate_disk_zones(struct ublk_device *ub) 261 { 262 return blk_revalidate_disk_zones(ub->ub_disk); 263 } 264 265 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) 266 { 267 const struct ublk_param_zoned *p = &ub->params.zoned; 268 int nr_zones; 269 270 if (!ublk_dev_is_zoned(ub)) 271 return -EINVAL; 272 273 if (!p->max_zone_append_sectors) 274 return -EINVAL; 275 276 nr_zones = ublk_get_nr_zones(ub); 277 278 if (p->max_active_zones > nr_zones) 279 return -EINVAL; 280 281 if (p->max_open_zones > nr_zones) 282 return -EINVAL; 283 284 return 0; 285 } 286 287 static void ublk_dev_param_zoned_apply(struct ublk_device *ub) 288 { 289 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); 290 } 291 292 /* Based on virtblk_alloc_report_buffer */ 293 static void *ublk_alloc_report_buffer(struct ublk_device *ublk, 294 unsigned int nr_zones, size_t *buflen) 295 { 296 struct request_queue *q = ublk->ub_disk->queue; 297 size_t bufsize; 298 void *buf; 299 300 nr_zones = min_t(unsigned int, nr_zones, 301 ublk->ub_disk->nr_zones); 302 303 bufsize = nr_zones * sizeof(struct blk_zone); 304 bufsize = 305 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); 306 307 while (bufsize >= sizeof(struct blk_zone)) { 308 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); 309 if (buf) { 310 *buflen = bufsize; 311 return buf; 312 } 313 bufsize >>= 1; 314 } 315 316 *buflen = 0; 317 return NULL; 318 } 319 320 static int ublk_report_zones(struct gendisk *disk, sector_t sector, 321 unsigned int nr_zones, report_zones_cb cb, void *data) 322 { 323 struct ublk_device *ub = disk->private_data; 324 unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; 325 unsigned int first_zone = sector >> ilog2(zone_size_sectors); 326 unsigned int done_zones = 0; 327 unsigned int max_zones_per_request; 328 int ret; 329 struct blk_zone *buffer; 330 size_t buffer_length; 331 332 nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone, 333 nr_zones); 334 335 buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); 336 if (!buffer) 337 return -ENOMEM; 338 339 max_zones_per_request = buffer_length / sizeof(struct blk_zone); 340 341 while (done_zones < nr_zones) { 342 unsigned int remaining_zones = nr_zones - done_zones; 343 unsigned int zones_in_request = 344 min_t(unsigned int, remaining_zones, max_zones_per_request); 345 struct request *req; 346 struct ublk_zoned_report_desc desc; 347 blk_status_t status; 348 349 memset(buffer, 0, buffer_length); 350 351 req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); 352 if (IS_ERR(req)) { 353 ret = PTR_ERR(req); 354 goto out; 355 } 356 357 desc.operation = UBLK_IO_OP_REPORT_ZONES; 358 desc.sector = sector; 359 desc.nr_zones = zones_in_request; 360 ret = ublk_zoned_insert_report_desc(req, &desc); 361 if (ret) 362 goto free_req; 363 364 ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, 365 GFP_KERNEL); 366 if (ret) 367 goto erase_desc; 368 369 status = blk_execute_rq(req, 0); 370 ret = blk_status_to_errno(status); 371 erase_desc: 372 ublk_zoned_erase_report_desc(req); 373 free_req: 374 
blk_mq_free_request(req); 375 if (ret) 376 goto out; 377 378 for (unsigned int i = 0; i < zones_in_request; i++) { 379 struct blk_zone *zone = buffer + i; 380 381 /* A zero length zone means no more zones in this response */ 382 if (!zone->len) 383 break; 384 385 ret = cb(zone, i, data); 386 if (ret) 387 goto out; 388 389 done_zones++; 390 sector += zone_size_sectors; 391 392 } 393 } 394 395 ret = done_zones; 396 397 out: 398 kvfree(buffer); 399 return ret; 400 } 401 402 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, 403 struct request *req) 404 { 405 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 406 struct ublk_io *io = &ubq->ios[req->tag]; 407 struct ublk_zoned_report_desc *desc; 408 u32 ublk_op; 409 410 switch (req_op(req)) { 411 case REQ_OP_ZONE_OPEN: 412 ublk_op = UBLK_IO_OP_ZONE_OPEN; 413 break; 414 case REQ_OP_ZONE_CLOSE: 415 ublk_op = UBLK_IO_OP_ZONE_CLOSE; 416 break; 417 case REQ_OP_ZONE_FINISH: 418 ublk_op = UBLK_IO_OP_ZONE_FINISH; 419 break; 420 case REQ_OP_ZONE_RESET: 421 ublk_op = UBLK_IO_OP_ZONE_RESET; 422 break; 423 case REQ_OP_ZONE_APPEND: 424 ublk_op = UBLK_IO_OP_ZONE_APPEND; 425 break; 426 case REQ_OP_ZONE_RESET_ALL: 427 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; 428 break; 429 case REQ_OP_DRV_IN: 430 desc = ublk_zoned_get_report_desc(req); 431 if (!desc) 432 return BLK_STS_IOERR; 433 ublk_op = desc->operation; 434 switch (ublk_op) { 435 case UBLK_IO_OP_REPORT_ZONES: 436 iod->op_flags = ublk_op | ublk_req_build_flags(req); 437 iod->nr_zones = desc->nr_zones; 438 iod->start_sector = desc->sector; 439 return BLK_STS_OK; 440 default: 441 return BLK_STS_IOERR; 442 } 443 case REQ_OP_DRV_OUT: 444 /* We do not support drv_out */ 445 return BLK_STS_NOTSUPP; 446 default: 447 return BLK_STS_IOERR; 448 } 449 450 iod->op_flags = ublk_op | ublk_req_build_flags(req); 451 iod->nr_sectors = blk_rq_sectors(req); 452 iod->start_sector = blk_rq_pos(req); 453 iod->addr = io->addr; 454 455 return BLK_STS_OK; 456 } 457 458 #else 459 460 #define ublk_report_zones (NULL) 461 462 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) 463 { 464 return -EOPNOTSUPP; 465 } 466 467 static void ublk_dev_param_zoned_apply(struct ublk_device *ub) 468 { 469 } 470 471 static int ublk_revalidate_disk_zones(struct ublk_device *ub) 472 { 473 return 0; 474 } 475 476 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, 477 struct request *req) 478 { 479 return BLK_STS_NOTSUPP; 480 } 481 482 #endif 483 484 static inline void __ublk_complete_rq(struct request *req); 485 static void ublk_complete_rq(struct kref *ref); 486 487 static dev_t ublk_chr_devt; 488 static const struct class ublk_chr_class = { 489 .name = "ublk-char", 490 }; 491 492 static DEFINE_IDR(ublk_index_idr); 493 static DEFINE_SPINLOCK(ublk_idr_lock); 494 static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ 495 496 static DEFINE_MUTEX(ublk_ctl_mutex); 497 498 499 #define UBLK_MAX_UBLKS UBLK_MINORS 500 501 /* 502 * Max unprivileged ublk devices allowed to add 503 * 504 * It can be extended to one per-user limit in future or even controlled 505 * by cgroup. 
506 */ 507 static unsigned int unprivileged_ublks_max = 64; 508 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */ 509 510 static struct miscdevice ublk_misc; 511 512 static inline unsigned ublk_pos_to_hwq(loff_t pos) 513 { 514 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) & 515 UBLK_QID_BITS_MASK; 516 } 517 518 static inline unsigned ublk_pos_to_buf_off(loff_t pos) 519 { 520 return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK; 521 } 522 523 static inline unsigned ublk_pos_to_tag(loff_t pos) 524 { 525 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) & 526 UBLK_TAG_BITS_MASK; 527 } 528 529 static void ublk_dev_param_basic_apply(struct ublk_device *ub) 530 { 531 const struct ublk_param_basic *p = &ub->params.basic; 532 533 if (p->attrs & UBLK_ATTR_READ_ONLY) 534 set_disk_ro(ub->ub_disk, true); 535 536 set_capacity(ub->ub_disk, p->dev_sectors); 537 } 538 539 static int ublk_validate_params(const struct ublk_device *ub) 540 { 541 /* basic param is the only one which must be set */ 542 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { 543 const struct ublk_param_basic *p = &ub->params.basic; 544 545 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) 546 return -EINVAL; 547 548 if (p->logical_bs_shift > p->physical_bs_shift) 549 return -EINVAL; 550 551 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) 552 return -EINVAL; 553 554 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) 555 return -EINVAL; 556 } else 557 return -EINVAL; 558 559 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 560 const struct ublk_param_discard *p = &ub->params.discard; 561 562 /* So far, only support single segment discard */ 563 if (p->max_discard_sectors && p->max_discard_segments != 1) 564 return -EINVAL; 565 566 if (!p->discard_granularity) 567 return -EINVAL; 568 } 569 570 /* dev_t is read-only */ 571 if (ub->params.types & UBLK_PARAM_TYPE_DEVT) 572 return -EINVAL; 573 574 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) 575 return ublk_dev_param_zoned_validate(ub); 576 else if (ublk_dev_is_zoned(ub)) 577 return -EINVAL; 578 579 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) { 580 const struct ublk_param_dma_align *p = &ub->params.dma; 581 582 if (p->alignment >= PAGE_SIZE) 583 return -EINVAL; 584 585 if (!is_power_of_2(p->alignment + 1)) 586 return -EINVAL; 587 } 588 589 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { 590 const struct ublk_param_segment *p = &ub->params.seg; 591 592 if (!is_power_of_2(p->seg_boundary_mask + 1)) 593 return -EINVAL; 594 595 if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE) 596 return -EINVAL; 597 if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE) 598 return -EINVAL; 599 } 600 601 return 0; 602 } 603 604 static void ublk_apply_params(struct ublk_device *ub) 605 { 606 ublk_dev_param_basic_apply(ub); 607 608 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) 609 ublk_dev_param_zoned_apply(ub); 610 } 611 612 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) 613 { 614 return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); 615 } 616 617 static inline bool ublk_need_map_io(const struct ublk_queue *ubq) 618 { 619 return !ublk_support_user_copy(ubq); 620 } 621 622 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) 623 { 624 /* 625 * read()/write() is involved in user copy, so request reference 626 * has to be grabbed 627 */ 628 return ublk_support_user_copy(ubq); 629 } 630 631 static inline void ublk_init_req_ref(const struct ublk_queue *ubq, 632 struct request *req) 633 
{ 634 if (ublk_need_req_ref(ubq)) { 635 struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); 636 637 kref_init(&data->ref); 638 } 639 } 640 641 static inline bool ublk_get_req_ref(const struct ublk_queue *ubq, 642 struct request *req) 643 { 644 if (ublk_need_req_ref(ubq)) { 645 struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); 646 647 return kref_get_unless_zero(&data->ref); 648 } 649 650 return true; 651 } 652 653 static inline void ublk_put_req_ref(const struct ublk_queue *ubq, 654 struct request *req) 655 { 656 if (ublk_need_req_ref(ubq)) { 657 struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); 658 659 kref_put(&data->ref, ublk_complete_rq); 660 } else { 661 __ublk_complete_rq(req); 662 } 663 } 664 665 static inline bool ublk_need_get_data(const struct ublk_queue *ubq) 666 { 667 return ubq->flags & UBLK_F_NEED_GET_DATA; 668 } 669 670 /* Called in slow path only, keep it noinline for trace purpose */ 671 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) 672 { 673 if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) 674 return ub; 675 return NULL; 676 } 677 678 /* Called in slow path only, keep it noinline for trace purpose */ 679 static noinline void ublk_put_device(struct ublk_device *ub) 680 { 681 put_device(&ub->cdev_dev); 682 } 683 684 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, 685 int qid) 686 { 687 return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]); 688 } 689 690 static inline bool ublk_rq_has_data(const struct request *rq) 691 { 692 return bio_has_data(rq->bio); 693 } 694 695 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, 696 int tag) 697 { 698 return &ubq->io_cmd_buf[tag]; 699 } 700 701 static inline struct ublksrv_io_desc * 702 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) 703 { 704 return ublk_get_queue(ub, q_id)->io_cmd_buf; 705 } 706 707 static inline int __ublk_queue_cmd_buf_size(int depth) 708 { 709 return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); 710 } 711 712 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) 713 { 714 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 715 716 return __ublk_queue_cmd_buf_size(ubq->q_depth); 717 } 718 719 static int ublk_max_cmd_buf_size(void) 720 { 721 return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); 722 } 723 724 /* 725 * Should I/O outstanding to the ublk server when it exits be reissued? 726 * If not, outstanding I/O will get errors. 727 */ 728 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub) 729 { 730 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && 731 (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE); 732 } 733 734 /* 735 * Should I/O issued while there is no ublk server queue? If not, I/O 736 * issued while there is no ublk server will get errors. 737 */ 738 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) 739 { 740 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && 741 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO); 742 } 743 744 /* 745 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy 746 * of the device flags for smaller cache footprint - better for fast 747 * paths. 748 */ 749 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) 750 { 751 return (ubq->flags & UBLK_F_USER_RECOVERY) && 752 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO); 753 } 754 755 /* 756 * Should ublk devices be stopped (i.e. no recovery possible) when the 757 * ublk server exits? 
If not, devices can be used again by a future 758 * incarnation of a ublk server via the start_recovery/end_recovery 759 * commands. 760 */ 761 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) 762 { 763 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); 764 } 765 766 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub) 767 { 768 return ub->dev_info.state == UBLK_S_DEV_QUIESCED || 769 ub->dev_info.state == UBLK_S_DEV_FAIL_IO; 770 } 771 772 static void ublk_free_disk(struct gendisk *disk) 773 { 774 struct ublk_device *ub = disk->private_data; 775 776 clear_bit(UB_STATE_USED, &ub->state); 777 ublk_put_device(ub); 778 } 779 780 static void ublk_store_owner_uid_gid(unsigned int *owner_uid, 781 unsigned int *owner_gid) 782 { 783 kuid_t uid; 784 kgid_t gid; 785 786 current_uid_gid(&uid, &gid); 787 788 *owner_uid = from_kuid(&init_user_ns, uid); 789 *owner_gid = from_kgid(&init_user_ns, gid); 790 } 791 792 static int ublk_open(struct gendisk *disk, blk_mode_t mode) 793 { 794 struct ublk_device *ub = disk->private_data; 795 796 if (capable(CAP_SYS_ADMIN)) 797 return 0; 798 799 /* 800 * If it is one unprivileged device, only owner can open 801 * the disk. Otherwise it could be one trap made by one 802 * evil user who grants this disk's privileges to other 803 * users deliberately. 804 * 805 * This way is reasonable too given anyone can create 806 * unprivileged device, and no need other's grant. 807 */ 808 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { 809 unsigned int curr_uid, curr_gid; 810 811 ublk_store_owner_uid_gid(&curr_uid, &curr_gid); 812 813 if (curr_uid != ub->dev_info.owner_uid || curr_gid != 814 ub->dev_info.owner_gid) 815 return -EPERM; 816 } 817 818 return 0; 819 } 820 821 static const struct block_device_operations ub_fops = { 822 .owner = THIS_MODULE, 823 .open = ublk_open, 824 .free_disk = ublk_free_disk, 825 .report_zones = ublk_report_zones, 826 }; 827 828 #define UBLK_MAX_PIN_PAGES 32 829 830 struct ublk_io_iter { 831 struct page *pages[UBLK_MAX_PIN_PAGES]; 832 struct bio *bio; 833 struct bvec_iter iter; 834 }; 835 836 /* return how many pages are copied */ 837 static void ublk_copy_io_pages(struct ublk_io_iter *data, 838 size_t total, size_t pg_off, int dir) 839 { 840 unsigned done = 0; 841 unsigned pg_idx = 0; 842 843 while (done < total) { 844 struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); 845 unsigned int bytes = min3(bv.bv_len, (unsigned)total - done, 846 (unsigned)(PAGE_SIZE - pg_off)); 847 void *bv_buf = bvec_kmap_local(&bv); 848 void *pg_buf = kmap_local_page(data->pages[pg_idx]); 849 850 if (dir == ITER_DEST) 851 memcpy(pg_buf + pg_off, bv_buf, bytes); 852 else 853 memcpy(bv_buf, pg_buf + pg_off, bytes); 854 855 kunmap_local(pg_buf); 856 kunmap_local(bv_buf); 857 858 /* advance page array */ 859 pg_off += bytes; 860 if (pg_off == PAGE_SIZE) { 861 pg_idx += 1; 862 pg_off = 0; 863 } 864 865 done += bytes; 866 867 /* advance bio */ 868 bio_advance_iter_single(data->bio, &data->iter, bytes); 869 if (!data->iter.bi_size) { 870 data->bio = data->bio->bi_next; 871 if (data->bio == NULL) 872 break; 873 data->iter = data->bio->bi_iter; 874 } 875 } 876 } 877 878 static bool ublk_advance_io_iter(const struct request *req, 879 struct ublk_io_iter *iter, unsigned int offset) 880 { 881 struct bio *bio = req->bio; 882 883 for_each_bio(bio) { 884 if (bio->bi_iter.bi_size > offset) { 885 iter->bio = bio; 886 iter->iter = bio->bi_iter; 887 bio_advance_iter(iter->bio, &iter->iter, offset); 888 return true; 889 } 890 offset -= 
bio->bi_iter.bi_size; 891 } 892 return false; 893 } 894 895 /* 896 * Copy data between request pages and io_iter, and 'offset' 897 * is the start point of linear offset of request. 898 */ 899 static size_t ublk_copy_user_pages(const struct request *req, 900 unsigned offset, struct iov_iter *uiter, int dir) 901 { 902 struct ublk_io_iter iter; 903 size_t done = 0; 904 905 if (!ublk_advance_io_iter(req, &iter, offset)) 906 return 0; 907 908 while (iov_iter_count(uiter) && iter.bio) { 909 unsigned nr_pages; 910 ssize_t len; 911 size_t off; 912 int i; 913 914 len = iov_iter_get_pages2(uiter, iter.pages, 915 iov_iter_count(uiter), 916 UBLK_MAX_PIN_PAGES, &off); 917 if (len <= 0) 918 return done; 919 920 ublk_copy_io_pages(&iter, len, off, dir); 921 nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE); 922 for (i = 0; i < nr_pages; i++) { 923 if (dir == ITER_DEST) 924 set_page_dirty(iter.pages[i]); 925 put_page(iter.pages[i]); 926 } 927 done += len; 928 } 929 930 return done; 931 } 932 933 static inline bool ublk_need_map_req(const struct request *req) 934 { 935 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; 936 } 937 938 static inline bool ublk_need_unmap_req(const struct request *req) 939 { 940 return ublk_rq_has_data(req) && 941 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); 942 } 943 944 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, 945 struct ublk_io *io) 946 { 947 const unsigned int rq_bytes = blk_rq_bytes(req); 948 949 if (!ublk_need_map_io(ubq)) 950 return rq_bytes; 951 952 /* 953 * no zero copy, we delay copy WRITE request data into ublksrv 954 * context and the big benefit is that pinning pages in current 955 * context is pretty fast, see ublk_pin_user_pages 956 */ 957 if (ublk_need_map_req(req)) { 958 struct iov_iter iter; 959 const int dir = ITER_DEST; 960 961 import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter); 962 return ublk_copy_user_pages(req, 0, &iter, dir); 963 } 964 return rq_bytes; 965 } 966 967 static int ublk_unmap_io(const struct ublk_queue *ubq, 968 const struct request *req, 969 struct ublk_io *io) 970 { 971 const unsigned int rq_bytes = blk_rq_bytes(req); 972 973 if (!ublk_need_map_io(ubq)) 974 return rq_bytes; 975 976 if (ublk_need_unmap_req(req)) { 977 struct iov_iter iter; 978 const int dir = ITER_SOURCE; 979 980 WARN_ON_ONCE(io->res > rq_bytes); 981 982 import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter); 983 return ublk_copy_user_pages(req, 0, &iter, dir); 984 } 985 return rq_bytes; 986 } 987 988 static inline unsigned int ublk_req_build_flags(struct request *req) 989 { 990 unsigned flags = 0; 991 992 if (req->cmd_flags & REQ_FAILFAST_DEV) 993 flags |= UBLK_IO_F_FAILFAST_DEV; 994 995 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT) 996 flags |= UBLK_IO_F_FAILFAST_TRANSPORT; 997 998 if (req->cmd_flags & REQ_FAILFAST_DRIVER) 999 flags |= UBLK_IO_F_FAILFAST_DRIVER; 1000 1001 if (req->cmd_flags & REQ_META) 1002 flags |= UBLK_IO_F_META; 1003 1004 if (req->cmd_flags & REQ_FUA) 1005 flags |= UBLK_IO_F_FUA; 1006 1007 if (req->cmd_flags & REQ_NOUNMAP) 1008 flags |= UBLK_IO_F_NOUNMAP; 1009 1010 if (req->cmd_flags & REQ_SWAP) 1011 flags |= UBLK_IO_F_SWAP; 1012 1013 return flags; 1014 } 1015 1016 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) 1017 { 1018 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 1019 struct ublk_io *io = &ubq->ios[req->tag]; 1020 enum req_op op = req_op(req); 1021 u32 ublk_op; 1022 1023 if (!ublk_queue_is_zoned(ubq) && 1024 
(op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) 1025 return BLK_STS_IOERR; 1026 1027 switch (req_op(req)) { 1028 case REQ_OP_READ: 1029 ublk_op = UBLK_IO_OP_READ; 1030 break; 1031 case REQ_OP_WRITE: 1032 ublk_op = UBLK_IO_OP_WRITE; 1033 break; 1034 case REQ_OP_FLUSH: 1035 ublk_op = UBLK_IO_OP_FLUSH; 1036 break; 1037 case REQ_OP_DISCARD: 1038 ublk_op = UBLK_IO_OP_DISCARD; 1039 break; 1040 case REQ_OP_WRITE_ZEROES: 1041 ublk_op = UBLK_IO_OP_WRITE_ZEROES; 1042 break; 1043 default: 1044 if (ublk_queue_is_zoned(ubq)) 1045 return ublk_setup_iod_zoned(ubq, req); 1046 return BLK_STS_IOERR; 1047 } 1048 1049 /* need to translate since kernel may change */ 1050 iod->op_flags = ublk_op | ublk_req_build_flags(req); 1051 iod->nr_sectors = blk_rq_sectors(req); 1052 iod->start_sector = blk_rq_pos(req); 1053 iod->addr = io->addr; 1054 1055 return BLK_STS_OK; 1056 } 1057 1058 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( 1059 struct io_uring_cmd *ioucmd) 1060 { 1061 return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); 1062 } 1063 1064 static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) 1065 { 1066 return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING; 1067 } 1068 1069 /* todo: handle partial completion */ 1070 static inline void __ublk_complete_rq(struct request *req) 1071 { 1072 struct ublk_queue *ubq = req->mq_hctx->driver_data; 1073 struct ublk_io *io = &ubq->ios[req->tag]; 1074 unsigned int unmapped_bytes; 1075 blk_status_t res = BLK_STS_OK; 1076 1077 /* failed read IO if nothing is read */ 1078 if (!io->res && req_op(req) == REQ_OP_READ) 1079 io->res = -EIO; 1080 1081 if (io->res < 0) { 1082 res = errno_to_blk_status(io->res); 1083 goto exit; 1084 } 1085 1086 /* 1087 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them 1088 * directly. 1089 * 1090 * Both the two needn't unmap. 1091 */ 1092 if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && 1093 req_op(req) != REQ_OP_DRV_IN) 1094 goto exit; 1095 1096 /* for READ request, writing data in iod->addr to rq buffers */ 1097 unmapped_bytes = ublk_unmap_io(ubq, req, io); 1098 1099 /* 1100 * Extremely impossible since we got data filled in just before 1101 * 1102 * Re-read simply for this unlikely case. 1103 */ 1104 if (unlikely(unmapped_bytes < io->res)) 1105 io->res = unmapped_bytes; 1106 1107 if (blk_update_request(req, BLK_STS_OK, io->res)) 1108 blk_mq_requeue_request(req, true); 1109 else 1110 __blk_mq_end_request(req, BLK_STS_OK); 1111 1112 return; 1113 exit: 1114 blk_mq_end_request(req, res); 1115 } 1116 1117 static void ublk_complete_rq(struct kref *ref) 1118 { 1119 struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, 1120 ref); 1121 struct request *req = blk_mq_rq_from_pdu(data); 1122 1123 __ublk_complete_rq(req); 1124 } 1125 1126 static void ubq_complete_io_cmd(struct ublk_io *io, int res, 1127 unsigned issue_flags) 1128 { 1129 /* mark this cmd owned by ublksrv */ 1130 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; 1131 1132 /* 1133 * clear ACTIVE since we are done with this sqe/cmd slot 1134 * We can only accept io cmd in case of being not active. 1135 */ 1136 io->flags &= ~UBLK_IO_FLAG_ACTIVE; 1137 1138 /* tell ublksrv one io request is coming */ 1139 io_uring_cmd_done(io->cmd, res, 0, issue_flags); 1140 } 1141 1142 #define UBLK_REQUEUE_DELAY_MS 3 1143 1144 static inline void __ublk_abort_rq(struct ublk_queue *ubq, 1145 struct request *rq) 1146 { 1147 /* We cannot process this rq so just requeue it. 
*/ 1148 if (ublk_nosrv_dev_should_queue_io(ubq->dev)) 1149 blk_mq_requeue_request(rq, false); 1150 else 1151 blk_mq_end_request(rq, BLK_STS_IOERR); 1152 } 1153 1154 static void ublk_dispatch_req(struct ublk_queue *ubq, 1155 struct request *req, 1156 unsigned int issue_flags) 1157 { 1158 int tag = req->tag; 1159 struct ublk_io *io = &ubq->ios[tag]; 1160 unsigned int mapped_bytes; 1161 1162 pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", 1163 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, 1164 ublk_get_iod(ubq, req->tag)->addr); 1165 1166 /* 1167 * Task is exiting if either: 1168 * 1169 * (1) current != ubq_daemon. 1170 * io_uring_cmd_complete_in_task() tries to run task_work 1171 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. 1172 * 1173 * (2) current->flags & PF_EXITING. 1174 */ 1175 if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { 1176 __ublk_abort_rq(ubq, req); 1177 return; 1178 } 1179 1180 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { 1181 /* 1182 * We have not handled UBLK_IO_NEED_GET_DATA command yet, 1183 * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv 1184 * and notify it. 1185 */ 1186 if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) { 1187 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; 1188 pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", 1189 __func__, io->cmd->cmd_op, ubq->q_id, 1190 req->tag, io->flags); 1191 ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags); 1192 return; 1193 } 1194 /* 1195 * We have handled UBLK_IO_NEED_GET_DATA command, 1196 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just 1197 * do the copy work. 1198 */ 1199 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; 1200 /* update iod->addr because ublksrv may have passed a new io buffer */ 1201 ublk_get_iod(ubq, req->tag)->addr = io->addr; 1202 pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n", 1203 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, 1204 ublk_get_iod(ubq, req->tag)->addr); 1205 } 1206 1207 mapped_bytes = ublk_map_io(ubq, req, io); 1208 1209 /* partially mapped, update io descriptor */ 1210 if (unlikely(mapped_bytes != blk_rq_bytes(req))) { 1211 /* 1212 * Nothing mapped, retry until we succeed. 1213 * 1214 * We may never succeed in mapping any bytes here because 1215 * of OOM. TODO: reserve one buffer with single page pinned 1216 * for providing forward progress guarantee. 
1217 */ 1218 if (unlikely(!mapped_bytes)) { 1219 blk_mq_requeue_request(req, false); 1220 blk_mq_delay_kick_requeue_list(req->q, 1221 UBLK_REQUEUE_DELAY_MS); 1222 return; 1223 } 1224 1225 ublk_get_iod(ubq, req->tag)->nr_sectors = 1226 mapped_bytes >> 9; 1227 } 1228 1229 ublk_init_req_ref(ubq, req); 1230 ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); 1231 } 1232 1233 static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, 1234 unsigned int issue_flags) 1235 { 1236 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1237 struct ublk_queue *ubq = pdu->ubq; 1238 1239 ublk_dispatch_req(ubq, pdu->req, issue_flags); 1240 } 1241 1242 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) 1243 { 1244 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; 1245 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1246 1247 pdu->req = rq; 1248 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); 1249 } 1250 1251 static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, 1252 unsigned int issue_flags) 1253 { 1254 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1255 struct request *rq = pdu->req_list; 1256 struct ublk_queue *ubq = pdu->ubq; 1257 struct request *next; 1258 1259 do { 1260 next = rq->rq_next; 1261 rq->rq_next = NULL; 1262 ublk_dispatch_req(ubq, rq, issue_flags); 1263 rq = next; 1264 } while (rq); 1265 } 1266 1267 static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) 1268 { 1269 struct request *rq = rq_list_peek(l); 1270 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; 1271 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1272 1273 pdu->req_list = rq; 1274 rq_list_init(l); 1275 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); 1276 } 1277 1278 static enum blk_eh_timer_return ublk_timeout(struct request *rq) 1279 { 1280 struct ublk_queue *ubq = rq->mq_hctx->driver_data; 1281 1282 if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { 1283 if (!ubq->timeout) { 1284 send_sig(SIGKILL, ubq->ubq_daemon, 0); 1285 ubq->timeout = true; 1286 } 1287 1288 return BLK_EH_DONE; 1289 } 1290 1291 return BLK_EH_RESET_TIMER; 1292 } 1293 1294 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, 1295 bool check_cancel) 1296 { 1297 blk_status_t res; 1298 1299 if (unlikely(ubq->fail_io)) 1300 return BLK_STS_TARGET; 1301 1302 /* With recovery feature enabled, force_abort is set in 1303 * ublk_stop_dev() before calling del_gendisk(). We have to 1304 * abort all requeued and new rqs here to let del_gendisk() 1305 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task() 1306 * to avoid UAF on io_uring ctx. 1307 * 1308 * Note: force_abort is guaranteed to be seen because it is set 1309 * before request queue is unqiuesced. 
1310 */ 1311 if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) 1312 return BLK_STS_IOERR; 1313 1314 if (check_cancel && unlikely(ubq->canceling)) 1315 return BLK_STS_IOERR; 1316 1317 /* fill iod to slot in io cmd buffer */ 1318 res = ublk_setup_iod(ubq, rq); 1319 if (unlikely(res != BLK_STS_OK)) 1320 return BLK_STS_IOERR; 1321 1322 blk_mq_start_request(rq); 1323 return BLK_STS_OK; 1324 } 1325 1326 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, 1327 const struct blk_mq_queue_data *bd) 1328 { 1329 struct ublk_queue *ubq = hctx->driver_data; 1330 struct request *rq = bd->rq; 1331 blk_status_t res; 1332 1333 res = ublk_prep_req(ubq, rq, false); 1334 if (res != BLK_STS_OK) 1335 return res; 1336 1337 /* 1338 * ->canceling has to be handled after ->force_abort and ->fail_io 1339 * is dealt with, otherwise this request may not be failed in case 1340 * of recovery, and cause hang when deleting disk 1341 */ 1342 if (unlikely(ubq->canceling)) { 1343 __ublk_abort_rq(ubq, rq); 1344 return BLK_STS_OK; 1345 } 1346 1347 ublk_queue_cmd(ubq, rq); 1348 return BLK_STS_OK; 1349 } 1350 1351 static void ublk_queue_rqs(struct rq_list *rqlist) 1352 { 1353 struct rq_list requeue_list = { }; 1354 struct rq_list submit_list = { }; 1355 struct ublk_queue *ubq = NULL; 1356 struct request *req; 1357 1358 while ((req = rq_list_pop(rqlist))) { 1359 struct ublk_queue *this_q = req->mq_hctx->driver_data; 1360 1361 if (ubq && ubq != this_q && !rq_list_empty(&submit_list)) 1362 ublk_queue_cmd_list(ubq, &submit_list); 1363 ubq = this_q; 1364 1365 if (ublk_prep_req(ubq, req, true) == BLK_STS_OK) 1366 rq_list_add_tail(&submit_list, req); 1367 else 1368 rq_list_add_tail(&requeue_list, req); 1369 } 1370 1371 if (ubq && !rq_list_empty(&submit_list)) 1372 ublk_queue_cmd_list(ubq, &submit_list); 1373 *rqlist = requeue_list; 1374 } 1375 1376 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, 1377 unsigned int hctx_idx) 1378 { 1379 struct ublk_device *ub = driver_data; 1380 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num); 1381 1382 hctx->driver_data = ubq; 1383 return 0; 1384 } 1385 1386 static const struct blk_mq_ops ublk_mq_ops = { 1387 .queue_rq = ublk_queue_rq, 1388 .queue_rqs = ublk_queue_rqs, 1389 .init_hctx = ublk_init_hctx, 1390 .timeout = ublk_timeout, 1391 }; 1392 1393 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) 1394 { 1395 int i; 1396 1397 /* All old ioucmds have to be completed */ 1398 ubq->nr_io_ready = 0; 1399 1400 /* 1401 * old daemon is PF_EXITING, put it now 1402 * 1403 * It could be NULL in case of closing one quisced device. 
1404 */ 1405 if (ubq->ubq_daemon) 1406 put_task_struct(ubq->ubq_daemon); 1407 /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ 1408 ubq->ubq_daemon = NULL; 1409 ubq->timeout = false; 1410 1411 for (i = 0; i < ubq->q_depth; i++) { 1412 struct ublk_io *io = &ubq->ios[i]; 1413 1414 /* 1415 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch 1416 * io->cmd 1417 */ 1418 io->flags &= UBLK_IO_FLAG_CANCELED; 1419 io->cmd = NULL; 1420 io->addr = 0; 1421 } 1422 } 1423 1424 static int ublk_ch_open(struct inode *inode, struct file *filp) 1425 { 1426 struct ublk_device *ub = container_of(inode->i_cdev, 1427 struct ublk_device, cdev); 1428 1429 if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) 1430 return -EBUSY; 1431 filp->private_data = ub; 1432 return 0; 1433 } 1434 1435 static void ublk_reset_ch_dev(struct ublk_device *ub) 1436 { 1437 int i; 1438 1439 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1440 ublk_queue_reinit(ub, ublk_get_queue(ub, i)); 1441 1442 /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ 1443 ub->mm = NULL; 1444 ub->nr_queues_ready = 0; 1445 ub->nr_privileged_daemon = 0; 1446 } 1447 1448 static struct gendisk *ublk_get_disk(struct ublk_device *ub) 1449 { 1450 struct gendisk *disk; 1451 1452 spin_lock(&ub->lock); 1453 disk = ub->ub_disk; 1454 if (disk) 1455 get_device(disk_to_dev(disk)); 1456 spin_unlock(&ub->lock); 1457 1458 return disk; 1459 } 1460 1461 static void ublk_put_disk(struct gendisk *disk) 1462 { 1463 if (disk) 1464 put_device(disk_to_dev(disk)); 1465 } 1466 1467 static int ublk_ch_release(struct inode *inode, struct file *filp) 1468 { 1469 struct ublk_device *ub = filp->private_data; 1470 struct gendisk *disk; 1471 int i; 1472 1473 /* 1474 * disk isn't attached yet, either device isn't live, or it has 1475 * been removed already, so we needn't to do anything 1476 */ 1477 disk = ublk_get_disk(ub); 1478 if (!disk) 1479 goto out; 1480 1481 /* 1482 * All uring_cmd are done now, so abort any request outstanding to 1483 * the ublk server 1484 * 1485 * This can be done in lockless way because ublk server has been 1486 * gone 1487 * 1488 * More importantly, we have to provide forward progress guarantee 1489 * without holding ub->mutex, otherwise control task grabbing 1490 * ub->mutex triggers deadlock 1491 * 1492 * All requests may be inflight, so ->canceling may not be set, set 1493 * it now. 1494 */ 1495 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 1496 struct ublk_queue *ubq = ublk_get_queue(ub, i); 1497 1498 ubq->canceling = true; 1499 ublk_abort_queue(ub, ubq); 1500 } 1501 blk_mq_kick_requeue_list(disk->queue); 1502 1503 /* 1504 * All infligh requests have been completed or requeued and any new 1505 * request will be failed or requeued via `->canceling` now, so it is 1506 * fine to grab ub->mutex now. 1507 */ 1508 mutex_lock(&ub->mutex); 1509 1510 /* double check after grabbing lock */ 1511 if (!ub->ub_disk) 1512 goto unlock; 1513 1514 /* 1515 * Transition the device to the nosrv state. What exactly this 1516 * means depends on the recovery flags 1517 */ 1518 blk_mq_quiesce_queue(disk->queue); 1519 if (ublk_nosrv_should_stop_dev(ub)) { 1520 /* 1521 * Allow any pending/future I/O to pass through quickly 1522 * with an error. 
This is needed because del_gendisk 1523 * waits for all pending I/O to complete 1524 */ 1525 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1526 ublk_get_queue(ub, i)->force_abort = true; 1527 blk_mq_unquiesce_queue(disk->queue); 1528 1529 ublk_stop_dev_unlocked(ub); 1530 } else { 1531 if (ublk_nosrv_dev_should_queue_io(ub)) { 1532 /* ->canceling is set and all requests are aborted */ 1533 ub->dev_info.state = UBLK_S_DEV_QUIESCED; 1534 } else { 1535 ub->dev_info.state = UBLK_S_DEV_FAIL_IO; 1536 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1537 ublk_get_queue(ub, i)->fail_io = true; 1538 } 1539 blk_mq_unquiesce_queue(disk->queue); 1540 } 1541 unlock: 1542 mutex_unlock(&ub->mutex); 1543 ublk_put_disk(disk); 1544 1545 /* all uring_cmd has been done now, reset device & ubq */ 1546 ublk_reset_ch_dev(ub); 1547 out: 1548 clear_bit(UB_STATE_OPEN, &ub->state); 1549 return 0; 1550 } 1551 1552 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */ 1553 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) 1554 { 1555 struct ublk_device *ub = filp->private_data; 1556 size_t sz = vma->vm_end - vma->vm_start; 1557 unsigned max_sz = ublk_max_cmd_buf_size(); 1558 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; 1559 int q_id, ret = 0; 1560 1561 spin_lock(&ub->lock); 1562 if (!ub->mm) 1563 ub->mm = current->mm; 1564 if (current->mm != ub->mm) 1565 ret = -EINVAL; 1566 spin_unlock(&ub->lock); 1567 1568 if (ret) 1569 return ret; 1570 1571 if (vma->vm_flags & VM_WRITE) 1572 return -EPERM; 1573 1574 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz; 1575 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end) 1576 return -EINVAL; 1577 1578 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz; 1579 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n", 1580 __func__, q_id, current->pid, vma->vm_start, 1581 phys_off, (unsigned long)sz); 1582 1583 if (sz != ublk_queue_cmd_buf_size(ub, q_id)) 1584 return -EINVAL; 1585 1586 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; 1587 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 1588 } 1589 1590 static void ublk_commit_completion(struct ublk_device *ub, 1591 const struct ublksrv_io_cmd *ub_cmd) 1592 { 1593 u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; 1594 struct ublk_queue *ubq = ublk_get_queue(ub, qid); 1595 struct ublk_io *io = &ubq->ios[tag]; 1596 struct request *req; 1597 1598 /* now this cmd slot is owned by nbd driver */ 1599 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; 1600 io->res = ub_cmd->result; 1601 1602 /* find the io request and complete */ 1603 req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); 1604 if (WARN_ON_ONCE(unlikely(!req))) 1605 return; 1606 1607 if (req_op(req) == REQ_OP_ZONE_APPEND) 1608 req->__sector = ub_cmd->zone_append_lba; 1609 1610 if (likely(!blk_should_fake_timeout(req->q))) 1611 ublk_put_req_ref(ubq, req); 1612 } 1613 1614 static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, 1615 struct request *req) 1616 { 1617 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); 1618 1619 if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) 1620 blk_mq_requeue_request(req, false); 1621 else { 1622 io->res = -EIO; 1623 __ublk_complete_rq(req); 1624 } 1625 } 1626 1627 /* 1628 * Called from ublk char device release handler, when any uring_cmd is 1629 * done, meantime request queue is "quiesced" since all inflight requests 1630 * can't be completed because ublk server is dead. 
1631 * 1632 * So no one can hold our request IO reference any more, simply ignore the 1633 * reference, and complete the request immediately 1634 */ 1635 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) 1636 { 1637 int i; 1638 1639 for (i = 0; i < ubq->q_depth; i++) { 1640 struct ublk_io *io = &ubq->ios[i]; 1641 1642 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { 1643 struct request *rq; 1644 1645 /* 1646 * Either we fail the request or ublk_rq_task_work_cb 1647 * will do it 1648 */ 1649 rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); 1650 if (rq && blk_mq_request_started(rq)) 1651 __ublk_fail_req(ubq, io, rq); 1652 } 1653 } 1654 } 1655 1656 /* Must be called when queue is frozen */ 1657 static void ublk_mark_queue_canceling(struct ublk_queue *ubq) 1658 { 1659 spin_lock(&ubq->cancel_lock); 1660 if (!ubq->canceling) 1661 ubq->canceling = true; 1662 spin_unlock(&ubq->cancel_lock); 1663 } 1664 1665 static void ublk_start_cancel(struct ublk_queue *ubq) 1666 { 1667 struct ublk_device *ub = ubq->dev; 1668 struct gendisk *disk = ublk_get_disk(ub); 1669 1670 /* Our disk has been dead */ 1671 if (!disk) 1672 return; 1673 /* 1674 * Now we are serialized with ublk_queue_rq() 1675 * 1676 * Make sure that ubq->canceling is set when queue is frozen, 1677 * because ublk_queue_rq() has to rely on this flag for avoiding to 1678 * touch completed uring_cmd 1679 */ 1680 blk_mq_quiesce_queue(disk->queue); 1681 ublk_mark_queue_canceling(ubq); 1682 blk_mq_unquiesce_queue(disk->queue); 1683 ublk_put_disk(disk); 1684 } 1685 1686 static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, 1687 unsigned int issue_flags) 1688 { 1689 bool done; 1690 1691 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) 1692 return; 1693 1694 spin_lock(&ubq->cancel_lock); 1695 done = !!(io->flags & UBLK_IO_FLAG_CANCELED); 1696 if (!done) 1697 io->flags |= UBLK_IO_FLAG_CANCELED; 1698 spin_unlock(&ubq->cancel_lock); 1699 1700 if (!done) 1701 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags); 1702 } 1703 1704 /* 1705 * The ublk char device won't be closed when calling cancel fn, so both 1706 * ublk device and queue are guaranteed to be live 1707 * 1708 * Two-stage cancel: 1709 * 1710 * - make every active uring_cmd done in ->cancel_fn() 1711 * 1712 * - aborting inflight ublk IO requests in ublk char device release handler, 1713 * which depends on 1st stage because device can only be closed iff all 1714 * uring_cmd are done 1715 * 1716 * Do _not_ try to acquire ub->mutex before all inflight requests are 1717 * aborted, otherwise deadlock may be caused. 
1718 */ 1719 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, 1720 unsigned int issue_flags) 1721 { 1722 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1723 struct ublk_queue *ubq = pdu->ubq; 1724 struct task_struct *task; 1725 struct ublk_io *io; 1726 1727 if (WARN_ON_ONCE(!ubq)) 1728 return; 1729 1730 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth)) 1731 return; 1732 1733 task = io_uring_cmd_get_task(cmd); 1734 if (WARN_ON_ONCE(task && task != ubq->ubq_daemon)) 1735 return; 1736 1737 if (!ubq->canceling) 1738 ublk_start_cancel(ubq); 1739 1740 io = &ubq->ios[pdu->tag]; 1741 WARN_ON_ONCE(io->cmd != cmd); 1742 ublk_cancel_cmd(ubq, io, issue_flags); 1743 } 1744 1745 static inline bool ublk_queue_ready(struct ublk_queue *ubq) 1746 { 1747 return ubq->nr_io_ready == ubq->q_depth; 1748 } 1749 1750 static void ublk_cancel_queue(struct ublk_queue *ubq) 1751 { 1752 int i; 1753 1754 for (i = 0; i < ubq->q_depth; i++) 1755 ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED); 1756 } 1757 1758 /* Cancel all pending commands, must be called after del_gendisk() returns */ 1759 static void ublk_cancel_dev(struct ublk_device *ub) 1760 { 1761 int i; 1762 1763 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1764 ublk_cancel_queue(ublk_get_queue(ub, i)); 1765 } 1766 1767 static bool ublk_check_inflight_rq(struct request *rq, void *data) 1768 { 1769 bool *idle = data; 1770 1771 if (blk_mq_request_started(rq)) { 1772 *idle = false; 1773 return false; 1774 } 1775 return true; 1776 } 1777 1778 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) 1779 { 1780 bool idle; 1781 1782 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); 1783 while (true) { 1784 idle = true; 1785 blk_mq_tagset_busy_iter(&ub->tag_set, 1786 ublk_check_inflight_rq, &idle); 1787 if (idle) 1788 break; 1789 msleep(UBLK_REQUEUE_DELAY_MS); 1790 } 1791 } 1792 1793 static void ublk_force_abort_dev(struct ublk_device *ub) 1794 { 1795 int i; 1796 1797 pr_devel("%s: force abort ub: dev_id %d state %s\n", 1798 __func__, ub->dev_info.dev_id, 1799 ub->dev_info.state == UBLK_S_DEV_LIVE ? 
1800 "LIVE" : "QUIESCED"); 1801 blk_mq_quiesce_queue(ub->ub_disk->queue); 1802 if (ub->dev_info.state == UBLK_S_DEV_LIVE) 1803 ublk_wait_tagset_rqs_idle(ub); 1804 1805 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1806 ublk_get_queue(ub, i)->force_abort = true; 1807 blk_mq_unquiesce_queue(ub->ub_disk->queue); 1808 /* We may have requeued some rqs in ublk_quiesce_queue() */ 1809 blk_mq_kick_requeue_list(ub->ub_disk->queue); 1810 } 1811 1812 static struct gendisk *ublk_detach_disk(struct ublk_device *ub) 1813 { 1814 struct gendisk *disk; 1815 1816 /* Sync with ublk_abort_queue() by holding the lock */ 1817 spin_lock(&ub->lock); 1818 disk = ub->ub_disk; 1819 ub->dev_info.state = UBLK_S_DEV_DEAD; 1820 ub->dev_info.ublksrv_pid = -1; 1821 ub->ub_disk = NULL; 1822 spin_unlock(&ub->lock); 1823 1824 return disk; 1825 } 1826 1827 static void ublk_stop_dev_unlocked(struct ublk_device *ub) 1828 __must_hold(&ub->mutex) 1829 { 1830 struct gendisk *disk; 1831 1832 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 1833 return; 1834 1835 if (ublk_nosrv_dev_should_queue_io(ub)) 1836 ublk_force_abort_dev(ub); 1837 del_gendisk(ub->ub_disk); 1838 disk = ublk_detach_disk(ub); 1839 put_disk(disk); 1840 } 1841 1842 static void ublk_stop_dev(struct ublk_device *ub) 1843 { 1844 mutex_lock(&ub->mutex); 1845 ublk_stop_dev_unlocked(ub); 1846 mutex_unlock(&ub->mutex); 1847 ublk_cancel_dev(ub); 1848 } 1849 1850 /* reset ublk io_uring queue & io flags */ 1851 static void ublk_reset_io_flags(struct ublk_device *ub) 1852 { 1853 int i, j; 1854 1855 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 1856 struct ublk_queue *ubq = ublk_get_queue(ub, i); 1857 1858 /* UBLK_IO_FLAG_CANCELED can be cleared now */ 1859 spin_lock(&ubq->cancel_lock); 1860 for (j = 0; j < ubq->q_depth; j++) 1861 ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; 1862 spin_unlock(&ubq->cancel_lock); 1863 ubq->canceling = false; 1864 ubq->fail_io = false; 1865 } 1866 } 1867 1868 /* device can only be started after all IOs are ready */ 1869 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) 1870 __must_hold(&ub->mutex) 1871 { 1872 ubq->nr_io_ready++; 1873 if (ublk_queue_ready(ubq)) { 1874 ubq->ubq_daemon = current; 1875 get_task_struct(ubq->ubq_daemon); 1876 ub->nr_queues_ready++; 1877 1878 if (capable(CAP_SYS_ADMIN)) 1879 ub->nr_privileged_daemon++; 1880 } 1881 1882 if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { 1883 /* now we are ready for handling ublk io request */ 1884 ublk_reset_io_flags(ub); 1885 complete_all(&ub->completion); 1886 } 1887 } 1888 1889 static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, 1890 int tag) 1891 { 1892 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 1893 struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); 1894 1895 ublk_queue_cmd(ubq, req); 1896 } 1897 1898 static inline int ublk_check_cmd_op(u32 cmd_op) 1899 { 1900 u32 ioc_type = _IOC_TYPE(cmd_op); 1901 1902 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') 1903 return -EOPNOTSUPP; 1904 1905 if (ioc_type != 'u' && ioc_type != 0) 1906 return -EOPNOTSUPP; 1907 1908 return 0; 1909 } 1910 1911 static inline void ublk_fill_io_cmd(struct ublk_io *io, 1912 struct io_uring_cmd *cmd, unsigned long buf_addr) 1913 { 1914 io->cmd = cmd; 1915 io->flags |= UBLK_IO_FLAG_ACTIVE; 1916 io->addr = buf_addr; 1917 } 1918 1919 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, 1920 unsigned int issue_flags, 1921 struct ublk_queue *ubq, unsigned int tag) 1922 { 1923 struct ublk_uring_cmd_pdu *pdu = 
ublk_get_uring_cmd_pdu(cmd); 1924 1925 /* 1926 * Safe to refer to @ubq since ublk_queue won't be died until its 1927 * commands are completed 1928 */ 1929 pdu->ubq = ubq; 1930 pdu->tag = tag; 1931 io_uring_cmd_mark_cancelable(cmd, issue_flags); 1932 } 1933 1934 static void ublk_io_release(void *priv) 1935 { 1936 struct request *rq = priv; 1937 struct ublk_queue *ubq = rq->mq_hctx->driver_data; 1938 1939 ublk_put_req_ref(ubq, rq); 1940 } 1941 1942 static int ublk_register_io_buf(struct io_uring_cmd *cmd, 1943 struct ublk_queue *ubq, unsigned int tag, 1944 unsigned int index, unsigned int issue_flags) 1945 { 1946 struct ublk_device *ub = cmd->file->private_data; 1947 struct request *req; 1948 int ret; 1949 1950 req = __ublk_check_and_get_req(ub, ubq, tag, 0); 1951 if (!req) 1952 return -EINVAL; 1953 1954 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, 1955 issue_flags); 1956 if (ret) { 1957 ublk_put_req_ref(ubq, req); 1958 return ret; 1959 } 1960 1961 return 0; 1962 } 1963 1964 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, 1965 unsigned int index, unsigned int issue_flags) 1966 { 1967 return io_buffer_unregister_bvec(cmd, index, issue_flags); 1968 } 1969 1970 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, 1971 struct ublk_io *io, __u64 buf_addr) 1972 { 1973 struct ublk_device *ub = ubq->dev; 1974 int ret = 0; 1975 1976 /* 1977 * When handling FETCH command for setting up ublk uring queue, 1978 * ub->mutex is the innermost lock, and we won't block for handling 1979 * FETCH, so it is fine even for IO_URING_F_NONBLOCK. 1980 */ 1981 mutex_lock(&ub->mutex); 1982 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ 1983 if (ublk_queue_ready(ubq)) { 1984 ret = -EBUSY; 1985 goto out; 1986 } 1987 1988 /* allow each command to be FETCHed at most once */ 1989 if (io->flags & UBLK_IO_FLAG_ACTIVE) { 1990 ret = -EINVAL; 1991 goto out; 1992 } 1993 1994 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); 1995 1996 if (ublk_need_map_io(ubq)) { 1997 /* 1998 * FETCH_RQ has to provide IO buffer if NEED GET 1999 * DATA is not enabled 2000 */ 2001 if (!buf_addr && !ublk_need_get_data(ubq)) 2002 goto out; 2003 } else if (buf_addr) { 2004 /* User copy requires addr to be unset */ 2005 ret = -EINVAL; 2006 goto out; 2007 } 2008 2009 ublk_fill_io_cmd(io, cmd, buf_addr); 2010 ublk_mark_io_ready(ub, ubq); 2011 out: 2012 mutex_unlock(&ub->mutex); 2013 return ret; 2014 } 2015 2016 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, 2017 unsigned int issue_flags, 2018 const struct ublksrv_io_cmd *ub_cmd) 2019 { 2020 struct ublk_device *ub = cmd->file->private_data; 2021 struct ublk_queue *ubq; 2022 struct ublk_io *io; 2023 u32 cmd_op = cmd->cmd_op; 2024 unsigned tag = ub_cmd->tag; 2025 int ret = -EINVAL; 2026 struct request *req; 2027 2028 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", 2029 __func__, cmd->cmd_op, ub_cmd->q_id, tag, 2030 ub_cmd->result); 2031 2032 if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) 2033 goto out; 2034 2035 ubq = ublk_get_queue(ub, ub_cmd->q_id); 2036 if (!ubq || ub_cmd->q_id != ubq->q_id) 2037 goto out; 2038 2039 if (ubq->ubq_daemon && ubq->ubq_daemon != current) 2040 goto out; 2041 2042 if (tag >= ubq->q_depth) 2043 goto out; 2044 2045 io = &ubq->ios[tag]; 2046 2047 /* there is pending io cmd, something must be wrong */ 2048 if (io->flags & UBLK_IO_FLAG_ACTIVE) { 2049 ret = -EBUSY; 2050 goto out; 2051 } 2052 2053 /* 2054 * ensure that the user issues UBLK_IO_NEED_GET_DATA 2055 * iff the driver have set the 
UBLK_IO_FLAG_NEED_GET_DATA. 2056 */ 2057 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) 2058 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) 2059 goto out; 2060 2061 ret = ublk_check_cmd_op(cmd_op); 2062 if (ret) 2063 goto out; 2064 2065 ret = -EINVAL; 2066 switch (_IOC_NR(cmd_op)) { 2067 case UBLK_IO_REGISTER_IO_BUF: 2068 return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); 2069 case UBLK_IO_UNREGISTER_IO_BUF: 2070 return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); 2071 case UBLK_IO_FETCH_REQ: 2072 ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); 2073 if (ret) 2074 goto out; 2075 break; 2076 case UBLK_IO_COMMIT_AND_FETCH_REQ: 2077 req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); 2078 2079 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) 2080 goto out; 2081 2082 if (ublk_need_map_io(ubq)) { 2083 /* 2084 * COMMIT_AND_FETCH_REQ has to provide IO buffer if 2085 * NEED GET DATA is not enabled or it is Read IO. 2086 */ 2087 if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || 2088 req_op(req) == REQ_OP_READ)) 2089 goto out; 2090 } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { 2091 /* 2092 * User copy requires addr to be unset when command is 2093 * not zone append 2094 */ 2095 ret = -EINVAL; 2096 goto out; 2097 } 2098 2099 ublk_fill_io_cmd(io, cmd, ub_cmd->addr); 2100 ublk_commit_completion(ub, ub_cmd); 2101 break; 2102 case UBLK_IO_NEED_GET_DATA: 2103 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) 2104 goto out; 2105 ublk_fill_io_cmd(io, cmd, ub_cmd->addr); 2106 ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); 2107 break; 2108 default: 2109 goto out; 2110 } 2111 ublk_prep_cancel(cmd, issue_flags, ubq, tag); 2112 return -EIOCBQUEUED; 2113 2114 out: 2115 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", 2116 __func__, cmd_op, tag, ret, io->flags); 2117 return ret; 2118 } 2119 2120 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 2121 struct ublk_queue *ubq, int tag, size_t offset) 2122 { 2123 struct request *req; 2124 2125 if (!ublk_need_req_ref(ubq)) 2126 return NULL; 2127 2128 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); 2129 if (!req) 2130 return NULL; 2131 2132 if (!ublk_get_req_ref(ubq, req)) 2133 return NULL; 2134 2135 if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) 2136 goto fail_put; 2137 2138 if (!ublk_rq_has_data(req)) 2139 goto fail_put; 2140 2141 if (offset > blk_rq_bytes(req)) 2142 goto fail_put; 2143 2144 return req; 2145 fail_put: 2146 ublk_put_req_ref(ubq, req); 2147 return NULL; 2148 } 2149 2150 static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, 2151 unsigned int issue_flags) 2152 { 2153 /* 2154 * Not necessary for async retry, but let's keep it simple and always 2155 * copy the values to avoid any potential reuse. 
2156 */ 2157 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); 2158 const struct ublksrv_io_cmd ub_cmd = { 2159 .q_id = READ_ONCE(ub_src->q_id), 2160 .tag = READ_ONCE(ub_src->tag), 2161 .result = READ_ONCE(ub_src->result), 2162 .addr = READ_ONCE(ub_src->addr) 2163 }; 2164 2165 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); 2166 2167 return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); 2168 } 2169 2170 static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, 2171 unsigned int issue_flags) 2172 { 2173 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); 2174 2175 if (ret != -EIOCBQUEUED) 2176 io_uring_cmd_done(cmd, ret, 0, issue_flags); 2177 } 2178 2179 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 2180 { 2181 if (unlikely(issue_flags & IO_URING_F_CANCEL)) { 2182 ublk_uring_cmd_cancel_fn(cmd, issue_flags); 2183 return 0; 2184 } 2185 2186 /* well-implemented server won't run into unlocked */ 2187 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { 2188 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb); 2189 return -EIOCBQUEUED; 2190 } 2191 2192 return ublk_ch_uring_cmd_local(cmd, issue_flags); 2193 } 2194 2195 static inline bool ublk_check_ubuf_dir(const struct request *req, 2196 int ubuf_dir) 2197 { 2198 /* copy ubuf to request pages */ 2199 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && 2200 ubuf_dir == ITER_SOURCE) 2201 return true; 2202 2203 /* copy request pages to ubuf */ 2204 if ((req_op(req) == REQ_OP_WRITE || 2205 req_op(req) == REQ_OP_ZONE_APPEND) && 2206 ubuf_dir == ITER_DEST) 2207 return true; 2208 2209 return false; 2210 } 2211 2212 static struct request *ublk_check_and_get_req(struct kiocb *iocb, 2213 struct iov_iter *iter, size_t *off, int dir) 2214 { 2215 struct ublk_device *ub = iocb->ki_filp->private_data; 2216 struct ublk_queue *ubq; 2217 struct request *req; 2218 size_t buf_off; 2219 u16 tag, q_id; 2220 2221 if (!ub) 2222 return ERR_PTR(-EACCES); 2223 2224 if (!user_backed_iter(iter)) 2225 return ERR_PTR(-EACCES); 2226 2227 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 2228 return ERR_PTR(-EACCES); 2229 2230 tag = ublk_pos_to_tag(iocb->ki_pos); 2231 q_id = ublk_pos_to_hwq(iocb->ki_pos); 2232 buf_off = ublk_pos_to_buf_off(iocb->ki_pos); 2233 2234 if (q_id >= ub->dev_info.nr_hw_queues) 2235 return ERR_PTR(-EINVAL); 2236 2237 ubq = ublk_get_queue(ub, q_id); 2238 if (!ubq) 2239 return ERR_PTR(-EINVAL); 2240 2241 if (tag >= ubq->q_depth) 2242 return ERR_PTR(-EINVAL); 2243 2244 req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); 2245 if (!req) 2246 return ERR_PTR(-EINVAL); 2247 2248 if (!req->mq_hctx || !req->mq_hctx->driver_data) 2249 goto fail; 2250 2251 if (!ublk_check_ubuf_dir(req, dir)) 2252 goto fail; 2253 2254 *off = buf_off; 2255 return req; 2256 fail: 2257 ublk_put_req_ref(ubq, req); 2258 return ERR_PTR(-EACCES); 2259 } 2260 2261 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) 2262 { 2263 struct ublk_queue *ubq; 2264 struct request *req; 2265 size_t buf_off; 2266 size_t ret; 2267 2268 req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST); 2269 if (IS_ERR(req)) 2270 return PTR_ERR(req); 2271 2272 ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); 2273 ubq = req->mq_hctx->driver_data; 2274 ublk_put_req_ref(ubq, req); 2275 2276 return ret; 2277 } 2278 2279 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) 2280 { 2281 struct ublk_queue *ubq; 2282 struct request *req; 2283 size_t buf_off; 2284 size_t ret; 2285 2286 req = 
ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE); 2287 if (IS_ERR(req)) 2288 return PTR_ERR(req); 2289 2290 ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); 2291 ubq = req->mq_hctx->driver_data; 2292 ublk_put_req_ref(ubq, req); 2293 2294 return ret; 2295 } 2296 2297 static const struct file_operations ublk_ch_fops = { 2298 .owner = THIS_MODULE, 2299 .open = ublk_ch_open, 2300 .release = ublk_ch_release, 2301 .read_iter = ublk_ch_read_iter, 2302 .write_iter = ublk_ch_write_iter, 2303 .uring_cmd = ublk_ch_uring_cmd, 2304 .mmap = ublk_ch_mmap, 2305 }; 2306 2307 static void ublk_deinit_queue(struct ublk_device *ub, int q_id) 2308 { 2309 int size = ublk_queue_cmd_buf_size(ub, q_id); 2310 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 2311 2312 if (ubq->ubq_daemon) 2313 put_task_struct(ubq->ubq_daemon); 2314 if (ubq->io_cmd_buf) 2315 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); 2316 } 2317 2318 static int ublk_init_queue(struct ublk_device *ub, int q_id) 2319 { 2320 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 2321 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; 2322 void *ptr; 2323 int size; 2324 2325 spin_lock_init(&ubq->cancel_lock); 2326 ubq->flags = ub->dev_info.flags; 2327 ubq->q_id = q_id; 2328 ubq->q_depth = ub->dev_info.queue_depth; 2329 size = ublk_queue_cmd_buf_size(ub, q_id); 2330 2331 ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); 2332 if (!ptr) 2333 return -ENOMEM; 2334 2335 ubq->io_cmd_buf = ptr; 2336 ubq->dev = ub; 2337 return 0; 2338 } 2339 2340 static void ublk_deinit_queues(struct ublk_device *ub) 2341 { 2342 int nr_queues = ub->dev_info.nr_hw_queues; 2343 int i; 2344 2345 if (!ub->__queues) 2346 return; 2347 2348 for (i = 0; i < nr_queues; i++) 2349 ublk_deinit_queue(ub, i); 2350 kfree(ub->__queues); 2351 } 2352 2353 static int ublk_init_queues(struct ublk_device *ub) 2354 { 2355 int nr_queues = ub->dev_info.nr_hw_queues; 2356 int depth = ub->dev_info.queue_depth; 2357 int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); 2358 int i, ret = -ENOMEM; 2359 2360 ub->queue_size = ubq_size; 2361 ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); 2362 if (!ub->__queues) 2363 return ret; 2364 2365 for (i = 0; i < nr_queues; i++) { 2366 if (ublk_init_queue(ub, i)) 2367 goto fail; 2368 } 2369 2370 init_completion(&ub->completion); 2371 return 0; 2372 2373 fail: 2374 ublk_deinit_queues(ub); 2375 return ret; 2376 } 2377 2378 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx) 2379 { 2380 int i = idx; 2381 int err; 2382 2383 spin_lock(&ublk_idr_lock); 2384 /* allocate id, if @id >= 0, we're requesting that specific id */ 2385 if (i >= 0) { 2386 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT); 2387 if (err == -ENOSPC) 2388 err = -EEXIST; 2389 } else { 2390 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS, 2391 GFP_NOWAIT); 2392 } 2393 spin_unlock(&ublk_idr_lock); 2394 2395 if (err >= 0) 2396 ub->ub_number = err; 2397 2398 return err; 2399 } 2400 2401 static void ublk_free_dev_number(struct ublk_device *ub) 2402 { 2403 spin_lock(&ublk_idr_lock); 2404 idr_remove(&ublk_index_idr, ub->ub_number); 2405 wake_up_all(&ublk_idr_wq); 2406 spin_unlock(&ublk_idr_lock); 2407 } 2408 2409 static void ublk_cdev_rel(struct device *dev) 2410 { 2411 struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev); 2412 2413 blk_mq_free_tag_set(&ub->tag_set); 2414 ublk_deinit_queues(ub); 2415 ublk_free_dev_number(ub); 2416 mutex_destroy(&ub->mutex); 2417 kfree(ub); 2418 } 2419 2420 static 
int ublk_add_chdev(struct ublk_device *ub) 2421 { 2422 struct device *dev = &ub->cdev_dev; 2423 int minor = ub->ub_number; 2424 int ret; 2425 2426 dev->parent = ublk_misc.this_device; 2427 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); 2428 dev->class = &ublk_chr_class; 2429 dev->release = ublk_cdev_rel; 2430 device_initialize(dev); 2431 2432 ret = dev_set_name(dev, "ublkc%d", minor); 2433 if (ret) 2434 goto fail; 2435 2436 cdev_init(&ub->cdev, &ublk_ch_fops); 2437 ret = cdev_device_add(&ub->cdev, dev); 2438 if (ret) 2439 goto fail; 2440 2441 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) 2442 unprivileged_ublks_added++; 2443 return 0; 2444 fail: 2445 put_device(dev); 2446 return ret; 2447 } 2448 2449 /* align max io buffer size with PAGE_SIZE */ 2450 static void ublk_align_max_io_size(struct ublk_device *ub) 2451 { 2452 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes; 2453 2454 ub->dev_info.max_io_buf_bytes = 2455 round_down(max_io_bytes, PAGE_SIZE); 2456 } 2457 2458 static int ublk_add_tag_set(struct ublk_device *ub) 2459 { 2460 ub->tag_set.ops = &ublk_mq_ops; 2461 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; 2462 ub->tag_set.queue_depth = ub->dev_info.queue_depth; 2463 ub->tag_set.numa_node = NUMA_NO_NODE; 2464 ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); 2465 ub->tag_set.driver_data = ub; 2466 return blk_mq_alloc_tag_set(&ub->tag_set); 2467 } 2468 2469 static void ublk_remove(struct ublk_device *ub) 2470 { 2471 bool unprivileged; 2472 2473 ublk_stop_dev(ub); 2474 cdev_device_del(&ub->cdev, &ub->cdev_dev); 2475 unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; 2476 ublk_put_device(ub); 2477 2478 if (unprivileged) 2479 unprivileged_ublks_added--; 2480 } 2481 2482 static struct ublk_device *ublk_get_device_from_id(int idx) 2483 { 2484 struct ublk_device *ub = NULL; 2485 2486 if (idx < 0) 2487 return NULL; 2488 2489 spin_lock(&ublk_idr_lock); 2490 ub = idr_find(&ublk_index_idr, idx); 2491 if (ub) 2492 ub = ublk_get_device(ub); 2493 spin_unlock(&ublk_idr_lock); 2494 2495 return ub; 2496 } 2497 2498 static int ublk_ctrl_start_dev(struct ublk_device *ub, 2499 const struct ublksrv_ctrl_cmd *header) 2500 { 2501 const struct ublk_param_basic *p = &ub->params.basic; 2502 int ublksrv_pid = (int)header->data[0]; 2503 struct queue_limits lim = { 2504 .logical_block_size = 1 << p->logical_bs_shift, 2505 .physical_block_size = 1 << p->physical_bs_shift, 2506 .io_min = 1 << p->io_min_shift, 2507 .io_opt = 1 << p->io_opt_shift, 2508 .max_hw_sectors = p->max_sectors, 2509 .chunk_sectors = p->chunk_sectors, 2510 .virt_boundary_mask = p->virt_boundary_mask, 2511 .max_segments = USHRT_MAX, 2512 .max_segment_size = UINT_MAX, 2513 .dma_alignment = 3, 2514 }; 2515 struct gendisk *disk; 2516 int ret = -EINVAL; 2517 2518 if (ublksrv_pid <= 0) 2519 return -EINVAL; 2520 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) 2521 return -EINVAL; 2522 2523 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 2524 const struct ublk_param_discard *pd = &ub->params.discard; 2525 2526 lim.discard_alignment = pd->discard_alignment; 2527 lim.discard_granularity = pd->discard_granularity; 2528 lim.max_hw_discard_sectors = pd->max_discard_sectors; 2529 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; 2530 lim.max_discard_segments = pd->max_discard_segments; 2531 } 2532 2533 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { 2534 const struct ublk_param_zoned *p = &ub->params.zoned; 2535 2536 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) 2537 return -EOPNOTSUPP; 2538 2539 lim.features |= 
BLK_FEAT_ZONED; 2540 lim.max_active_zones = p->max_active_zones; 2541 lim.max_open_zones = p->max_open_zones; 2542 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; 2543 } 2544 2545 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { 2546 lim.features |= BLK_FEAT_WRITE_CACHE; 2547 if (ub->params.basic.attrs & UBLK_ATTR_FUA) 2548 lim.features |= BLK_FEAT_FUA; 2549 } 2550 2551 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) 2552 lim.features |= BLK_FEAT_ROTATIONAL; 2553 2554 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) 2555 lim.dma_alignment = ub->params.dma.alignment; 2556 2557 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { 2558 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask; 2559 lim.max_segment_size = ub->params.seg.max_segment_size; 2560 lim.max_segments = ub->params.seg.max_segments; 2561 } 2562 2563 if (wait_for_completion_interruptible(&ub->completion) != 0) 2564 return -EINTR; 2565 2566 mutex_lock(&ub->mutex); 2567 if (ub->dev_info.state == UBLK_S_DEV_LIVE || 2568 test_bit(UB_STATE_USED, &ub->state)) { 2569 ret = -EEXIST; 2570 goto out_unlock; 2571 } 2572 2573 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); 2574 if (IS_ERR(disk)) { 2575 ret = PTR_ERR(disk); 2576 goto out_unlock; 2577 } 2578 sprintf(disk->disk_name, "ublkb%d", ub->ub_number); 2579 disk->fops = &ub_fops; 2580 disk->private_data = ub; 2581 2582 ub->dev_info.ublksrv_pid = ublksrv_pid; 2583 ub->ub_disk = disk; 2584 2585 ublk_apply_params(ub); 2586 2587 /* don't probe partitions if any one ubq daemon is un-trusted */ 2588 if (ub->nr_privileged_daemon != ub->nr_queues_ready) 2589 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 2590 2591 ublk_get_device(ub); 2592 ub->dev_info.state = UBLK_S_DEV_LIVE; 2593 2594 if (ublk_dev_is_zoned(ub)) { 2595 ret = ublk_revalidate_disk_zones(ub); 2596 if (ret) 2597 goto out_put_cdev; 2598 } 2599 2600 ret = add_disk(disk); 2601 if (ret) 2602 goto out_put_cdev; 2603 2604 set_bit(UB_STATE_USED, &ub->state); 2605 2606 out_put_cdev: 2607 if (ret) { 2608 ublk_detach_disk(ub); 2609 ublk_put_device(ub); 2610 } 2611 if (ret) 2612 put_disk(disk); 2613 out_unlock: 2614 mutex_unlock(&ub->mutex); 2615 return ret; 2616 } 2617 2618 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, 2619 const struct ublksrv_ctrl_cmd *header) 2620 { 2621 void __user *argp = (void __user *)(unsigned long)header->addr; 2622 cpumask_var_t cpumask; 2623 unsigned long queue; 2624 unsigned int retlen; 2625 unsigned int i; 2626 int ret; 2627 2628 if (header->len * BITS_PER_BYTE < nr_cpu_ids) 2629 return -EINVAL; 2630 if (header->len & (sizeof(unsigned long)-1)) 2631 return -EINVAL; 2632 if (!header->addr) 2633 return -EINVAL; 2634 2635 queue = header->data[0]; 2636 if (queue >= ub->dev_info.nr_hw_queues) 2637 return -EINVAL; 2638 2639 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 2640 return -ENOMEM; 2641 2642 for_each_possible_cpu(i) { 2643 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) 2644 cpumask_set_cpu(i, cpumask); 2645 } 2646 2647 ret = -EFAULT; 2648 retlen = min_t(unsigned short, header->len, cpumask_size()); 2649 if (copy_to_user(argp, cpumask, retlen)) 2650 goto out_free_cpumask; 2651 if (retlen != header->len && 2652 clear_user(argp + retlen, header->len - retlen)) 2653 goto out_free_cpumask; 2654 2655 ret = 0; 2656 out_free_cpumask: 2657 free_cpumask_var(cpumask); 2658 return ret; 2659 } 2660 2661 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) 2662 { 2663 pr_devel("%s: dev id %d flags %llx\n", __func__, 2664 info->dev_id, 
info->flags); 2665 pr_devel("\t nr_hw_queues %d queue_depth %d\n", 2666 info->nr_hw_queues, info->queue_depth); 2667 } 2668 2669 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) 2670 { 2671 void __user *argp = (void __user *)(unsigned long)header->addr; 2672 struct ublksrv_ctrl_dev_info info; 2673 struct ublk_device *ub; 2674 int ret = -EINVAL; 2675 2676 if (header->len < sizeof(info) || !header->addr) 2677 return -EINVAL; 2678 if (header->queue_id != (u16)-1) { 2679 pr_warn("%s: queue_id is wrong %x\n", 2680 __func__, header->queue_id); 2681 return -EINVAL; 2682 } 2683 2684 if (copy_from_user(&info, argp, sizeof(info))) 2685 return -EFAULT; 2686 2687 if (capable(CAP_SYS_ADMIN)) 2688 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; 2689 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) 2690 return -EPERM; 2691 2692 /* forbid nonsense combinations of recovery flags */ 2693 switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) { 2694 case 0: 2695 case UBLK_F_USER_RECOVERY: 2696 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE): 2697 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO): 2698 break; 2699 default: 2700 pr_warn("%s: invalid recovery flags %llx\n", __func__, 2701 info.flags & UBLK_F_ALL_RECOVERY_FLAGS); 2702 return -EINVAL; 2703 } 2704 2705 /* 2706 * an unprivileged device can't be trusted, but RECOVERY and 2707 * RECOVERY_REISSUE may still hang error handling, so recovery 2708 * features can't be supported for unprivileged ublk now 2709 * 2710 * TODO: provide forward progress for the RECOVERY handler, so that 2711 * unprivileged devices can benefit from it 2712 */ 2713 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { 2714 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | 2715 UBLK_F_USER_RECOVERY); 2716 2717 /* 2718 * For USER_COPY, we depend on userspace to fill the request 2719 * buffer via pwrite() to the ublk char device, which can't be 2720 * used for an unprivileged device 2721 */ 2722 if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) 2723 return -EINVAL; 2724 } 2725 2726 /* the created device is always owned by the current user */ 2727 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); 2728 2729 if (header->dev_id != info.dev_id) { 2730 pr_warn("%s: dev id not match %u %u\n", 2731 __func__, header->dev_id, info.dev_id); 2732 return -EINVAL; 2733 } 2734 2735 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) { 2736 pr_warn("%s: dev id is too large. Max supported is %d\n", 2737 __func__, UBLK_MAX_UBLKS - 1); 2738 return -EINVAL; 2739 } 2740 2741 ublk_dump_dev_info(&info); 2742 2743 ret = mutex_lock_killable(&ublk_ctl_mutex); 2744 if (ret) 2745 return ret; 2746 2747 ret = -EACCES; 2748 if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) && 2749 unprivileged_ublks_added >= unprivileged_ublks_max) 2750 goto out_unlock; 2751 2752 ret = -ENOMEM; 2753 ub = kzalloc(sizeof(*ub), GFP_KERNEL); 2754 if (!ub) 2755 goto out_unlock; 2756 mutex_init(&ub->mutex); 2757 spin_lock_init(&ub->lock); 2758 2759 ret = ublk_alloc_dev_number(ub, header->dev_id); 2760 if (ret < 0) 2761 goto out_free_ub; 2762 2763 memcpy(&ub->dev_info, &info, sizeof(info)); 2764 2765 /* update device id */ 2766 ub->dev_info.dev_id = ub->ub_number; 2767 2768 /* 2769 * The 64-bit flags will be copied back to userspace as the feature 2770 * negotiation result, so clear the flags which the driver doesn't 2771 * support yet; userspace then gets the correct flags 2772 * (features) to handle.
2773 */ 2774 ub->dev_info.flags &= UBLK_F_ALL; 2775 2776 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | 2777 UBLK_F_URING_CMD_COMP_IN_TASK; 2778 2779 /* GET_DATA isn't needed any more with USER_COPY */ 2780 if (ublk_dev_is_user_copy(ub)) 2781 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; 2782 2783 /* Zoned storage support requires user copy feature */ 2784 if (ublk_dev_is_zoned(ub) && 2785 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { 2786 ret = -EINVAL; 2787 goto out_free_dev_number; 2788 } 2789 2790 ub->dev_info.nr_hw_queues = min_t(unsigned int, 2791 ub->dev_info.nr_hw_queues, nr_cpu_ids); 2792 ublk_align_max_io_size(ub); 2793 2794 ret = ublk_init_queues(ub); 2795 if (ret) 2796 goto out_free_dev_number; 2797 2798 ret = ublk_add_tag_set(ub); 2799 if (ret) 2800 goto out_deinit_queues; 2801 2802 ret = -EFAULT; 2803 if (copy_to_user(argp, &ub->dev_info, sizeof(info))) 2804 goto out_free_tag_set; 2805 2806 /* 2807 * Add the char dev so that ublksrv daemon can be setup. 2808 * ublk_add_chdev() will cleanup everything if it fails. 2809 */ 2810 ret = ublk_add_chdev(ub); 2811 goto out_unlock; 2812 2813 out_free_tag_set: 2814 blk_mq_free_tag_set(&ub->tag_set); 2815 out_deinit_queues: 2816 ublk_deinit_queues(ub); 2817 out_free_dev_number: 2818 ublk_free_dev_number(ub); 2819 out_free_ub: 2820 mutex_destroy(&ub->mutex); 2821 kfree(ub); 2822 out_unlock: 2823 mutex_unlock(&ublk_ctl_mutex); 2824 return ret; 2825 } 2826 2827 static inline bool ublk_idr_freed(int id) 2828 { 2829 void *ptr; 2830 2831 spin_lock(&ublk_idr_lock); 2832 ptr = idr_find(&ublk_index_idr, id); 2833 spin_unlock(&ublk_idr_lock); 2834 2835 return ptr == NULL; 2836 } 2837 2838 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) 2839 { 2840 struct ublk_device *ub = *p_ub; 2841 int idx = ub->ub_number; 2842 int ret; 2843 2844 ret = mutex_lock_killable(&ublk_ctl_mutex); 2845 if (ret) 2846 return ret; 2847 2848 if (!test_bit(UB_STATE_DELETED, &ub->state)) { 2849 ublk_remove(ub); 2850 set_bit(UB_STATE_DELETED, &ub->state); 2851 } 2852 2853 /* Mark the reference as consumed */ 2854 *p_ub = NULL; 2855 ublk_put_device(ub); 2856 mutex_unlock(&ublk_ctl_mutex); 2857 2858 /* 2859 * Wait until the idr is removed, then it can be reused after 2860 * DEL_DEV command is returned. 
* 2862 * If we return because of a user interrupt, a future delete command 2863 * may come: 2864 * 2865 * - the device number isn't freed, so this device won't or needn't 2866 * be deleted again, since UB_STATE_DELETED is set, and the device 2867 * will be released after the last reference is dropped 2868 * 2869 * - the device number is freed already, so we will not find this 2870 * device via ublk_get_device_from_id() 2871 */ 2872 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) 2873 return -EINTR; 2874 return 0; 2875 } 2876 2877 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) 2878 { 2879 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2880 2881 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", 2882 __func__, cmd->cmd_op, header->dev_id, header->queue_id, 2883 header->data[0], header->addr, header->len); 2884 } 2885 2886 static int ublk_ctrl_stop_dev(struct ublk_device *ub) 2887 { 2888 ublk_stop_dev(ub); 2889 return 0; 2890 } 2891 2892 static int ublk_ctrl_get_dev_info(struct ublk_device *ub, 2893 const struct ublksrv_ctrl_cmd *header) 2894 { 2895 void __user *argp = (void __user *)(unsigned long)header->addr; 2896 2897 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) 2898 return -EINVAL; 2899 2900 if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) 2901 return -EFAULT; 2902 2903 return 0; 2904 } 2905 2906 /* TYPE_DEVT is read-only, so fill it up before returning to userspace */ 2907 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) 2908 { 2909 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); 2910 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); 2911 2912 if (ub->ub_disk) { 2913 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); 2914 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); 2915 } else { 2916 ub->params.devt.disk_major = 0; 2917 ub->params.devt.disk_minor = 0; 2918 } 2919 ub->params.types |= UBLK_PARAM_TYPE_DEVT; 2920 } 2921 2922 static int ublk_ctrl_get_params(struct ublk_device *ub, 2923 const struct ublksrv_ctrl_cmd *header) 2924 { 2925 void __user *argp = (void __user *)(unsigned long)header->addr; 2926 struct ublk_params_header ph; 2927 int ret; 2928 2929 if (header->len <= sizeof(ph) || !header->addr) 2930 return -EINVAL; 2931 2932 if (copy_from_user(&ph, argp, sizeof(ph))) 2933 return -EFAULT; 2934 2935 if (ph.len > header->len || !ph.len) 2936 return -EINVAL; 2937 2938 if (ph.len > sizeof(struct ublk_params)) 2939 ph.len = sizeof(struct ublk_params); 2940 2941 mutex_lock(&ub->mutex); 2942 ublk_ctrl_fill_params_devt(ub); 2943 if (copy_to_user(argp, &ub->params, ph.len)) 2944 ret = -EFAULT; 2945 else 2946 ret = 0; 2947 mutex_unlock(&ub->mutex); 2948 2949 return ret; 2950 } 2951 2952 static int ublk_ctrl_set_params(struct ublk_device *ub, 2953 const struct ublksrv_ctrl_cmd *header) 2954 { 2955 void __user *argp = (void __user *)(unsigned long)header->addr; 2956 struct ublk_params_header ph; 2957 int ret = -EFAULT; 2958 2959 if (header->len <= sizeof(ph) || !header->addr) 2960 return -EINVAL; 2961 2962 if (copy_from_user(&ph, argp, sizeof(ph))) 2963 return -EFAULT; 2964 2965 if (ph.len > header->len || !ph.len || !ph.types) 2966 return -EINVAL; 2967 2968 if (ph.len > sizeof(struct ublk_params)) 2969 ph.len = sizeof(struct ublk_params); 2970 2971 mutex_lock(&ub->mutex); 2972 if (test_bit(UB_STATE_USED, &ub->state)) { 2973 /* 2974 * Parameters can only be changed when the device hasn't 2975 * been started yet 2976 */ 2977 ret =
-EACCES; 2978 } else if (copy_from_user(&ub->params, argp, ph.len)) { 2979 ret = -EFAULT; 2980 } else { 2981 /* clear all we don't support yet */ 2982 ub->params.types &= UBLK_PARAM_TYPE_ALL; 2983 ret = ublk_validate_params(ub); 2984 if (ret) 2985 ub->params.types = 0; 2986 } 2987 mutex_unlock(&ub->mutex); 2988 2989 return ret; 2990 } 2991 2992 static int ublk_ctrl_start_recovery(struct ublk_device *ub, 2993 const struct ublksrv_ctrl_cmd *header) 2994 { 2995 int ret = -EINVAL; 2996 2997 mutex_lock(&ub->mutex); 2998 if (ublk_nosrv_should_stop_dev(ub)) 2999 goto out_unlock; 3000 /* 3001 * START_RECOVERY is only allowed after: 3002 * 3003 * (1) UB_STATE_OPEN is not set, which means the dying process has exited 3004 * and the related io_uring ctx is freed, so the file struct of /dev/ublkcX is 3005 * released. 3006 * 3007 * and one of the following holds 3008 * 3009 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: 3010 * (a) has quiesced the request queue 3011 * (b) has requeued every inflight rq whose io_flags is ACTIVE 3012 * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE 3013 * (d) has completed/canceled all ioucmds owned by the dying process 3014 * 3015 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not 3016 * quiesced, but all I/O is being immediately errored 3017 */ 3018 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) { 3019 ret = -EBUSY; 3020 goto out_unlock; 3021 } 3022 pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); 3023 init_completion(&ub->completion); 3024 ret = 0; 3025 out_unlock: 3026 mutex_unlock(&ub->mutex); 3027 return ret; 3028 } 3029 3030 static int ublk_ctrl_end_recovery(struct ublk_device *ub, 3031 const struct ublksrv_ctrl_cmd *header) 3032 { 3033 int ublksrv_pid = (int)header->data[0]; 3034 int ret = -EINVAL; 3035 3036 pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", 3037 __func__, ub->dev_info.nr_hw_queues, header->dev_id); 3038 /* wait until the new ubq_daemons have sent all FETCH_REQ commands */ 3039 if (wait_for_completion_interruptible(&ub->completion)) 3040 return -EINTR; 3041 3042 pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", 3043 __func__, ub->dev_info.nr_hw_queues, header->dev_id); 3044 3045 mutex_lock(&ub->mutex); 3046 if (ublk_nosrv_should_stop_dev(ub)) 3047 goto out_unlock; 3048 3049 if (!ublk_dev_in_recoverable_state(ub)) { 3050 ret = -EBUSY; 3051 goto out_unlock; 3052 } 3053 ub->dev_info.ublksrv_pid = ublksrv_pid; 3054 ub->dev_info.state = UBLK_S_DEV_LIVE; 3055 pr_devel("%s: new ublksrv_pid %d, dev id %d\n", 3056 __func__, ublksrv_pid, header->dev_id); 3057 blk_mq_kick_requeue_list(ub->ub_disk->queue); 3058 ret = 0; 3059 out_unlock: 3060 mutex_unlock(&ub->mutex); 3061 return ret; 3062 } 3063 3064 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header) 3065 { 3066 void __user *argp = (void __user *)(unsigned long)header->addr; 3067 u64 features = UBLK_F_ALL; 3068 3069 if (header->len != UBLK_FEATURES_LEN || !header->addr) 3070 return -EINVAL; 3071 3072 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) 3073 return -EFAULT; 3074 3075 return 0; 3076 } 3077 3078 /* 3079 * All control commands are sent via /dev/ublk-control, so we have to check 3080 * the destination device's permission 3081 */ 3082 static int ublk_char_dev_permission(struct ublk_device *ub, 3083 const char *dev_path, int mask) 3084 { 3085 int err; 3086 struct path path; 3087 struct kstat stat; 3088 3089 err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
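/*
 * The remaining checks: make sure the resolved path really refers to this
 * device's own character device node (same dev_t, S_ISCHR), then apply the
 * normal inode permission check with the mask picked by the caller
 * (MAY_READ for query commands, MAY_READ | MAY_WRITE for state-changing
 * ones).
 */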
3090 if (err) 3091 return err; 3092 3093 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); 3094 if (err) 3095 goto exit; 3096 3097 err = -EPERM; 3098 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) 3099 goto exit; 3100 3101 err = inode_permission(&nop_mnt_idmap, 3102 d_backing_inode(path.dentry), mask); 3103 exit: 3104 path_put(&path); 3105 return err; 3106 } 3107 3108 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, 3109 struct io_uring_cmd *cmd) 3110 { 3111 struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe); 3112 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; 3113 void __user *argp = (void __user *)(unsigned long)header->addr; 3114 char *dev_path = NULL; 3115 int ret = 0; 3116 int mask; 3117 3118 if (!unprivileged) { 3119 if (!capable(CAP_SYS_ADMIN)) 3120 return -EPERM; 3121 /* 3122 * The newly added UBLK_CMD_GET_DEV_INFO2 command includes 3123 * char_dev_path in its payload too, since userspace may not 3124 * know whether the specified device was created in unprivileged 3125 * mode. 3126 */ 3127 if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) 3128 return 0; 3129 } 3130 3131 /* 3132 * The user has to provide the char device path for unprivileged ublk 3133 * 3134 * header->addr always points to the dev path buffer, and 3135 * header->dev_path_len records the length of the dev path buffer. 3136 */ 3137 if (!header->dev_path_len || header->dev_path_len > PATH_MAX) 3138 return -EINVAL; 3139 3140 if (header->len < header->dev_path_len) 3141 return -EINVAL; 3142 3143 dev_path = memdup_user_nul(argp, header->dev_path_len); 3144 if (IS_ERR(dev_path)) 3145 return PTR_ERR(dev_path); 3146 3147 ret = -EINVAL; 3148 switch (_IOC_NR(cmd->cmd_op)) { 3149 case UBLK_CMD_GET_DEV_INFO: 3150 case UBLK_CMD_GET_DEV_INFO2: 3151 case UBLK_CMD_GET_QUEUE_AFFINITY: 3152 case UBLK_CMD_GET_PARAMS: 3153 case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): 3154 mask = MAY_READ; 3155 break; 3156 case UBLK_CMD_START_DEV: 3157 case UBLK_CMD_STOP_DEV: 3158 case UBLK_CMD_ADD_DEV: 3159 case UBLK_CMD_DEL_DEV: 3160 case UBLK_CMD_SET_PARAMS: 3161 case UBLK_CMD_START_USER_RECOVERY: 3162 case UBLK_CMD_END_USER_RECOVERY: 3163 mask = MAY_READ | MAY_WRITE; 3164 break; 3165 default: 3166 goto exit; 3167 } 3168 3169 ret = ublk_char_dev_permission(ub, dev_path, mask); 3170 if (!ret) { 3171 header->len -= header->dev_path_len; 3172 header->addr += header->dev_path_len; 3173 } 3174 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", 3175 __func__, ub->ub_number, cmd->cmd_op, 3176 ub->dev_info.owner_uid, ub->dev_info.owner_gid, 3177 dev_path, ret); 3178 exit: 3179 kfree(dev_path); 3180 return ret; 3181 } 3182 3183 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, 3184 unsigned int issue_flags) 3185 { 3186 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 3187 struct ublk_device *ub = NULL; 3188 u32 cmd_op = cmd->cmd_op; 3189 int ret = -EINVAL; 3190 3191 if (issue_flags & IO_URING_F_NONBLOCK) 3192 return -EAGAIN; 3193 3194 ublk_ctrl_cmd_dump(cmd); 3195 3196 if (!(issue_flags & IO_URING_F_SQE128)) 3197 goto out; 3198 3199 ret = ublk_check_cmd_op(cmd_op); 3200 if (ret) 3201 goto out; 3202 3203 if (cmd_op == UBLK_U_CMD_GET_FEATURES) { 3204 ret = ublk_ctrl_get_features(header); 3205 goto out; 3206 } 3207 3208 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { 3209 ret = -ENODEV; 3210 ub = ublk_get_device_from_id(header->dev_id); 3211 if (!ub) 3212 goto out; 3213 3214 ret = ublk_ctrl_uring_cmd_permission(ub, cmd); 3215 if (ret) 3216 goto
put_dev; 3217 } 3218 3219 switch (_IOC_NR(cmd_op)) { 3220 case UBLK_CMD_START_DEV: 3221 ret = ublk_ctrl_start_dev(ub, header); 3222 break; 3223 case UBLK_CMD_STOP_DEV: 3224 ret = ublk_ctrl_stop_dev(ub); 3225 break; 3226 case UBLK_CMD_GET_DEV_INFO: 3227 case UBLK_CMD_GET_DEV_INFO2: 3228 ret = ublk_ctrl_get_dev_info(ub, header); 3229 break; 3230 case UBLK_CMD_ADD_DEV: 3231 ret = ublk_ctrl_add_dev(header); 3232 break; 3233 case UBLK_CMD_DEL_DEV: 3234 ret = ublk_ctrl_del_dev(&ub, true); 3235 break; 3236 case UBLK_CMD_DEL_DEV_ASYNC: 3237 ret = ublk_ctrl_del_dev(&ub, false); 3238 break; 3239 case UBLK_CMD_GET_QUEUE_AFFINITY: 3240 ret = ublk_ctrl_get_queue_affinity(ub, header); 3241 break; 3242 case UBLK_CMD_GET_PARAMS: 3243 ret = ublk_ctrl_get_params(ub, header); 3244 break; 3245 case UBLK_CMD_SET_PARAMS: 3246 ret = ublk_ctrl_set_params(ub, header); 3247 break; 3248 case UBLK_CMD_START_USER_RECOVERY: 3249 ret = ublk_ctrl_start_recovery(ub, header); 3250 break; 3251 case UBLK_CMD_END_USER_RECOVERY: 3252 ret = ublk_ctrl_end_recovery(ub, header); 3253 break; 3254 default: 3255 ret = -EOPNOTSUPP; 3256 break; 3257 } 3258 3259 put_dev: 3260 if (ub) 3261 ublk_put_device(ub); 3262 out: 3263 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", 3264 __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); 3265 return ret; 3266 } 3267 3268 static const struct file_operations ublk_ctl_fops = { 3269 .open = nonseekable_open, 3270 .uring_cmd = ublk_ctrl_uring_cmd, 3271 .owner = THIS_MODULE, 3272 .llseek = noop_llseek, 3273 }; 3274 3275 static struct miscdevice ublk_misc = { 3276 .minor = MISC_DYNAMIC_MINOR, 3277 .name = "ublk-control", 3278 .fops = &ublk_ctl_fops, 3279 }; 3280 3281 static int __init ublk_init(void) 3282 { 3283 int ret; 3284 3285 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + 3286 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); 3287 3288 init_waitqueue_head(&ublk_idr_wq); 3289 3290 ret = misc_register(&ublk_misc); 3291 if (ret) 3292 return ret; 3293 3294 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char"); 3295 if (ret) 3296 goto unregister_mis; 3297 3298 ret = class_register(&ublk_chr_class); 3299 if (ret) 3300 goto free_chrdev_region; 3301 3302 return 0; 3303 3304 free_chrdev_region: 3305 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 3306 unregister_mis: 3307 misc_deregister(&ublk_misc); 3308 return ret; 3309 } 3310 3311 static void __exit ublk_exit(void) 3312 { 3313 struct ublk_device *ub; 3314 int id; 3315 3316 idr_for_each_entry(&ublk_index_idr, ub, id) 3317 ublk_remove(ub); 3318 3319 class_unregister(&ublk_chr_class); 3320 misc_deregister(&ublk_misc); 3321 3322 idr_destroy(&ublk_index_idr); 3323 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 3324 } 3325 3326 module_init(ublk_init); 3327 module_exit(ublk_exit); 3328 3329 static int ublk_set_max_unprivileged_ublks(const char *buf, 3330 const struct kernel_param *kp) 3331 { 3332 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); 3333 } 3334 3335 static int ublk_get_max_unprivileged_ublks(char *buf, 3336 const struct kernel_param *kp) 3337 { 3338 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max); 3339 } 3340 3341 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = { 3342 .set = ublk_set_max_unprivileged_ublks, 3343 .get = ublk_get_max_unprivileged_ublks, 3344 }; 3345 3346 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops, 3347 &unprivileged_ublks_max, 0644); 3348 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 
64)"); 3349 3350 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); 3351 MODULE_DESCRIPTION("Userspace block device"); 3352 MODULE_LICENSE("GPL"); 3353