// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - a block device whose IO is handled from userspace
 *
 * Takes full advantage of io_uring passthrough commands for communicating
 * with the ublk userspace daemon (ublksrvd) for handling basic IO requests.
 *
 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
 *
 * (part of code stolen from loop.c)
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
#include <linux/io_uring/cmd.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <linux/task_work.h>
#include <linux/namei.h>
#include <linux/kref.h>
#include <uapi/linux/ublk_cmd.h>

#define UBLK_MINORS		(1U << MINORBITS)

#define UBLK_INVALID_BUF_IDX	((u16)-1)

/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)

#define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)

/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
		| UBLK_F_URING_CMD_COMP_IN_TASK \
		| UBLK_F_NEED_GET_DATA \
		| UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED \
		| UBLK_F_USER_RECOVERY_FAIL_IO \
		| UBLK_F_UPDATE_SIZE \
		| UBLK_F_AUTO_BUF_REG \
		| UBLK_F_QUIESCE \
		| UBLK_F_PER_IO_DAEMON \
		| UBLK_F_BUF_REG_OFF_DAEMON)

#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_USER_RECOVERY_FAIL_IO)

/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL                                \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)

struct ublk_uring_cmd_pdu {
	/*
	 * Store requests in the same batch temporarily for queuing them to
	 * the daemon context.
	 *
	 * This could have been stored in the request payload, but we want to
	 * avoid the extra pre-allocation, and the uring_cmd payload is
	 * always free for us.
	 */
	union {
		struct request *req;
		struct request *req_list;
	};

	/*
	 * The following two are valid for the whole lifetime of this cmd,
	 * and are set up in the ublk uring_cmd handler.
	 */
	struct ublk_queue *ubq;

	u16 tag;
};

/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by the ublk driver and is
 * waiting for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command is completed and owned by the
 * ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used for
 * cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires
 * fetching the data buffer address from ublksrv.
 *
 * Then bio data can be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is
 * unset.
 */
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08

/*
 * The request buffer is registered automatically, so we have to unregister
 * it before completing this request.
 *
 * io_uring will unregister the buffer automatically for us on exit.
 */
#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10

/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000

/*
 * Initialize the refcount to a large number to include any registered
 * buffers. UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus
 * those for any buffers registered on the io daemon task.
 */
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)

struct ublk_io {
	/* userspace buffer address from io cmd */
	union {
		__u64	addr;
		struct ublk_auto_buf_reg buf;
	};
	unsigned int flags;
	int res;

	union {
		/* valid if UBLK_IO_FLAG_ACTIVE is set */
		struct io_uring_cmd *cmd;
		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
		struct request *req;
	};

	struct task_struct *task;

	/*
	 * The number of uses of this I/O by the ublk server
	 * if user copy or zero copy are enabled:
	 * - UBLK_REFCOUNT_INIT from dispatch to the server
	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
	 * - 1 for each inflight ublk_ch_{read,write}_iter() call
	 * - 1 for each io_uring registered buffer not registered on the task
	 * The I/O can only be completed once all references are dropped.
	 * User copy and buffer registration operations are only permitted
	 * if the reference count is nonzero.
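	 *
	 * Worked example (illustrative numbers): if the server registers two
	 * buffers on the daemon task and then sends
	 * UBLK_IO_COMMIT_AND_FETCH_REQ, the commit drops
	 * UBLK_REFCOUNT_INIT - 2 references, and each later ublk_io_release()
	 * drops one more, so the request only completes after both
	 * registered buffers have been released.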
	 */
	refcount_t ref;
	/* Count of buffers registered on task and not yet unregistered */
	unsigned task_registered_buffers;

	void *buf_ctx_handle;
} ____cacheline_aligned_in_smp;

struct ublk_queue {
	int q_id;
	int q_depth;

	unsigned long flags;
	struct ublksrv_io_desc *io_cmd_buf;

	bool force_abort;
	bool canceling;
	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
	unsigned short nr_io_ready;	/* how many ios setup */
	spinlock_t		cancel_lock;
	struct ublk_device *dev;
	struct ublk_io ios[];
};

struct ublk_device {
	struct gendisk		*ub_disk;

	char	*__queues;

	unsigned int	queue_size;
	struct ublksrv_ctrl_dev_info	dev_info;

	struct blk_mq_tag_set	tag_set;

	struct cdev		cdev;
	struct device		cdev_dev;

#define UB_STATE_OPEN		0
#define UB_STATE_USED		1
#define UB_STATE_DELETED	2
	unsigned long		state;
	int			ub_number;

	struct mutex		mutex;

	spinlock_t		lock;
	struct mm_struct	*mm;

	struct ublk_params	params;

	struct completion	completion;
	unsigned int		nr_queues_ready;
	bool			unprivileged_daemons;
	struct mutex		cancel_mutex;
	bool			canceling;
	pid_t			ublksrv_tgid;
	struct delayed_work	exit_work;
};

/* header of ublk_params */
struct ublk_params_header {
	__u32	len;
	__u32	types;
};

static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		const struct ublk_queue *ubq, struct ublk_io *io,
		size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);

static inline struct ublksrv_io_desc *
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
{
	return &ubq->io_cmd_buf[tag];
}

static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_ZONED;
}

static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_ZONED;
}

#ifdef CONFIG_BLK_DEV_ZONED

struct ublk_zoned_report_desc {
	__u64 sector;
	__u32 operation;
	__u32 nr_zones;
};

static DEFINE_XARRAY(ublk_zoned_report_descs);

static int ublk_zoned_insert_report_desc(const struct request *req,
		struct ublk_zoned_report_desc *desc)
{
	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
			desc, GFP_KERNEL);
}

static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
		const struct request *req)
{
	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
}

static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
		const struct request *req)
{
	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
}

static int ublk_get_nr_zones(const struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	/* Zone size is a power of 2 */
	return p->dev_sectors >> ilog2(p->chunk_sectors);
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return blk_revalidate_disk_zones(ub->ub_disk);
}

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	const struct ublk_param_zoned *p = &ub->params.zoned;
	int nr_zones;

	if (!ublk_dev_is_zoned(ub))
		return -EINVAL;

	if (!p->max_zone_append_sectors)
		return -EINVAL;

	nr_zones = ublk_get_nr_zones(ub);

	if (p->max_active_zones > nr_zones)
		return -EINVAL;

	if (p->max_open_zones > nr_zones)
		return -EINVAL;

	return 0;
}

static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}

/* Based on virtblk_alloc_report_buffer */
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
				      unsigned int nr_zones, size_t *buflen)
{
	struct request_queue *q = ublk->ub_disk->queue;
	size_t bufsize;
	void *buf;

	nr_zones = min_t(unsigned int, nr_zones,
			 ublk->ub_disk->nr_zones);

	bufsize = nr_zones * sizeof(struct blk_zone);
	bufsize =
		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);

	while (bufsize >= sizeof(struct blk_zone)) {
		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
		if (buf) {
			*buflen = bufsize;
			return buf;
		}
		bufsize >>= 1;
	}

	*buflen = 0;
	return NULL;
}

static int ublk_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct ublk_device *ub = disk->private_data;
	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
	unsigned int done_zones = 0;
	unsigned int max_zones_per_request;
	int ret;
	struct blk_zone *buffer;
	size_t buffer_length;

	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
			 nr_zones);

	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
	if (!buffer)
		return -ENOMEM;

	max_zones_per_request = buffer_length / sizeof(struct blk_zone);

	while (done_zones < nr_zones) {
		unsigned int remaining_zones = nr_zones - done_zones;
		unsigned int zones_in_request =
			min_t(unsigned int, remaining_zones, max_zones_per_request);
		struct request *req;
		struct ublk_zoned_report_desc desc;
		blk_status_t status;

		memset(buffer, 0, buffer_length);

		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			goto out;
		}

		desc.operation = UBLK_IO_OP_REPORT_ZONES;
		desc.sector = sector;
		desc.nr_zones = zones_in_request;
		ret = ublk_zoned_insert_report_desc(req, &desc);
		if (ret)
			goto free_req;

		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
		if (ret)
			goto erase_desc;

		status = blk_execute_rq(req, 0);
		ret = blk_status_to_errno(status);
erase_desc:
		ublk_zoned_erase_report_desc(req);
free_req:
		blk_mq_free_request(req);
		if (ret)
			goto out;

		for (unsigned int i = 0; i < zones_in_request; i++) {
			struct blk_zone *zone = buffer + i;

			/* A zero length zone means no more zones in this response */
			if (!zone->len)
				break;

			ret = cb(zone, i, data);
			if (ret)
				goto out;

			done_zones++;
			sector += zone_size_sectors;

		}
	}

	ret = done_zones;

out:
	kvfree(buffer);
	return ret;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	struct ublk_zoned_report_desc *desc;
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_ZONE_OPEN:
		ublk_op = UBLK_IO_OP_ZONE_OPEN;
		break;
	case REQ_OP_ZONE_CLOSE:
		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
		break;
	case REQ_OP_ZONE_FINISH:
		ublk_op = UBLK_IO_OP_ZONE_FINISH;
		break;
	case REQ_OP_ZONE_RESET:
		ublk_op = UBLK_IO_OP_ZONE_RESET;
		break;
	case REQ_OP_ZONE_APPEND:
		ublk_op = UBLK_IO_OP_ZONE_APPEND;
		break;
	case REQ_OP_ZONE_RESET_ALL:
		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
		break;
	case REQ_OP_DRV_IN:
		desc = ublk_zoned_get_report_desc(req);
		if (!desc)
			return BLK_STS_IOERR;
		ublk_op = desc->operation;
		switch (ublk_op) {
		case UBLK_IO_OP_REPORT_ZONES:
			iod->op_flags = ublk_op | ublk_req_build_flags(req);
			iod->nr_zones = desc->nr_zones;
			iod->start_sector = desc->sector;
			return BLK_STS_OK;
		default:
			return BLK_STS_IOERR;
		}
	case REQ_OP_DRV_OUT:
		/* We do not support drv_out */
		return BLK_STS_NOTSUPP;
	default:
		return BLK_STS_IOERR;
	}

	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->addr;

	return BLK_STS_OK;
}

#else

#define ublk_report_zones (NULL)

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	return BLK_STS_NOTSUPP;
}

#endif

static inline void __ublk_complete_rq(struct request *req);

static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
	.name = "ublk-char",
};

static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

static DEFINE_MUTEX(ublk_ctl_mutex);


#define UBLK_MAX_UBLKS UBLK_MINORS

/*
 * Max number of unprivileged ublk devices allowed to be added
 *
 * It can be extended to a per-user limit in the future, or even be
 * controlled by cgroup.
 */
static unsigned int unprivileged_ublks_max = 64;
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */

static struct miscdevice ublk_misc;

static inline unsigned ublk_pos_to_hwq(loff_t pos)
{
	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
		UBLK_QID_BITS_MASK;
}

static inline unsigned ublk_pos_to_buf_off(loff_t pos)
{
	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
}

static inline unsigned ublk_pos_to_tag(loff_t pos)
{
	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
		UBLK_TAG_BITS_MASK;
}

static void ublk_dev_param_basic_apply(struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	if (p->attrs & UBLK_ATTR_READ_ONLY)
		set_disk_ro(ub->ub_disk, true);

	set_capacity(ub->ub_disk, p->dev_sectors);
}

static int ublk_validate_params(const struct ublk_device *ub)
{
	/* basic param is the only one which must be set */
	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
		const struct ublk_param_basic *p = &ub->params.basic;

		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
			return -EINVAL;

		if (p->logical_bs_shift > p->physical_bs_shift)
			return -EINVAL;

		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
			return -EINVAL;

		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
			return -EINVAL;
	} else
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *p = &ub->params.discard;

		/* So far, only single segment discard is supported */
		if (p->max_discard_sectors && p->max_discard_segments != 1)
			return -EINVAL;

		if (!p->discard_granularity)
			return -EINVAL;
	}

	/* dev_t is read-only */
	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		return ublk_dev_param_zoned_validate(ub);
	else if (ublk_dev_is_zoned(ub))
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
		const struct ublk_param_dma_align *p = &ub->params.dma;

		if (p->alignment >= PAGE_SIZE)
			return -EINVAL;

		if (!is_power_of_2(p->alignment + 1))
			return -EINVAL;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
		const struct ublk_param_segment *p = &ub->params.seg;

		if (!is_power_of_2(p->seg_boundary_mask + 1))
			return -EINVAL;

		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
			return -EINVAL;
		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
			return -EINVAL;
	}

	return 0;
}

static void ublk_apply_params(struct ublk_device *ub)
{
	ublk_dev_param_basic_apply(ub);

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		ublk_dev_param_zoned_apply(ub);
}

static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}

static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_AUTO_BUF_REG;
}

static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_USER_COPY;
}

static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
	       !ublk_support_auto_buf_reg(ubq);
}

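/*
 * For example (illustrative): a device created with only UBLK_F_NEED_GET_DATA
 * relies on the copy path below (ublk_map_io()/ublk_unmap_io()) to move data
 * between the request pages and the server buffer at io->addr, while devices
 * using UBLK_F_USER_COPY, UBLK_F_SUPPORT_ZERO_COPY or UBLK_F_AUTO_BUF_REG
 * move data through the char device or io_uring registered buffers instead,
 * so the copy helpers return early for them.
 */
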
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
	/*
	 * read()/write() is involved in user copy, so the request reference
	 * has to be grabbed
	 *
	 * for zero copy, the request buffer needs to be registered in the
	 * io_uring buffer table, so a reference is needed
	 *
	 * For auto buffer register, the ublk server may still issue
	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used
	 * up, so a reference is required too.
	 */
	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
		ublk_support_auto_buf_reg(ubq);
}

static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
		struct ublk_io *io)
{
	if (ublk_need_req_ref(ubq))
		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
}

static inline bool ublk_get_req_ref(struct ublk_io *io)
{
	return refcount_inc_not_zero(&io->ref);
}

static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
{
	if (refcount_dec_and_test(&io->ref))
		__ublk_complete_rq(req);
}

static inline bool ublk_sub_req_ref(struct ublk_io *io)
{
	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;

	io->task_registered_buffers = 0;
	return refcount_sub_and_test(sub_refs, &io->ref);
}

static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_NEED_GET_DATA;
}

/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
		return ub;
	return NULL;
}

/* Called in slow path only, keep it noinline for trace purpose */
static noinline void ublk_put_device(struct ublk_device *ub)
{
	put_device(&ub->cdev_dev);
}

static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
		int qid)
{
	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
}

static inline bool ublk_rq_has_data(const struct request *rq)
{
	return bio_has_data(rq->bio);
}

static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
	return ublk_get_queue(ub, q_id)->io_cmd_buf;
}

static inline int __ublk_queue_cmd_buf_size(int depth)
{
	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
}

static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	return __ublk_queue_cmd_buf_size(ubq->q_depth);
}

static int ublk_max_cmd_buf_size(void)
{
	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
}

/*
 * Should I/O outstanding to the ublk server when it exits be reissued?
 * If not, outstanding I/O will get errors.
 */
static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
{
	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
}

/*
 * Should I/O issued while there is no ublk server be queued? If not, I/O
 * issued while there is no ublk server will get errors.
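 *
 * As a rough guide to the recovery flag combinations handled by the helpers
 * around here (see the individual checks for the exact rules):
 * - no UBLK_F_USER_RECOVERY: the device is stopped when the server exits
 * - UBLK_F_USER_RECOVERY: new I/O is queued, outstanding I/O is failed
 * - UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE: outstanding I/O is
 *   requeued as well
 * - UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO: I/O is failed while
 *   no ublk server is available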
792 */ 793 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) 794 { 795 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && 796 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO); 797 } 798 799 /* 800 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy 801 * of the device flags for smaller cache footprint - better for fast 802 * paths. 803 */ 804 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) 805 { 806 return (ubq->flags & UBLK_F_USER_RECOVERY) && 807 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO); 808 } 809 810 /* 811 * Should ublk devices be stopped (i.e. no recovery possible) when the 812 * ublk server exits? If not, devices can be used again by a future 813 * incarnation of a ublk server via the start_recovery/end_recovery 814 * commands. 815 */ 816 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) 817 { 818 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); 819 } 820 821 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub) 822 { 823 return ub->dev_info.state == UBLK_S_DEV_QUIESCED || 824 ub->dev_info.state == UBLK_S_DEV_FAIL_IO; 825 } 826 827 static void ublk_free_disk(struct gendisk *disk) 828 { 829 struct ublk_device *ub = disk->private_data; 830 831 clear_bit(UB_STATE_USED, &ub->state); 832 ublk_put_device(ub); 833 } 834 835 static void ublk_store_owner_uid_gid(unsigned int *owner_uid, 836 unsigned int *owner_gid) 837 { 838 kuid_t uid; 839 kgid_t gid; 840 841 current_uid_gid(&uid, &gid); 842 843 *owner_uid = from_kuid(&init_user_ns, uid); 844 *owner_gid = from_kgid(&init_user_ns, gid); 845 } 846 847 static int ublk_open(struct gendisk *disk, blk_mode_t mode) 848 { 849 struct ublk_device *ub = disk->private_data; 850 851 if (capable(CAP_SYS_ADMIN)) 852 return 0; 853 854 /* 855 * If it is one unprivileged device, only owner can open 856 * the disk. Otherwise it could be one trap made by one 857 * evil user who grants this disk's privileges to other 858 * users deliberately. 859 * 860 * This way is reasonable too given anyone can create 861 * unprivileged device, and no need other's grant. 
862 */ 863 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { 864 unsigned int curr_uid, curr_gid; 865 866 ublk_store_owner_uid_gid(&curr_uid, &curr_gid); 867 868 if (curr_uid != ub->dev_info.owner_uid || curr_gid != 869 ub->dev_info.owner_gid) 870 return -EPERM; 871 } 872 873 return 0; 874 } 875 876 static const struct block_device_operations ub_fops = { 877 .owner = THIS_MODULE, 878 .open = ublk_open, 879 .free_disk = ublk_free_disk, 880 .report_zones = ublk_report_zones, 881 }; 882 883 #define UBLK_MAX_PIN_PAGES 32 884 885 struct ublk_io_iter { 886 struct page *pages[UBLK_MAX_PIN_PAGES]; 887 struct bio *bio; 888 struct bvec_iter iter; 889 }; 890 891 /* return how many pages are copied */ 892 static void ublk_copy_io_pages(struct ublk_io_iter *data, 893 size_t total, size_t pg_off, int dir) 894 { 895 unsigned done = 0; 896 unsigned pg_idx = 0; 897 898 while (done < total) { 899 struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); 900 unsigned int bytes = min3(bv.bv_len, (unsigned)total - done, 901 (unsigned)(PAGE_SIZE - pg_off)); 902 void *bv_buf = bvec_kmap_local(&bv); 903 void *pg_buf = kmap_local_page(data->pages[pg_idx]); 904 905 if (dir == ITER_DEST) 906 memcpy(pg_buf + pg_off, bv_buf, bytes); 907 else 908 memcpy(bv_buf, pg_buf + pg_off, bytes); 909 910 kunmap_local(pg_buf); 911 kunmap_local(bv_buf); 912 913 /* advance page array */ 914 pg_off += bytes; 915 if (pg_off == PAGE_SIZE) { 916 pg_idx += 1; 917 pg_off = 0; 918 } 919 920 done += bytes; 921 922 /* advance bio */ 923 bio_advance_iter_single(data->bio, &data->iter, bytes); 924 if (!data->iter.bi_size) { 925 data->bio = data->bio->bi_next; 926 if (data->bio == NULL) 927 break; 928 data->iter = data->bio->bi_iter; 929 } 930 } 931 } 932 933 static bool ublk_advance_io_iter(const struct request *req, 934 struct ublk_io_iter *iter, unsigned int offset) 935 { 936 struct bio *bio = req->bio; 937 938 for_each_bio(bio) { 939 if (bio->bi_iter.bi_size > offset) { 940 iter->bio = bio; 941 iter->iter = bio->bi_iter; 942 bio_advance_iter(iter->bio, &iter->iter, offset); 943 return true; 944 } 945 offset -= bio->bi_iter.bi_size; 946 } 947 return false; 948 } 949 950 /* 951 * Copy data between request pages and io_iter, and 'offset' 952 * is the start point of linear offset of request. 
953 */ 954 static size_t ublk_copy_user_pages(const struct request *req, 955 unsigned offset, struct iov_iter *uiter, int dir) 956 { 957 struct ublk_io_iter iter; 958 size_t done = 0; 959 960 if (!ublk_advance_io_iter(req, &iter, offset)) 961 return 0; 962 963 while (iov_iter_count(uiter) && iter.bio) { 964 unsigned nr_pages; 965 ssize_t len; 966 size_t off; 967 int i; 968 969 len = iov_iter_get_pages2(uiter, iter.pages, 970 iov_iter_count(uiter), 971 UBLK_MAX_PIN_PAGES, &off); 972 if (len <= 0) 973 return done; 974 975 ublk_copy_io_pages(&iter, len, off, dir); 976 nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE); 977 for (i = 0; i < nr_pages; i++) { 978 if (dir == ITER_DEST) 979 set_page_dirty(iter.pages[i]); 980 put_page(iter.pages[i]); 981 } 982 done += len; 983 } 984 985 return done; 986 } 987 988 static inline bool ublk_need_map_req(const struct request *req) 989 { 990 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; 991 } 992 993 static inline bool ublk_need_unmap_req(const struct request *req) 994 { 995 return ublk_rq_has_data(req) && 996 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); 997 } 998 999 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, 1000 const struct ublk_io *io) 1001 { 1002 const unsigned int rq_bytes = blk_rq_bytes(req); 1003 1004 if (!ublk_need_map_io(ubq)) 1005 return rq_bytes; 1006 1007 /* 1008 * no zero copy, we delay copy WRITE request data into ublksrv 1009 * context and the big benefit is that pinning pages in current 1010 * context is pretty fast, see ublk_pin_user_pages 1011 */ 1012 if (ublk_need_map_req(req)) { 1013 struct iov_iter iter; 1014 const int dir = ITER_DEST; 1015 1016 import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter); 1017 return ublk_copy_user_pages(req, 0, &iter, dir); 1018 } 1019 return rq_bytes; 1020 } 1021 1022 static int ublk_unmap_io(const struct ublk_queue *ubq, 1023 const struct request *req, 1024 const struct ublk_io *io) 1025 { 1026 const unsigned int rq_bytes = blk_rq_bytes(req); 1027 1028 if (!ublk_need_map_io(ubq)) 1029 return rq_bytes; 1030 1031 if (ublk_need_unmap_req(req)) { 1032 struct iov_iter iter; 1033 const int dir = ITER_SOURCE; 1034 1035 WARN_ON_ONCE(io->res > rq_bytes); 1036 1037 import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter); 1038 return ublk_copy_user_pages(req, 0, &iter, dir); 1039 } 1040 return rq_bytes; 1041 } 1042 1043 static inline unsigned int ublk_req_build_flags(struct request *req) 1044 { 1045 unsigned flags = 0; 1046 1047 if (req->cmd_flags & REQ_FAILFAST_DEV) 1048 flags |= UBLK_IO_F_FAILFAST_DEV; 1049 1050 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT) 1051 flags |= UBLK_IO_F_FAILFAST_TRANSPORT; 1052 1053 if (req->cmd_flags & REQ_FAILFAST_DRIVER) 1054 flags |= UBLK_IO_F_FAILFAST_DRIVER; 1055 1056 if (req->cmd_flags & REQ_META) 1057 flags |= UBLK_IO_F_META; 1058 1059 if (req->cmd_flags & REQ_FUA) 1060 flags |= UBLK_IO_F_FUA; 1061 1062 if (req->cmd_flags & REQ_NOUNMAP) 1063 flags |= UBLK_IO_F_NOUNMAP; 1064 1065 if (req->cmd_flags & REQ_SWAP) 1066 flags |= UBLK_IO_F_SWAP; 1067 1068 return flags; 1069 } 1070 1071 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) 1072 { 1073 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 1074 struct ublk_io *io = &ubq->ios[req->tag]; 1075 enum req_op op = req_op(req); 1076 u32 ublk_op; 1077 1078 if (!ublk_queue_is_zoned(ubq) && 1079 (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) 1080 return BLK_STS_IOERR; 1081 1082 switch (req_op(req)) { 1083 case REQ_OP_READ: 
		ublk_op = UBLK_IO_OP_READ;
		break;
	case REQ_OP_WRITE:
		ublk_op = UBLK_IO_OP_WRITE;
		break;
	case REQ_OP_FLUSH:
		ublk_op = UBLK_IO_OP_FLUSH;
		break;
	case REQ_OP_DISCARD:
		ublk_op = UBLK_IO_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
		break;
	default:
		if (ublk_queue_is_zoned(ubq))
			return ublk_setup_iod_zoned(ubq, req);
		return BLK_STS_IOERR;
	}

	/* need to translate since the kernel may change these */
	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->addr;

	return BLK_STS_OK;
}

static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
}

/* todo: handle partial completion */
static inline void __ublk_complete_rq(struct request *req)
{
	struct ublk_queue *ubq = req->mq_hctx->driver_data;
	struct ublk_io *io = &ubq->ios[req->tag];
	unsigned int unmapped_bytes;
	blk_status_t res = BLK_STS_OK;

	/* fail the read IO if nothing was read */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		res = errno_to_blk_status(io->res);
		goto exit;
	}

	/*
	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return valid payload
	 * bytes, so end them directly.
	 *
	 * None of them needs unmapping.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
	    req_op(req) != REQ_OP_DRV_IN)
		goto exit;

	/* for READ request, writing data in iod->addr to rq buffers */
	unmapped_bytes = ublk_unmap_io(ubq, req, io);

	/*
	 * Extremely unlikely since we just filled in the data;
	 * re-read simply for this unlikely case.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	if (blk_update_request(req, BLK_STS_OK, io->res))
		blk_mq_requeue_request(req, true);
	else if (likely(!blk_should_fake_timeout(req->q)))
		__blk_mq_end_request(req, BLK_STS_OK);

	return;
exit:
	blk_mq_end_request(req, res);
}

static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
						     struct request *req)
{
	/* read cmd first because req will overwrite it */
	struct io_uring_cmd *cmd = io->cmd;

	/* mark this cmd owned by ublksrv */
	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;

	/*
	 * clear ACTIVE since we are done with this sqe/cmd slot
	 * We can only accept an io cmd when it is not active.
	 */
	io->flags &= ~UBLK_IO_FLAG_ACTIVE;

	io->req = req;
	return cmd;
}

static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
				 int res, unsigned issue_flags)
{
	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(cmd, res, 0, issue_flags);
}

#define UBLK_REQUEUE_DELAY_MS	3

static inline void __ublk_abort_rq(struct ublk_queue *ubq,
		struct request *rq)
{
	/*
	 * We cannot process this rq, so requeue it if recovery may bring the
	 * server back, otherwise fail it.
	 */
	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
		blk_mq_requeue_request(rq, false);
	else
		blk_mq_end_request(rq, BLK_STS_IOERR);
}

static void
ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
{
	unsigned tag = io - ubq->ios;
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);

	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
}

static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
			      struct ublk_io *io, unsigned int issue_flags)
{
	int ret;

	ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
				      io->buf.index, issue_flags);
	if (ret) {
		if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
			ublk_auto_buf_reg_fallback(ubq, io);
			return true;
		}
		blk_mq_end_request(req, BLK_STS_IOERR);
		return false;
	}

	io->task_registered_buffers = 1;
	io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
	io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
	return true;
}

static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
				   struct request *req, struct ublk_io *io,
				   unsigned int issue_flags)
{
	ublk_init_req_ref(ubq, io);
	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
		return ublk_auto_buf_reg(ubq, req, io, issue_flags);

	return true;
}

static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
			  struct ublk_io *io)
{
	unsigned mapped_bytes = ublk_map_io(ubq, req, io);

	/* partially mapped, update io descriptor */
	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
		/*
		 * Nothing mapped, retry until we succeed.
		 *
		 * We may never succeed in mapping any bytes here because
		 * of OOM. TODO: reserve one buffer with single page pinned
		 * for providing forward progress guarantee.
		 */
		if (unlikely(!mapped_bytes)) {
			blk_mq_requeue_request(req, false);
			blk_mq_delay_kick_requeue_list(req->q,
					UBLK_REQUEUE_DELAY_MS);
			return false;
		}

		ublk_get_iod(ubq, req->tag)->nr_sectors =
			mapped_bytes >> 9;
	}

	return true;
}

static void ublk_dispatch_req(struct ublk_queue *ubq,
			      struct request *req,
			      unsigned int issue_flags)
{
	int tag = req->tag;
	struct ublk_io *io = &ubq->ios[tag];

	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
			__func__, ubq->q_id, req->tag, io->flags,
			ublk_get_iod(ubq, req->tag)->addr);

	/*
	 * Task is exiting if either:
	 *
	 * (1) current != io->task.
	 * io_uring_cmd_complete_in_task() tries to run task_work
	 * in a workqueue if cmd's task is PF_EXITING.
	 *
	 * (2) current->flags & PF_EXITING.
	 */
	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
		__ublk_abort_rq(ubq, req);
		return;
	}

	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
		/*
		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
		 * and notify it.
		 */
		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
				__func__, ubq->q_id, req->tag, io->flags);
		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
				issue_flags);
		return;
	}

	if (!ublk_start_io(ubq, req, io))
		return;

	if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
}

static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
			   unsigned int issue_flags)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct ublk_queue *ubq = pdu->ubq;

	ublk_dispatch_req(ubq, pdu->req, issue_flags);
}

static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

	pdu->req = rq;
	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
}

static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct request *rq = pdu->req_list;
	struct request *next;

	do {
		next = rq->rq_next;
		rq->rq_next = NULL;
		ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
		rq = next;
	} while (rq);
}

static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
{
	struct io_uring_cmd *cmd = io->cmd;
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

	pdu->req_list = rq_list_peek(l);
	rq_list_init(l);
	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
}

static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
	pid_t tgid = ubq->dev->ublksrv_tgid;
	struct task_struct *p;
	struct pid *pid;

	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
		return BLK_EH_RESET_TIMER;

	if (unlikely(!tgid))
		return BLK_EH_RESET_TIMER;

	rcu_read_lock();
	pid = find_vpid(tgid);
	p = pid_task(pid, PIDTYPE_PID);
	if (p)
		send_sig(SIGKILL, p, 0);
	rcu_read_unlock();
	return BLK_EH_DONE;
}

static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
				  bool check_cancel)
{
	blk_status_t res;

	if (unlikely(READ_ONCE(ubq->fail_io)))
		return BLK_STS_TARGET;

	/*
	 * With the recovery feature enabled, force_abort is set in
	 * ublk_stop_dev() before calling del_gendisk(). We have to
	 * abort all requeued and new rqs here to let del_gendisk()
	 * move on. Besides, we cannot call io_uring_cmd_complete_in_task(),
	 * to avoid UAF on the io_uring ctx.
	 *
	 * Note: force_abort is guaranteed to be seen because it is set
	 * before the request queue is unquiesced.
1404 */ 1405 if (ublk_nosrv_should_queue_io(ubq) && 1406 unlikely(READ_ONCE(ubq->force_abort))) 1407 return BLK_STS_IOERR; 1408 1409 if (check_cancel && unlikely(ubq->canceling)) 1410 return BLK_STS_IOERR; 1411 1412 /* fill iod to slot in io cmd buffer */ 1413 res = ublk_setup_iod(ubq, rq); 1414 if (unlikely(res != BLK_STS_OK)) 1415 return BLK_STS_IOERR; 1416 1417 blk_mq_start_request(rq); 1418 return BLK_STS_OK; 1419 } 1420 1421 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, 1422 const struct blk_mq_queue_data *bd) 1423 { 1424 struct ublk_queue *ubq = hctx->driver_data; 1425 struct request *rq = bd->rq; 1426 blk_status_t res; 1427 1428 res = ublk_prep_req(ubq, rq, false); 1429 if (res != BLK_STS_OK) 1430 return res; 1431 1432 /* 1433 * ->canceling has to be handled after ->force_abort and ->fail_io 1434 * is dealt with, otherwise this request may not be failed in case 1435 * of recovery, and cause hang when deleting disk 1436 */ 1437 if (unlikely(ubq->canceling)) { 1438 __ublk_abort_rq(ubq, rq); 1439 return BLK_STS_OK; 1440 } 1441 1442 ublk_queue_cmd(ubq, rq); 1443 return BLK_STS_OK; 1444 } 1445 1446 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, 1447 const struct ublk_io *io2) 1448 { 1449 return (io_uring_cmd_ctx_handle(io->cmd) == 1450 io_uring_cmd_ctx_handle(io2->cmd)) && 1451 (io->task == io2->task); 1452 } 1453 1454 static void ublk_queue_rqs(struct rq_list *rqlist) 1455 { 1456 struct rq_list requeue_list = { }; 1457 struct rq_list submit_list = { }; 1458 struct ublk_io *io = NULL; 1459 struct request *req; 1460 1461 while ((req = rq_list_pop(rqlist))) { 1462 struct ublk_queue *this_q = req->mq_hctx->driver_data; 1463 struct ublk_io *this_io = &this_q->ios[req->tag]; 1464 1465 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { 1466 rq_list_add_tail(&requeue_list, req); 1467 continue; 1468 } 1469 1470 if (io && !ublk_belong_to_same_batch(io, this_io) && 1471 !rq_list_empty(&submit_list)) 1472 ublk_queue_cmd_list(io, &submit_list); 1473 io = this_io; 1474 rq_list_add_tail(&submit_list, req); 1475 } 1476 1477 if (!rq_list_empty(&submit_list)) 1478 ublk_queue_cmd_list(io, &submit_list); 1479 *rqlist = requeue_list; 1480 } 1481 1482 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, 1483 unsigned int hctx_idx) 1484 { 1485 struct ublk_device *ub = driver_data; 1486 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num); 1487 1488 hctx->driver_data = ubq; 1489 return 0; 1490 } 1491 1492 static const struct blk_mq_ops ublk_mq_ops = { 1493 .queue_rq = ublk_queue_rq, 1494 .queue_rqs = ublk_queue_rqs, 1495 .init_hctx = ublk_init_hctx, 1496 .timeout = ublk_timeout, 1497 }; 1498 1499 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) 1500 { 1501 int i; 1502 1503 /* All old ioucmds have to be completed */ 1504 ubq->nr_io_ready = 0; 1505 1506 for (i = 0; i < ubq->q_depth; i++) { 1507 struct ublk_io *io = &ubq->ios[i]; 1508 1509 /* 1510 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch 1511 * io->cmd 1512 */ 1513 io->flags &= UBLK_IO_FLAG_CANCELED; 1514 io->cmd = NULL; 1515 io->addr = 0; 1516 1517 /* 1518 * old task is PF_EXITING, put it now 1519 * 1520 * It could be NULL in case of closing one quiesced 1521 * device. 
1522 */ 1523 if (io->task) { 1524 put_task_struct(io->task); 1525 io->task = NULL; 1526 } 1527 1528 WARN_ON_ONCE(refcount_read(&io->ref)); 1529 WARN_ON_ONCE(io->task_registered_buffers); 1530 } 1531 } 1532 1533 static int ublk_ch_open(struct inode *inode, struct file *filp) 1534 { 1535 struct ublk_device *ub = container_of(inode->i_cdev, 1536 struct ublk_device, cdev); 1537 1538 if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) 1539 return -EBUSY; 1540 filp->private_data = ub; 1541 ub->ublksrv_tgid = current->tgid; 1542 return 0; 1543 } 1544 1545 static void ublk_reset_ch_dev(struct ublk_device *ub) 1546 { 1547 int i; 1548 1549 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1550 ublk_queue_reinit(ub, ublk_get_queue(ub, i)); 1551 1552 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ 1553 ub->mm = NULL; 1554 ub->nr_queues_ready = 0; 1555 ub->unprivileged_daemons = false; 1556 ub->ublksrv_tgid = -1; 1557 } 1558 1559 static struct gendisk *ublk_get_disk(struct ublk_device *ub) 1560 { 1561 struct gendisk *disk; 1562 1563 spin_lock(&ub->lock); 1564 disk = ub->ub_disk; 1565 if (disk) 1566 get_device(disk_to_dev(disk)); 1567 spin_unlock(&ub->lock); 1568 1569 return disk; 1570 } 1571 1572 static void ublk_put_disk(struct gendisk *disk) 1573 { 1574 if (disk) 1575 put_device(disk_to_dev(disk)); 1576 } 1577 1578 /* 1579 * Use this function to ensure that ->canceling is consistently set for 1580 * the device and all queues. Do not set these flags directly. 1581 * 1582 * Caller must ensure that: 1583 * - cancel_mutex is held. This ensures that there is no concurrent 1584 * access to ub->canceling and no concurrent writes to ubq->canceling. 1585 * - there are no concurrent reads of ubq->canceling from the queue_rq 1586 * path. This can be done by quiescing the queue, or through other 1587 * means. 1588 */ 1589 static void ublk_set_canceling(struct ublk_device *ub, bool canceling) 1590 __must_hold(&ub->cancel_mutex) 1591 { 1592 int i; 1593 1594 ub->canceling = canceling; 1595 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1596 ublk_get_queue(ub, i)->canceling = canceling; 1597 } 1598 1599 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub) 1600 { 1601 int i, j; 1602 1603 if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | 1604 UBLK_F_AUTO_BUF_REG))) 1605 return false; 1606 1607 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 1608 struct ublk_queue *ubq = ublk_get_queue(ub, i); 1609 1610 for (j = 0; j < ubq->q_depth; j++) { 1611 struct ublk_io *io = &ubq->ios[j]; 1612 unsigned int refs = refcount_read(&io->ref) + 1613 io->task_registered_buffers; 1614 1615 /* 1616 * UBLK_REFCOUNT_INIT or zero means no active 1617 * reference 1618 */ 1619 if (refs != UBLK_REFCOUNT_INIT && refs != 0) 1620 return true; 1621 1622 /* reset to zero if the io hasn't active references */ 1623 refcount_set(&io->ref, 0); 1624 io->task_registered_buffers = 0; 1625 } 1626 } 1627 return false; 1628 } 1629 1630 static void ublk_ch_release_work_fn(struct work_struct *work) 1631 { 1632 struct ublk_device *ub = 1633 container_of(work, struct ublk_device, exit_work.work); 1634 struct gendisk *disk; 1635 int i; 1636 1637 /* 1638 * For zero-copy and auto buffer register modes, I/O references 1639 * might not be dropped naturally when the daemon is killed, but 1640 * io_uring guarantees that registered bvec kernel buffers are 1641 * unregistered finally when freeing io_uring context, then the 1642 * active references are dropped. 
1643 * 1644 * Wait until active references are dropped for avoiding use-after-free 1645 * 1646 * registered buffer may be unregistered in io_ring's release hander, 1647 * so have to wait by scheduling work function for avoiding the two 1648 * file release dependency. 1649 */ 1650 if (ublk_check_and_reset_active_ref(ub)) { 1651 schedule_delayed_work(&ub->exit_work, 1); 1652 return; 1653 } 1654 1655 /* 1656 * disk isn't attached yet, either device isn't live, or it has 1657 * been removed already, so we needn't to do anything 1658 */ 1659 disk = ublk_get_disk(ub); 1660 if (!disk) 1661 goto out; 1662 1663 /* 1664 * All uring_cmd are done now, so abort any request outstanding to 1665 * the ublk server 1666 * 1667 * This can be done in lockless way because ublk server has been 1668 * gone 1669 * 1670 * More importantly, we have to provide forward progress guarantee 1671 * without holding ub->mutex, otherwise control task grabbing 1672 * ub->mutex triggers deadlock 1673 * 1674 * All requests may be inflight, so ->canceling may not be set, set 1675 * it now. 1676 */ 1677 mutex_lock(&ub->cancel_mutex); 1678 ublk_set_canceling(ub, true); 1679 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1680 ublk_abort_queue(ub, ublk_get_queue(ub, i)); 1681 mutex_unlock(&ub->cancel_mutex); 1682 blk_mq_kick_requeue_list(disk->queue); 1683 1684 /* 1685 * All infligh requests have been completed or requeued and any new 1686 * request will be failed or requeued via `->canceling` now, so it is 1687 * fine to grab ub->mutex now. 1688 */ 1689 mutex_lock(&ub->mutex); 1690 1691 /* double check after grabbing lock */ 1692 if (!ub->ub_disk) 1693 goto unlock; 1694 1695 /* 1696 * Transition the device to the nosrv state. What exactly this 1697 * means depends on the recovery flags 1698 */ 1699 if (ublk_nosrv_should_stop_dev(ub)) { 1700 /* 1701 * Allow any pending/future I/O to pass through quickly 1702 * with an error. This is needed because del_gendisk 1703 * waits for all pending I/O to complete 1704 */ 1705 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1706 WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true); 1707 1708 ublk_stop_dev_unlocked(ub); 1709 } else { 1710 if (ublk_nosrv_dev_should_queue_io(ub)) { 1711 /* ->canceling is set and all requests are aborted */ 1712 ub->dev_info.state = UBLK_S_DEV_QUIESCED; 1713 } else { 1714 ub->dev_info.state = UBLK_S_DEV_FAIL_IO; 1715 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1716 WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true); 1717 } 1718 } 1719 unlock: 1720 mutex_unlock(&ub->mutex); 1721 ublk_put_disk(disk); 1722 1723 /* all uring_cmd has been done now, reset device & ubq */ 1724 ublk_reset_ch_dev(ub); 1725 out: 1726 clear_bit(UB_STATE_OPEN, &ub->state); 1727 1728 /* put the reference grabbed in ublk_ch_release() */ 1729 ublk_put_device(ub); 1730 } 1731 1732 static int ublk_ch_release(struct inode *inode, struct file *filp) 1733 { 1734 struct ublk_device *ub = filp->private_data; 1735 1736 /* 1737 * Grab ublk device reference, so it won't be gone until we are 1738 * really released from work function. 
1739 */ 1740 ublk_get_device(ub); 1741 1742 INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn); 1743 schedule_delayed_work(&ub->exit_work, 0); 1744 return 0; 1745 } 1746 1747 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */ 1748 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) 1749 { 1750 struct ublk_device *ub = filp->private_data; 1751 size_t sz = vma->vm_end - vma->vm_start; 1752 unsigned max_sz = ublk_max_cmd_buf_size(); 1753 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; 1754 int q_id, ret = 0; 1755 1756 spin_lock(&ub->lock); 1757 if (!ub->mm) 1758 ub->mm = current->mm; 1759 if (current->mm != ub->mm) 1760 ret = -EINVAL; 1761 spin_unlock(&ub->lock); 1762 1763 if (ret) 1764 return ret; 1765 1766 if (vma->vm_flags & VM_WRITE) 1767 return -EPERM; 1768 1769 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz; 1770 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end) 1771 return -EINVAL; 1772 1773 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz; 1774 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n", 1775 __func__, q_id, current->pid, vma->vm_start, 1776 phys_off, (unsigned long)sz); 1777 1778 if (sz != ublk_queue_cmd_buf_size(ub, q_id)) 1779 return -EINVAL; 1780 1781 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; 1782 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 1783 } 1784 1785 static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, 1786 struct request *req) 1787 { 1788 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); 1789 1790 if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) 1791 blk_mq_requeue_request(req, false); 1792 else { 1793 io->res = -EIO; 1794 __ublk_complete_rq(req); 1795 } 1796 } 1797 1798 /* 1799 * Called from ublk char device release handler, when any uring_cmd is 1800 * done, meantime request queue is "quiesced" since all inflight requests 1801 * can't be completed because ublk server is dead. 
1802 * 1803 * So no one can hold our request IO reference any more, simply ignore the 1804 * reference, and complete the request immediately 1805 */ 1806 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) 1807 { 1808 int i; 1809 1810 for (i = 0; i < ubq->q_depth; i++) { 1811 struct ublk_io *io = &ubq->ios[i]; 1812 1813 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) 1814 __ublk_fail_req(ubq, io, io->req); 1815 } 1816 } 1817 1818 static void ublk_start_cancel(struct ublk_device *ub) 1819 { 1820 struct gendisk *disk = ublk_get_disk(ub); 1821 1822 /* Our disk has been dead */ 1823 if (!disk) 1824 return; 1825 1826 mutex_lock(&ub->cancel_mutex); 1827 if (ub->canceling) 1828 goto out; 1829 /* 1830 * Now we are serialized with ublk_queue_rq() 1831 * 1832 * Make sure that ubq->canceling is set when queue is frozen, 1833 * because ublk_queue_rq() has to rely on this flag for avoiding to 1834 * touch completed uring_cmd 1835 */ 1836 blk_mq_quiesce_queue(disk->queue); 1837 ublk_set_canceling(ub, true); 1838 blk_mq_unquiesce_queue(disk->queue); 1839 out: 1840 mutex_unlock(&ub->cancel_mutex); 1841 ublk_put_disk(disk); 1842 } 1843 1844 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, 1845 unsigned int issue_flags) 1846 { 1847 struct ublk_io *io = &ubq->ios[tag]; 1848 struct ublk_device *ub = ubq->dev; 1849 struct request *req; 1850 bool done; 1851 1852 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) 1853 return; 1854 1855 /* 1856 * Don't try to cancel this command if the request is started for 1857 * avoiding race between io_uring_cmd_done() and 1858 * io_uring_cmd_complete_in_task(). 1859 * 1860 * Either the started request will be aborted via __ublk_abort_rq(), 1861 * then this uring_cmd is canceled next time, or it will be done in 1862 * task work function ublk_dispatch_req() because io_uring guarantees 1863 * that ublk_dispatch_req() is always called 1864 */ 1865 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); 1866 if (req && blk_mq_request_started(req) && req->tag == tag) 1867 return; 1868 1869 spin_lock(&ubq->cancel_lock); 1870 done = !!(io->flags & UBLK_IO_FLAG_CANCELED); 1871 if (!done) 1872 io->flags |= UBLK_IO_FLAG_CANCELED; 1873 spin_unlock(&ubq->cancel_lock); 1874 1875 if (!done) 1876 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags); 1877 } 1878 1879 /* 1880 * The ublk char device won't be closed when calling cancel fn, so both 1881 * ublk device and queue are guaranteed to be live 1882 * 1883 * Two-stage cancel: 1884 * 1885 * - make every active uring_cmd done in ->cancel_fn() 1886 * 1887 * - aborting inflight ublk IO requests in ublk char device release handler, 1888 * which depends on 1st stage because device can only be closed iff all 1889 * uring_cmd are done 1890 * 1891 * Do _not_ try to acquire ub->mutex before all inflight requests are 1892 * aborted, otherwise deadlock may be caused. 
1893 */ 1894 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, 1895 unsigned int issue_flags) 1896 { 1897 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1898 struct ublk_queue *ubq = pdu->ubq; 1899 struct task_struct *task; 1900 struct ublk_io *io; 1901 1902 if (WARN_ON_ONCE(!ubq)) 1903 return; 1904 1905 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth)) 1906 return; 1907 1908 task = io_uring_cmd_get_task(cmd); 1909 io = &ubq->ios[pdu->tag]; 1910 if (WARN_ON_ONCE(task && task != io->task)) 1911 return; 1912 1913 ublk_start_cancel(ubq->dev); 1914 1915 WARN_ON_ONCE(io->cmd != cmd); 1916 ublk_cancel_cmd(ubq, pdu->tag, issue_flags); 1917 } 1918 1919 static inline bool ublk_queue_ready(struct ublk_queue *ubq) 1920 { 1921 return ubq->nr_io_ready == ubq->q_depth; 1922 } 1923 1924 static void ublk_cancel_queue(struct ublk_queue *ubq) 1925 { 1926 int i; 1927 1928 for (i = 0; i < ubq->q_depth; i++) 1929 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); 1930 } 1931 1932 /* Cancel all pending commands, must be called after del_gendisk() returns */ 1933 static void ublk_cancel_dev(struct ublk_device *ub) 1934 { 1935 int i; 1936 1937 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1938 ublk_cancel_queue(ublk_get_queue(ub, i)); 1939 } 1940 1941 static bool ublk_check_inflight_rq(struct request *rq, void *data) 1942 { 1943 bool *idle = data; 1944 1945 if (blk_mq_request_started(rq)) { 1946 *idle = false; 1947 return false; 1948 } 1949 return true; 1950 } 1951 1952 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) 1953 { 1954 bool idle; 1955 1956 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); 1957 while (true) { 1958 idle = true; 1959 blk_mq_tagset_busy_iter(&ub->tag_set, 1960 ublk_check_inflight_rq, &idle); 1961 if (idle) 1962 break; 1963 msleep(UBLK_REQUEUE_DELAY_MS); 1964 } 1965 } 1966 1967 static void ublk_force_abort_dev(struct ublk_device *ub) 1968 { 1969 int i; 1970 1971 pr_devel("%s: force abort ub: dev_id %d state %s\n", 1972 __func__, ub->dev_info.dev_id, 1973 ub->dev_info.state == UBLK_S_DEV_LIVE ? 
1974 "LIVE" : "QUIESCED"); 1975 blk_mq_quiesce_queue(ub->ub_disk->queue); 1976 if (ub->dev_info.state == UBLK_S_DEV_LIVE) 1977 ublk_wait_tagset_rqs_idle(ub); 1978 1979 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1980 ublk_get_queue(ub, i)->force_abort = true; 1981 blk_mq_unquiesce_queue(ub->ub_disk->queue); 1982 /* We may have requeued some rqs in ublk_quiesce_queue() */ 1983 blk_mq_kick_requeue_list(ub->ub_disk->queue); 1984 } 1985 1986 static struct gendisk *ublk_detach_disk(struct ublk_device *ub) 1987 { 1988 struct gendisk *disk; 1989 1990 /* Sync with ublk_abort_queue() by holding the lock */ 1991 spin_lock(&ub->lock); 1992 disk = ub->ub_disk; 1993 ub->dev_info.state = UBLK_S_DEV_DEAD; 1994 ub->dev_info.ublksrv_pid = -1; 1995 ub->ub_disk = NULL; 1996 spin_unlock(&ub->lock); 1997 1998 return disk; 1999 } 2000 2001 static void ublk_stop_dev_unlocked(struct ublk_device *ub) 2002 __must_hold(&ub->mutex) 2003 { 2004 struct gendisk *disk; 2005 2006 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 2007 return; 2008 2009 if (ublk_nosrv_dev_should_queue_io(ub)) 2010 ublk_force_abort_dev(ub); 2011 del_gendisk(ub->ub_disk); 2012 disk = ublk_detach_disk(ub); 2013 put_disk(disk); 2014 } 2015 2016 static void ublk_stop_dev(struct ublk_device *ub) 2017 { 2018 mutex_lock(&ub->mutex); 2019 ublk_stop_dev_unlocked(ub); 2020 mutex_unlock(&ub->mutex); 2021 ublk_cancel_dev(ub); 2022 } 2023 2024 /* reset ublk io_uring queue & io flags */ 2025 static void ublk_reset_io_flags(struct ublk_device *ub) 2026 { 2027 int i, j; 2028 2029 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 2030 struct ublk_queue *ubq = ublk_get_queue(ub, i); 2031 2032 /* UBLK_IO_FLAG_CANCELED can be cleared now */ 2033 spin_lock(&ubq->cancel_lock); 2034 for (j = 0; j < ubq->q_depth; j++) 2035 ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; 2036 spin_unlock(&ubq->cancel_lock); 2037 ubq->fail_io = false; 2038 } 2039 mutex_lock(&ub->cancel_mutex); 2040 ublk_set_canceling(ub, false); 2041 mutex_unlock(&ub->cancel_mutex); 2042 } 2043 2044 /* device can only be started after all IOs are ready */ 2045 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) 2046 __must_hold(&ub->mutex) 2047 { 2048 ubq->nr_io_ready++; 2049 if (ublk_queue_ready(ubq)) 2050 ub->nr_queues_ready++; 2051 if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) 2052 ub->unprivileged_daemons = true; 2053 2054 if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { 2055 /* now we are ready for handling ublk io request */ 2056 ublk_reset_io_flags(ub); 2057 complete_all(&ub->completion); 2058 } 2059 } 2060 2061 static inline int ublk_check_cmd_op(u32 cmd_op) 2062 { 2063 u32 ioc_type = _IOC_TYPE(cmd_op); 2064 2065 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') 2066 return -EOPNOTSUPP; 2067 2068 if (ioc_type != 'u' && ioc_type != 0) 2069 return -EOPNOTSUPP; 2070 2071 return 0; 2072 } 2073 2074 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) 2075 { 2076 io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); 2077 2078 if (io->buf.reserved0 || io->buf.reserved1) 2079 return -EINVAL; 2080 2081 if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) 2082 return -EINVAL; 2083 return 0; 2084 } 2085 2086 static int ublk_handle_auto_buf_reg(struct ublk_io *io, 2087 struct io_uring_cmd *cmd, 2088 u16 *buf_idx) 2089 { 2090 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { 2091 io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG; 2092 2093 /* 2094 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` 2095 * and 
`UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same 2096 * `io_ring_ctx`. 2097 * 2098 * If this uring_cmd's io_ring_ctx isn't the same as the 2099 * one used for registering the buffer, it is the ublk server's 2100 * responsibility to unregister the buffer; otherwise 2101 * this ublk request gets stuck. 2102 */ 2103 if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) 2104 *buf_idx = io->buf.index; 2105 } 2106 2107 return ublk_set_auto_buf_reg(io, cmd); 2108 } 2109 2110 /* Once we return, `io->req` can't be used any more */ 2111 static inline struct request * 2112 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) 2113 { 2114 struct request *req = io->req; 2115 2116 io->cmd = cmd; 2117 io->flags |= UBLK_IO_FLAG_ACTIVE; 2118 /* now this cmd slot is owned by ublk driver */ 2119 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; 2120 2121 return req; 2122 } 2123 2124 static inline int 2125 ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io, 2126 struct io_uring_cmd *cmd, unsigned long buf_addr, 2127 u16 *buf_idx) 2128 { 2129 if (ublk_support_auto_buf_reg(ubq)) 2130 return ublk_handle_auto_buf_reg(io, cmd, buf_idx); 2131 2132 io->addr = buf_addr; 2133 return 0; 2134 } 2135 2136 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, 2137 unsigned int issue_flags, 2138 struct ublk_queue *ubq, unsigned int tag) 2139 { 2140 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 2141 2142 /* 2143 * Safe to refer to @ubq since the ublk_queue won't go away until its 2144 * commands are completed 2145 */ 2146 pdu->ubq = ubq; 2147 pdu->tag = tag; 2148 io_uring_cmd_mark_cancelable(cmd, issue_flags); 2149 } 2150 2151 static void ublk_io_release(void *priv) 2152 { 2153 struct request *rq = priv; 2154 struct ublk_queue *ubq = rq->mq_hctx->driver_data; 2155 struct ublk_io *io = &ubq->ios[rq->tag]; 2156 2157 /* 2158 * task_registered_buffers may be 0 if buffers were registered off task 2159 * but unregistered on task, or after UBLK_IO_COMMIT_AND_FETCH_REQ. 2160 */ 2161 if (current == io->task && io->task_registered_buffers) 2162 io->task_registered_buffers--; 2163 else 2164 ublk_put_req_ref(io, rq); 2165 } 2166 2167 static int ublk_register_io_buf(struct io_uring_cmd *cmd, 2168 const struct ublk_queue *ubq, 2169 struct ublk_io *io, 2170 unsigned int index, unsigned int issue_flags) 2171 { 2172 struct ublk_device *ub = cmd->file->private_data; 2173 struct request *req; 2174 int ret; 2175 2176 if (!ublk_support_zero_copy(ubq)) 2177 return -EINVAL; 2178 2179 req = __ublk_check_and_get_req(ub, ubq, io, 0); 2180 if (!req) 2181 return -EINVAL; 2182 2183 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, 2184 issue_flags); 2185 if (ret) { 2186 ublk_put_req_ref(io, req); 2187 return ret; 2188 } 2189 2190 return 0; 2191 } 2192 2193 static int 2194 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, 2195 const struct ublk_queue *ubq, struct ublk_io *io, 2196 unsigned index, unsigned issue_flags) 2197 { 2198 unsigned new_registered_buffers; 2199 struct request *req = io->req; 2200 int ret; 2201 2202 /* 2203 * Ensure there are still references for ublk_sub_req_ref() to release. 2204 * If not, fall back on the thread-safe buffer registration.
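 *
 * Note: on the daemon task the registration is tracked in
 * io->task_registered_buffers rather than by taking a per-buffer request
 * reference; ublk_io_release() running on the same task simply decrements
 * that counter again. Registrations issued from any other task go through
 * ublk_register_io_buf() above, which takes a real request reference.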
2205 */ 2206 new_registered_buffers = io->task_registered_buffers + 1; 2207 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) 2208 return ublk_register_io_buf(cmd, ubq, io, index, issue_flags); 2209 2210 if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) 2211 return -EINVAL; 2212 2213 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, 2214 issue_flags); 2215 if (ret) 2216 return ret; 2217 2218 io->task_registered_buffers = new_registered_buffers; 2219 return 0; 2220 } 2221 2222 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, 2223 const struct ublk_device *ub, 2224 unsigned int index, unsigned int issue_flags) 2225 { 2226 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) 2227 return -EINVAL; 2228 2229 return io_buffer_unregister_bvec(cmd, index, issue_flags); 2230 } 2231 2232 static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr) 2233 { 2234 if (ublk_need_map_io(ubq)) { 2235 /* 2236 * FETCH_RQ has to provide IO buffer if NEED GET 2237 * DATA is not enabled 2238 */ 2239 if (!buf_addr && !ublk_need_get_data(ubq)) 2240 return -EINVAL; 2241 } else if (buf_addr) { 2242 /* User copy requires addr to be unset */ 2243 return -EINVAL; 2244 } 2245 return 0; 2246 } 2247 2248 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, 2249 struct ublk_io *io, __u64 buf_addr) 2250 { 2251 struct ublk_device *ub = ubq->dev; 2252 int ret = 0; 2253 2254 /* 2255 * When handling FETCH command for setting up ublk uring queue, 2256 * ub->mutex is the innermost lock, and we won't block for handling 2257 * FETCH, so it is fine even for IO_URING_F_NONBLOCK. 2258 */ 2259 mutex_lock(&ub->mutex); 2260 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ 2261 if (ublk_queue_ready(ubq)) { 2262 ret = -EBUSY; 2263 goto out; 2264 } 2265 2266 /* allow each command to be FETCHed at most once */ 2267 if (io->flags & UBLK_IO_FLAG_ACTIVE) { 2268 ret = -EINVAL; 2269 goto out; 2270 } 2271 2272 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); 2273 2274 ublk_fill_io_cmd(io, cmd); 2275 ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL); 2276 if (ret) 2277 goto out; 2278 2279 WRITE_ONCE(io->task, get_task_struct(current)); 2280 ublk_mark_io_ready(ub, ubq); 2281 out: 2282 mutex_unlock(&ub->mutex); 2283 return ret; 2284 } 2285 2286 static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq, 2287 struct ublk_io *io, __u64 buf_addr) 2288 { 2289 struct request *req = io->req; 2290 2291 if (ublk_need_map_io(ubq)) { 2292 /* 2293 * COMMIT_AND_FETCH_REQ has to provide IO buffer if 2294 * NEED GET DATA is not enabled or it is Read IO. 2295 */ 2296 if (!buf_addr && (!ublk_need_get_data(ubq) || 2297 req_op(req) == REQ_OP_READ)) 2298 return -EINVAL; 2299 } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { 2300 /* 2301 * User copy requires addr to be unset when command is 2302 * not zone append 2303 */ 2304 return -EINVAL; 2305 } 2306 2307 return 0; 2308 } 2309 2310 static bool ublk_need_complete_req(const struct ublk_queue *ubq, 2311 struct ublk_io *io) 2312 { 2313 if (ublk_need_req_ref(ubq)) 2314 return ublk_sub_req_ref(io); 2315 return true; 2316 } 2317 2318 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, 2319 struct request *req) 2320 { 2321 /* 2322 * We have handled UBLK_IO_NEED_GET_DATA command, 2323 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just 2324 * do the copy work. 
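 *
 * Roughly, the expected flow is: the driver completes the fetched command
 * with UBLK_IO_RES_NEED_GET_DATA for a WRITE request, the server picks a
 * data buffer and issues UBLK_U_IO_NEED_GET_DATA with ->addr pointing at
 * it, and the code below copies the WRITE payload into that buffer before
 * notifying the server again with UBLK_IO_RES_OK.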
2325 */ 2326 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; 2327 /* update iod->addr because ublksrv may have passed a new io buffer */ 2328 ublk_get_iod(ubq, req->tag)->addr = io->addr; 2329 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n", 2330 __func__, ubq->q_id, req->tag, io->flags, 2331 ublk_get_iod(ubq, req->tag)->addr); 2332 2333 return ublk_start_io(ubq, req, io); 2334 } 2335 2336 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, 2337 unsigned int issue_flags, 2338 const struct ublksrv_io_cmd *ub_cmd) 2339 { 2340 u16 buf_idx = UBLK_INVALID_BUF_IDX; 2341 struct ublk_device *ub = cmd->file->private_data; 2342 struct ublk_queue *ubq; 2343 struct ublk_io *io; 2344 u32 cmd_op = cmd->cmd_op; 2345 unsigned tag = ub_cmd->tag; 2346 struct request *req; 2347 int ret; 2348 bool compl; 2349 2350 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", 2351 __func__, cmd->cmd_op, ub_cmd->q_id, tag, 2352 ub_cmd->result); 2353 2354 ret = ublk_check_cmd_op(cmd_op); 2355 if (ret) 2356 goto out; 2357 2358 /* 2359 * io_buffer_unregister_bvec() doesn't access the ubq or io, 2360 * so no need to validate the q_id, tag, or task 2361 */ 2362 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) 2363 return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, 2364 issue_flags); 2365 2366 ret = -EINVAL; 2367 if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) 2368 goto out; 2369 2370 ubq = ublk_get_queue(ub, ub_cmd->q_id); 2371 2372 if (tag >= ubq->q_depth) 2373 goto out; 2374 2375 io = &ubq->ios[tag]; 2376 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ 2377 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { 2378 ret = ublk_check_fetch_buf(ubq, ub_cmd->addr); 2379 if (ret) 2380 goto out; 2381 ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); 2382 if (ret) 2383 goto out; 2384 2385 ublk_prep_cancel(cmd, issue_flags, ubq, tag); 2386 return -EIOCBQUEUED; 2387 } 2388 2389 if (READ_ONCE(io->task) != current) { 2390 /* 2391 * ublk_register_io_buf() accesses only the io's refcount, 2392 * so can be handled on any task 2393 */ 2394 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) 2395 return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, 2396 issue_flags); 2397 2398 goto out; 2399 } 2400 2401 /* there is a pending io cmd, so something must be wrong */ 2402 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) { 2403 ret = -EBUSY; 2404 goto out; 2405 } 2406 2407 /* 2408 * ensure that the user issues UBLK_IO_NEED_GET_DATA 2409 * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
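 *
 * In other words, reject UBLK_IO_NEED_GET_DATA when the driver never
 * asked for it, and reject any other per-io command while a get-data
 * round trip is still pending.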
2410 */ 2411 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) 2412 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) 2413 goto out; 2414 2415 switch (_IOC_NR(cmd_op)) { 2416 case UBLK_IO_REGISTER_IO_BUF: 2417 return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr, 2418 issue_flags); 2419 case UBLK_IO_COMMIT_AND_FETCH_REQ: 2420 ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr); 2421 if (ret) 2422 goto out; 2423 io->res = ub_cmd->result; 2424 req = ublk_fill_io_cmd(io, cmd); 2425 ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx); 2426 compl = ublk_need_complete_req(ubq, io); 2427 2428 /* can't touch 'ublk_io' any more */ 2429 if (buf_idx != UBLK_INVALID_BUF_IDX) 2430 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); 2431 if (req_op(req) == REQ_OP_ZONE_APPEND) 2432 req->__sector = ub_cmd->zone_append_lba; 2433 if (compl) 2434 __ublk_complete_rq(req); 2435 2436 if (ret) 2437 goto out; 2438 break; 2439 case UBLK_IO_NEED_GET_DATA: 2440 /* 2441 * ublk_get_data() may fail and fall back to requeueing, so keep 2442 * the uring_cmd active first and prepare for handling the 2443 * requeued request 2444 */ 2445 req = ublk_fill_io_cmd(io, cmd); 2446 ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL); 2447 WARN_ON_ONCE(ret); 2448 if (likely(ublk_get_data(ubq, io, req))) { 2449 __ublk_prep_compl_io_cmd(io, req); 2450 return UBLK_IO_RES_OK; 2451 } 2452 break; 2453 default: 2454 goto out; 2455 } 2456 ublk_prep_cancel(cmd, issue_flags, ubq, tag); 2457 return -EIOCBQUEUED; 2458 2459 out: 2460 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", 2461 __func__, cmd_op, tag, ret, io->flags); 2462 return ret; 2463 } 2464 2465 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 2466 const struct ublk_queue *ubq, struct ublk_io *io, size_t offset) 2467 { 2468 unsigned tag = io - ubq->ios; 2469 struct request *req; 2470 2471 /* 2472 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, 2473 * which would overwrite it with io->cmd 2474 */ 2475 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); 2476 if (!req) 2477 return NULL; 2478 2479 if (!ublk_get_req_ref(io)) 2480 return NULL; 2481 2482 if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) 2483 goto fail_put; 2484 2485 if (!ublk_rq_has_data(req)) 2486 goto fail_put; 2487 2488 if (offset > blk_rq_bytes(req)) 2489 goto fail_put; 2490 2491 return req; 2492 fail_put: 2493 ublk_put_req_ref(io, req); 2494 return NULL; 2495 } 2496 2497 static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, 2498 unsigned int issue_flags) 2499 { 2500 /* 2501 * Not necessary for async retry, but let's keep it simple and always 2502 * copy the values to avoid any potential reuse.
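 *
 * Also, the SQE data may still live in the ring shared with userspace at
 * this point, so each field is snapshotted once with READ_ONCE() and only
 * the local copy is used afterwards.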
2503 */ 2504 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); 2505 const struct ublksrv_io_cmd ub_cmd = { 2506 .q_id = READ_ONCE(ub_src->q_id), 2507 .tag = READ_ONCE(ub_src->tag), 2508 .result = READ_ONCE(ub_src->result), 2509 .addr = READ_ONCE(ub_src->addr) 2510 }; 2511 2512 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); 2513 2514 return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); 2515 } 2516 2517 static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, 2518 unsigned int issue_flags) 2519 { 2520 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); 2521 2522 if (ret != -EIOCBQUEUED) 2523 io_uring_cmd_done(cmd, ret, 0, issue_flags); 2524 } 2525 2526 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 2527 { 2528 if (unlikely(issue_flags & IO_URING_F_CANCEL)) { 2529 ublk_uring_cmd_cancel_fn(cmd, issue_flags); 2530 return 0; 2531 } 2532 2533 /* well-implemented server won't run into unlocked */ 2534 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { 2535 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb); 2536 return -EIOCBQUEUED; 2537 } 2538 2539 return ublk_ch_uring_cmd_local(cmd, issue_flags); 2540 } 2541 2542 static inline bool ublk_check_ubuf_dir(const struct request *req, 2543 int ubuf_dir) 2544 { 2545 /* copy ubuf to request pages */ 2546 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && 2547 ubuf_dir == ITER_SOURCE) 2548 return true; 2549 2550 /* copy request pages to ubuf */ 2551 if ((req_op(req) == REQ_OP_WRITE || 2552 req_op(req) == REQ_OP_ZONE_APPEND) && 2553 ubuf_dir == ITER_DEST) 2554 return true; 2555 2556 return false; 2557 } 2558 2559 static struct request *ublk_check_and_get_req(struct kiocb *iocb, 2560 struct iov_iter *iter, size_t *off, int dir, 2561 struct ublk_io **io) 2562 { 2563 struct ublk_device *ub = iocb->ki_filp->private_data; 2564 struct ublk_queue *ubq; 2565 struct request *req; 2566 size_t buf_off; 2567 u16 tag, q_id; 2568 2569 if (!ub) 2570 return ERR_PTR(-EACCES); 2571 2572 if (!user_backed_iter(iter)) 2573 return ERR_PTR(-EACCES); 2574 2575 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 2576 return ERR_PTR(-EACCES); 2577 2578 tag = ublk_pos_to_tag(iocb->ki_pos); 2579 q_id = ublk_pos_to_hwq(iocb->ki_pos); 2580 buf_off = ublk_pos_to_buf_off(iocb->ki_pos); 2581 2582 if (q_id >= ub->dev_info.nr_hw_queues) 2583 return ERR_PTR(-EINVAL); 2584 2585 ubq = ublk_get_queue(ub, q_id); 2586 if (!ubq) 2587 return ERR_PTR(-EINVAL); 2588 2589 if (!ublk_support_user_copy(ubq)) 2590 return ERR_PTR(-EACCES); 2591 2592 if (tag >= ubq->q_depth) 2593 return ERR_PTR(-EINVAL); 2594 2595 *io = &ubq->ios[tag]; 2596 req = __ublk_check_and_get_req(ub, ubq, *io, buf_off); 2597 if (!req) 2598 return ERR_PTR(-EINVAL); 2599 2600 if (!req->mq_hctx || !req->mq_hctx->driver_data) 2601 goto fail; 2602 2603 if (!ublk_check_ubuf_dir(req, dir)) 2604 goto fail; 2605 2606 *off = buf_off; 2607 return req; 2608 fail: 2609 ublk_put_req_ref(*io, req); 2610 return ERR_PTR(-EACCES); 2611 } 2612 2613 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) 2614 { 2615 struct request *req; 2616 struct ublk_io *io; 2617 size_t buf_off; 2618 size_t ret; 2619 2620 req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io); 2621 if (IS_ERR(req)) 2622 return PTR_ERR(req); 2623 2624 ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); 2625 ublk_put_req_ref(io, req); 2626 2627 return ret; 2628 } 2629 2630 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) 2631 { 2632 struct request *req; 
2633 struct ublk_io *io; 2634 size_t buf_off; 2635 size_t ret; 2636 2637 req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io); 2638 if (IS_ERR(req)) 2639 return PTR_ERR(req); 2640 2641 ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); 2642 ublk_put_req_ref(io, req); 2643 2644 return ret; 2645 } 2646 2647 static const struct file_operations ublk_ch_fops = { 2648 .owner = THIS_MODULE, 2649 .open = ublk_ch_open, 2650 .release = ublk_ch_release, 2651 .read_iter = ublk_ch_read_iter, 2652 .write_iter = ublk_ch_write_iter, 2653 .uring_cmd = ublk_ch_uring_cmd, 2654 .mmap = ublk_ch_mmap, 2655 }; 2656 2657 static void ublk_deinit_queue(struct ublk_device *ub, int q_id) 2658 { 2659 int size = ublk_queue_cmd_buf_size(ub, q_id); 2660 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 2661 int i; 2662 2663 for (i = 0; i < ubq->q_depth; i++) { 2664 struct ublk_io *io = &ubq->ios[i]; 2665 if (io->task) 2666 put_task_struct(io->task); 2667 WARN_ON_ONCE(refcount_read(&io->ref)); 2668 WARN_ON_ONCE(io->task_registered_buffers); 2669 } 2670 2671 if (ubq->io_cmd_buf) 2672 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); 2673 } 2674 2675 static int ublk_init_queue(struct ublk_device *ub, int q_id) 2676 { 2677 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 2678 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; 2679 void *ptr; 2680 int size; 2681 2682 spin_lock_init(&ubq->cancel_lock); 2683 ubq->flags = ub->dev_info.flags; 2684 ubq->q_id = q_id; 2685 ubq->q_depth = ub->dev_info.queue_depth; 2686 size = ublk_queue_cmd_buf_size(ub, q_id); 2687 2688 ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); 2689 if (!ptr) 2690 return -ENOMEM; 2691 2692 ubq->io_cmd_buf = ptr; 2693 ubq->dev = ub; 2694 return 0; 2695 } 2696 2697 static void ublk_deinit_queues(struct ublk_device *ub) 2698 { 2699 int nr_queues = ub->dev_info.nr_hw_queues; 2700 int i; 2701 2702 if (!ub->__queues) 2703 return; 2704 2705 for (i = 0; i < nr_queues; i++) 2706 ublk_deinit_queue(ub, i); 2707 kvfree(ub->__queues); 2708 } 2709 2710 static int ublk_init_queues(struct ublk_device *ub) 2711 { 2712 int nr_queues = ub->dev_info.nr_hw_queues; 2713 int depth = ub->dev_info.queue_depth; 2714 int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); 2715 int i, ret = -ENOMEM; 2716 2717 ub->queue_size = ubq_size; 2718 ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL); 2719 if (!ub->__queues) 2720 return ret; 2721 2722 for (i = 0; i < nr_queues; i++) { 2723 if (ublk_init_queue(ub, i)) 2724 goto fail; 2725 } 2726 2727 init_completion(&ub->completion); 2728 return 0; 2729 2730 fail: 2731 ublk_deinit_queues(ub); 2732 return ret; 2733 } 2734 2735 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx) 2736 { 2737 int i = idx; 2738 int err; 2739 2740 spin_lock(&ublk_idr_lock); 2741 /* allocate id, if @id >= 0, we're requesting that specific id */ 2742 if (i >= 0) { 2743 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT); 2744 if (err == -ENOSPC) 2745 err = -EEXIST; 2746 } else { 2747 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS, 2748 GFP_NOWAIT); 2749 } 2750 spin_unlock(&ublk_idr_lock); 2751 2752 if (err >= 0) 2753 ub->ub_number = err; 2754 2755 return err; 2756 } 2757 2758 static void ublk_free_dev_number(struct ublk_device *ub) 2759 { 2760 spin_lock(&ublk_idr_lock); 2761 idr_remove(&ublk_index_idr, ub->ub_number); 2762 wake_up_all(&ublk_idr_wq); 2763 spin_unlock(&ublk_idr_lock); 2764 } 2765 2766 static void ublk_cdev_rel(struct device *dev) 2767 { 2768 struct ublk_device 
*ub = container_of(dev, struct ublk_device, cdev_dev); 2769 2770 blk_mq_free_tag_set(&ub->tag_set); 2771 ublk_deinit_queues(ub); 2772 ublk_free_dev_number(ub); 2773 mutex_destroy(&ub->mutex); 2774 mutex_destroy(&ub->cancel_mutex); 2775 kfree(ub); 2776 } 2777 2778 static int ublk_add_chdev(struct ublk_device *ub) 2779 { 2780 struct device *dev = &ub->cdev_dev; 2781 int minor = ub->ub_number; 2782 int ret; 2783 2784 dev->parent = ublk_misc.this_device; 2785 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); 2786 dev->class = &ublk_chr_class; 2787 dev->release = ublk_cdev_rel; 2788 device_initialize(dev); 2789 2790 ret = dev_set_name(dev, "ublkc%d", minor); 2791 if (ret) 2792 goto fail; 2793 2794 cdev_init(&ub->cdev, &ublk_ch_fops); 2795 ret = cdev_device_add(&ub->cdev, dev); 2796 if (ret) 2797 goto fail; 2798 2799 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) 2800 unprivileged_ublks_added++; 2801 return 0; 2802 fail: 2803 put_device(dev); 2804 return ret; 2805 } 2806 2807 /* align max io buffer size with PAGE_SIZE */ 2808 static void ublk_align_max_io_size(struct ublk_device *ub) 2809 { 2810 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes; 2811 2812 ub->dev_info.max_io_buf_bytes = 2813 round_down(max_io_bytes, PAGE_SIZE); 2814 } 2815 2816 static int ublk_add_tag_set(struct ublk_device *ub) 2817 { 2818 ub->tag_set.ops = &ublk_mq_ops; 2819 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; 2820 ub->tag_set.queue_depth = ub->dev_info.queue_depth; 2821 ub->tag_set.numa_node = NUMA_NO_NODE; 2822 ub->tag_set.driver_data = ub; 2823 return blk_mq_alloc_tag_set(&ub->tag_set); 2824 } 2825 2826 static void ublk_remove(struct ublk_device *ub) 2827 { 2828 bool unprivileged; 2829 2830 ublk_stop_dev(ub); 2831 cdev_device_del(&ub->cdev, &ub->cdev_dev); 2832 unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; 2833 ublk_put_device(ub); 2834 2835 if (unprivileged) 2836 unprivileged_ublks_added--; 2837 } 2838 2839 static struct ublk_device *ublk_get_device_from_id(int idx) 2840 { 2841 struct ublk_device *ub = NULL; 2842 2843 if (idx < 0) 2844 return NULL; 2845 2846 spin_lock(&ublk_idr_lock); 2847 ub = idr_find(&ublk_index_idr, idx); 2848 if (ub) 2849 ub = ublk_get_device(ub); 2850 spin_unlock(&ublk_idr_lock); 2851 2852 return ub; 2853 } 2854 2855 static int ublk_ctrl_start_dev(struct ublk_device *ub, 2856 const struct ublksrv_ctrl_cmd *header) 2857 { 2858 const struct ublk_param_basic *p = &ub->params.basic; 2859 int ublksrv_pid = (int)header->data[0]; 2860 struct queue_limits lim = { 2861 .logical_block_size = 1 << p->logical_bs_shift, 2862 .physical_block_size = 1 << p->physical_bs_shift, 2863 .io_min = 1 << p->io_min_shift, 2864 .io_opt = 1 << p->io_opt_shift, 2865 .max_hw_sectors = p->max_sectors, 2866 .chunk_sectors = p->chunk_sectors, 2867 .virt_boundary_mask = p->virt_boundary_mask, 2868 .max_segments = USHRT_MAX, 2869 .max_segment_size = UINT_MAX, 2870 .dma_alignment = 3, 2871 }; 2872 struct gendisk *disk; 2873 int ret = -EINVAL; 2874 2875 if (ublksrv_pid <= 0) 2876 return -EINVAL; 2877 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) 2878 return -EINVAL; 2879 2880 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 2881 const struct ublk_param_discard *pd = &ub->params.discard; 2882 2883 lim.discard_alignment = pd->discard_alignment; 2884 lim.discard_granularity = pd->discard_granularity; 2885 lim.max_hw_discard_sectors = pd->max_discard_sectors; 2886 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; 2887 lim.max_discard_segments = pd->max_discard_segments; 2888 } 2889 
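/*
 * The remaining optional limits below only take effect when the server set
 * the corresponding UBLK_PARAM_TYPE_* via UBLK_CMD_SET_PARAMS before
 * starting the device.
 */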
2890 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { 2891 const struct ublk_param_zoned *p = &ub->params.zoned; 2892 2893 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) 2894 return -EOPNOTSUPP; 2895 2896 lim.features |= BLK_FEAT_ZONED; 2897 lim.max_active_zones = p->max_active_zones; 2898 lim.max_open_zones = p->max_open_zones; 2899 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; 2900 } 2901 2902 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { 2903 lim.features |= BLK_FEAT_WRITE_CACHE; 2904 if (ub->params.basic.attrs & UBLK_ATTR_FUA) 2905 lim.features |= BLK_FEAT_FUA; 2906 } 2907 2908 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) 2909 lim.features |= BLK_FEAT_ROTATIONAL; 2910 2911 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) 2912 lim.dma_alignment = ub->params.dma.alignment; 2913 2914 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { 2915 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask; 2916 lim.max_segment_size = ub->params.seg.max_segment_size; 2917 lim.max_segments = ub->params.seg.max_segments; 2918 } 2919 2920 if (wait_for_completion_interruptible(&ub->completion) != 0) 2921 return -EINTR; 2922 2923 if (ub->ublksrv_tgid != ublksrv_pid) 2924 return -EINVAL; 2925 2926 mutex_lock(&ub->mutex); 2927 if (ub->dev_info.state == UBLK_S_DEV_LIVE || 2928 test_bit(UB_STATE_USED, &ub->state)) { 2929 ret = -EEXIST; 2930 goto out_unlock; 2931 } 2932 2933 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); 2934 if (IS_ERR(disk)) { 2935 ret = PTR_ERR(disk); 2936 goto out_unlock; 2937 } 2938 sprintf(disk->disk_name, "ublkb%d", ub->ub_number); 2939 disk->fops = &ub_fops; 2940 disk->private_data = ub; 2941 2942 ub->dev_info.ublksrv_pid = ublksrv_pid; 2943 ub->ub_disk = disk; 2944 2945 ublk_apply_params(ub); 2946 2947 /* don't probe partitions if any daemon task is un-trusted */ 2948 if (ub->unprivileged_daemons) 2949 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 2950 2951 ublk_get_device(ub); 2952 ub->dev_info.state = UBLK_S_DEV_LIVE; 2953 2954 if (ublk_dev_is_zoned(ub)) { 2955 ret = ublk_revalidate_disk_zones(ub); 2956 if (ret) 2957 goto out_put_cdev; 2958 } 2959 2960 ret = add_disk(disk); 2961 if (ret) 2962 goto out_put_cdev; 2963 2964 set_bit(UB_STATE_USED, &ub->state); 2965 2966 out_put_cdev: 2967 if (ret) { 2968 ublk_detach_disk(ub); 2969 ublk_put_device(ub); 2970 } 2971 if (ret) 2972 put_disk(disk); 2973 out_unlock: 2974 mutex_unlock(&ub->mutex); 2975 return ret; 2976 } 2977 2978 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, 2979 const struct ublksrv_ctrl_cmd *header) 2980 { 2981 void __user *argp = (void __user *)(unsigned long)header->addr; 2982 cpumask_var_t cpumask; 2983 unsigned long queue; 2984 unsigned int retlen; 2985 unsigned int i; 2986 int ret; 2987 2988 if (header->len * BITS_PER_BYTE < nr_cpu_ids) 2989 return -EINVAL; 2990 if (header->len & (sizeof(unsigned long)-1)) 2991 return -EINVAL; 2992 if (!header->addr) 2993 return -EINVAL; 2994 2995 queue = header->data[0]; 2996 if (queue >= ub->dev_info.nr_hw_queues) 2997 return -EINVAL; 2998 2999 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 3000 return -ENOMEM; 3001 3002 for_each_possible_cpu(i) { 3003 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) 3004 cpumask_set_cpu(i, cpumask); 3005 } 3006 3007 ret = -EFAULT; 3008 retlen = min_t(unsigned short, header->len, cpumask_size()); 3009 if (copy_to_user(argp, cpumask, retlen)) 3010 goto out_free_cpumask; 3011 if (retlen != header->len && 3012 clear_user(argp + retlen, header->len - retlen)) 3013 goto out_free_cpumask; 3014 3015 ret 
= 0; 3016 out_free_cpumask: 3017 free_cpumask_var(cpumask); 3018 return ret; 3019 } 3020 3021 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) 3022 { 3023 pr_devel("%s: dev id %d flags %llx\n", __func__, 3024 info->dev_id, info->flags); 3025 pr_devel("\t nr_hw_queues %d queue_depth %d\n", 3026 info->nr_hw_queues, info->queue_depth); 3027 } 3028 3029 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) 3030 { 3031 void __user *argp = (void __user *)(unsigned long)header->addr; 3032 struct ublksrv_ctrl_dev_info info; 3033 struct ublk_device *ub; 3034 int ret = -EINVAL; 3035 3036 if (header->len < sizeof(info) || !header->addr) 3037 return -EINVAL; 3038 if (header->queue_id != (u16)-1) { 3039 pr_warn("%s: queue_id is wrong %x\n", 3040 __func__, header->queue_id); 3041 return -EINVAL; 3042 } 3043 3044 if (copy_from_user(&info, argp, sizeof(info))) 3045 return -EFAULT; 3046 3047 if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || 3048 info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) 3049 return -EINVAL; 3050 3051 if (capable(CAP_SYS_ADMIN)) 3052 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; 3053 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) 3054 return -EPERM; 3055 3056 /* forbid nonsense combinations of recovery flags */ 3057 switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) { 3058 case 0: 3059 case UBLK_F_USER_RECOVERY: 3060 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE): 3061 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO): 3062 break; 3063 default: 3064 pr_warn("%s: invalid recovery flags %llx\n", __func__, 3065 info.flags & UBLK_F_ALL_RECOVERY_FLAGS); 3066 return -EINVAL; 3067 } 3068 3069 if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) { 3070 pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n"); 3071 return -EINVAL; 3072 } 3073 3074 /* 3075 * An unprivileged device can't be trusted, but RECOVERY and 3076 * RECOVERY_REISSUE may still hang error handling, so we can't 3077 * support recovery features for unprivileged ublk now 3078 * 3079 * TODO: provide forward progress for the RECOVERY handler, so that 3080 * unprivileged devices can benefit from it 3081 */ 3082 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { 3083 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | 3084 UBLK_F_USER_RECOVERY); 3085 3086 /* 3087 * For USER_COPY, we depend on userspace to fill the request 3088 * buffer by pwrite() to the ublk char device, which can't be 3089 * used for an unprivileged device 3090 * 3091 * The same applies to zero copy and auto buffer registration. 3092 */ 3093 if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | 3094 UBLK_F_AUTO_BUF_REG)) 3095 return -EINVAL; 3096 } 3097 3098 /* the created device is always owned by the current user */ 3099 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); 3100 3101 if (header->dev_id != info.dev_id) { 3102 pr_warn("%s: dev id not match %u %u\n", 3103 __func__, header->dev_id, info.dev_id); 3104 return -EINVAL; 3105 } 3106 3107 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) { 3108 pr_warn("%s: dev id is too large. Max supported is %d\n", 3109 __func__, UBLK_MAX_UBLKS - 1); 3110 return -EINVAL; 3111 } 3112 3113 ublk_dump_dev_info(&info); 3114 3115 ret = mutex_lock_killable(&ublk_ctl_mutex); 3116 if (ret) 3117 return ret; 3118 3119 ret = -EACCES; 3120 if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) && 3121 unprivileged_ublks_added >= unprivileged_ublks_max) 3122 goto out_unlock; 3123 3124 ret = -ENOMEM; 3125 ub = kzalloc(sizeof(*ub), GFP_KERNEL); 3126 if (!ub) 3127 goto out_unlock; 3128 mutex_init(&ub->mutex); 3129 spin_lock_init(&ub->lock); 3130 mutex_init(&ub->cancel_mutex); 3131 3132 ret = ublk_alloc_dev_number(ub, header->dev_id); 3133 if (ret < 0) 3134 goto out_free_ub; 3135 3136 memcpy(&ub->dev_info, &info, sizeof(info)); 3137 3138 /* update device id */ 3139 ub->dev_info.dev_id = ub->ub_number; 3140 3141 /* 3142 * 64bit flags will be copied back to userspace as the feature 3143 * negotiation result, so we have to clear flags which the driver 3144 * doesn't support yet, then userspace can get the correct flags 3145 * (features) to handle. 3146 */ 3147 ub->dev_info.flags &= UBLK_F_ALL; 3148 3149 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | 3150 UBLK_F_URING_CMD_COMP_IN_TASK | 3151 UBLK_F_PER_IO_DAEMON | 3152 UBLK_F_BUF_REG_OFF_DAEMON; 3153 3154 /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ 3155 if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | 3156 UBLK_F_AUTO_BUF_REG)) 3157 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; 3158 3159 /* 3160 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for 3161 * returning the write_append_lba, which is only allowed in case of 3162 * user copy or zero copy 3163 */ 3164 if (ublk_dev_is_zoned(ub) && 3165 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags & 3166 (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) { 3167 ret = -EINVAL; 3168 goto out_free_dev_number; 3169 } 3170 3171 ub->dev_info.nr_hw_queues = min_t(unsigned int, 3172 ub->dev_info.nr_hw_queues, nr_cpu_ids); 3173 ublk_align_max_io_size(ub); 3174 3175 ret = ublk_init_queues(ub); 3176 if (ret) 3177 goto out_free_dev_number; 3178 3179 ret = ublk_add_tag_set(ub); 3180 if (ret) 3181 goto out_deinit_queues; 3182 3183 ret = -EFAULT; 3184 if (copy_to_user(argp, &ub->dev_info, sizeof(info))) 3185 goto out_free_tag_set; 3186 3187 /* 3188 * Add the char dev so that the ublksrv daemon can be set up. 3189 * ublk_add_chdev() will clean up everything if it fails.
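 *
 * Sketch of the expected sequence on the server side (assuming the
 * standard ublksrv flow): after ADD_DEV returns, the daemon opens
 * /dev/ublkcN, mmaps the per-queue descriptor area, queues one
 * UBLK_U_IO_FETCH_REQ per tag, and finally issues UBLK_U_CMD_START_DEV;
 * START_DEV blocks on ub->completion until ublk_mark_io_ready() has seen
 * every FETCH_REQ.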
3190 */ 3191 ret = ublk_add_chdev(ub); 3192 goto out_unlock; 3193 3194 out_free_tag_set: 3195 blk_mq_free_tag_set(&ub->tag_set); 3196 out_deinit_queues: 3197 ublk_deinit_queues(ub); 3198 out_free_dev_number: 3199 ublk_free_dev_number(ub); 3200 out_free_ub: 3201 mutex_destroy(&ub->mutex); 3202 mutex_destroy(&ub->cancel_mutex); 3203 kfree(ub); 3204 out_unlock: 3205 mutex_unlock(&ublk_ctl_mutex); 3206 return ret; 3207 } 3208 3209 static inline bool ublk_idr_freed(int id) 3210 { 3211 void *ptr; 3212 3213 spin_lock(&ublk_idr_lock); 3214 ptr = idr_find(&ublk_index_idr, id); 3215 spin_unlock(&ublk_idr_lock); 3216 3217 return ptr == NULL; 3218 } 3219 3220 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) 3221 { 3222 struct ublk_device *ub = *p_ub; 3223 int idx = ub->ub_number; 3224 int ret; 3225 3226 ret = mutex_lock_killable(&ublk_ctl_mutex); 3227 if (ret) 3228 return ret; 3229 3230 if (!test_bit(UB_STATE_DELETED, &ub->state)) { 3231 ublk_remove(ub); 3232 set_bit(UB_STATE_DELETED, &ub->state); 3233 } 3234 3235 /* Mark the reference as consumed */ 3236 *p_ub = NULL; 3237 ublk_put_device(ub); 3238 mutex_unlock(&ublk_ctl_mutex); 3239 3240 /* 3241 * Wait until the idr is removed, then it can be reused after the 3242 * DEL_DEV command returns. 3243 * 3244 * If we return because of a user interrupt, a future delete command 3245 * may come: 3246 * 3247 * - the device number isn't freed, this device won't or needn't 3248 * be deleted again, since UB_STATE_DELETED is set, and the device 3249 * will be released after the last reference is dropped 3250 * 3251 * - the device number is freed already, we will not find this 3252 * device via ublk_get_device_from_id() 3253 */ 3254 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) 3255 return -EINTR; 3256 return 0; 3257 } 3258 3259 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) 3260 { 3261 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 3262 3263 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", 3264 __func__, cmd->cmd_op, header->dev_id, header->queue_id, 3265 header->data[0], header->addr, header->len); 3266 } 3267 3268 static int ublk_ctrl_stop_dev(struct ublk_device *ub) 3269 { 3270 ublk_stop_dev(ub); 3271 return 0; 3272 } 3273 3274 static int ublk_ctrl_get_dev_info(struct ublk_device *ub, 3275 const struct ublksrv_ctrl_cmd *header) 3276 { 3277 void __user *argp = (void __user *)(unsigned long)header->addr; 3278 3279 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) 3280 return -EINVAL; 3281 3282 if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) 3283 return -EFAULT; 3284 3285 return 0; 3286 } 3287 3288 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */ 3289 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) 3290 { 3291 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); 3292 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); 3293 3294 if (ub->ub_disk) { 3295 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); 3296 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); 3297 } else { 3298 ub->params.devt.disk_major = 0; 3299 ub->params.devt.disk_minor = 0; 3300 } 3301 ub->params.types |= UBLK_PARAM_TYPE_DEVT; 3302 } 3303 3304 static int ublk_ctrl_get_params(struct ublk_device *ub, 3305 const struct ublksrv_ctrl_cmd *header) 3306 { 3307 void __user *argp = (void __user *)(unsigned long)header->addr; 3308 struct ublk_params_header ph; 3309 int ret; 3310 3311 if (header->len <= sizeof(ph)
|| !header->addr) 3312 return -EINVAL; 3313 3314 if (copy_from_user(&ph, argp, sizeof(ph))) 3315 return -EFAULT; 3316 3317 if (ph.len > header->len || !ph.len) 3318 return -EINVAL; 3319 3320 if (ph.len > sizeof(struct ublk_params)) 3321 ph.len = sizeof(struct ublk_params); 3322 3323 mutex_lock(&ub->mutex); 3324 ublk_ctrl_fill_params_devt(ub); 3325 if (copy_to_user(argp, &ub->params, ph.len)) 3326 ret = -EFAULT; 3327 else 3328 ret = 0; 3329 mutex_unlock(&ub->mutex); 3330 3331 return ret; 3332 } 3333 3334 static int ublk_ctrl_set_params(struct ublk_device *ub, 3335 const struct ublksrv_ctrl_cmd *header) 3336 { 3337 void __user *argp = (void __user *)(unsigned long)header->addr; 3338 struct ublk_params_header ph; 3339 int ret = -EFAULT; 3340 3341 if (header->len <= sizeof(ph) || !header->addr) 3342 return -EINVAL; 3343 3344 if (copy_from_user(&ph, argp, sizeof(ph))) 3345 return -EFAULT; 3346 3347 if (ph.len > header->len || !ph.len || !ph.types) 3348 return -EINVAL; 3349 3350 if (ph.len > sizeof(struct ublk_params)) 3351 ph.len = sizeof(struct ublk_params); 3352 3353 mutex_lock(&ub->mutex); 3354 if (test_bit(UB_STATE_USED, &ub->state)) { 3355 /* 3356 * Parameters can only be changed when the device hasn't 3357 * been started yet 3358 */ 3359 ret = -EACCES; 3360 } else if (copy_from_user(&ub->params, argp, ph.len)) { 3361 ret = -EFAULT; 3362 } else { 3363 /* clear all we don't support yet */ 3364 ub->params.types &= UBLK_PARAM_TYPE_ALL; 3365 ret = ublk_validate_params(ub); 3366 if (ret) 3367 ub->params.types = 0; 3368 } 3369 mutex_unlock(&ub->mutex); 3370 3371 return ret; 3372 } 3373 3374 static int ublk_ctrl_start_recovery(struct ublk_device *ub, 3375 const struct ublksrv_ctrl_cmd *header) 3376 { 3377 int ret = -EINVAL; 3378 3379 mutex_lock(&ub->mutex); 3380 if (ublk_nosrv_should_stop_dev(ub)) 3381 goto out_unlock; 3382 /* 3383 * START_RECOVERY is only allowed after: 3384 * 3385 * (1) UB_STATE_OPEN is not set, which means the dying process has exited 3386 * and the related io_uring ctx is freed, so the file struct of /dev/ublkcX is 3387 * released. 3388 * 3389 * and one of the following holds 3390 * 3391 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: 3392 * (a) has quiesced the request queue 3393 * (b) has requeued every inflight rq whose io_flags is ACTIVE 3394 * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE 3395 * (d) has completed/canceled all ioucmds owned by the dying process 3396 * 3397 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not 3398 * quiesced, but all I/O is being immediately errored 3399 */ 3400 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) { 3401 ret = -EBUSY; 3402 goto out_unlock; 3403 } 3404 pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); 3405 init_completion(&ub->completion); 3406 ret = 0; 3407 out_unlock: 3408 mutex_unlock(&ub->mutex); 3409 return ret; 3410 } 3411 3412 static int ublk_ctrl_end_recovery(struct ublk_device *ub, 3413 const struct ublksrv_ctrl_cmd *header) 3414 { 3415 int ublksrv_pid = (int)header->data[0]; 3416 int ret = -EINVAL; 3417 3418 pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__, 3419 header->dev_id); 3420 3421 if (wait_for_completion_interruptible(&ub->completion)) 3422 return -EINTR; 3423 3424 pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__, 3425 header->dev_id); 3426 3427 if (ub->ublksrv_tgid != ublksrv_pid) 3428 return -EINVAL; 3429 3430 mutex_lock(&ub->mutex); 3431 if (ublk_nosrv_should_stop_dev(ub)) 3432 goto out_unlock; 3433 3434 if (!ublk_dev_in_recoverable_state(ub)) { 3435 ret = -EBUSY; 3436 goto out_unlock; 3437 } 3438 ub->dev_info.ublksrv_pid = ublksrv_pid; 3439 ub->dev_info.state = UBLK_S_DEV_LIVE; 3440 pr_devel("%s: new ublksrv_pid %d, dev id %d\n", 3441 __func__, ublksrv_pid, header->dev_id); 3442 blk_mq_kick_requeue_list(ub->ub_disk->queue); 3443 ret = 0; 3444 out_unlock: 3445 mutex_unlock(&ub->mutex); 3446 return ret; 3447 } 3448 3449 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header) 3450 { 3451 void __user *argp = (void __user *)(unsigned long)header->addr; 3452 u64 features = UBLK_F_ALL; 3453 3454 if (header->len != UBLK_FEATURES_LEN || !header->addr) 3455 return -EINVAL; 3456 3457 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) 3458 return -EFAULT; 3459 3460 return 0; 3461 } 3462 3463 static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) 3464 { 3465 struct ublk_param_basic *p = &ub->params.basic; 3466 u64 new_size = header->data[0]; 3467 3468 mutex_lock(&ub->mutex); 3469 p->dev_sectors = new_size; 3470 set_capacity_and_notify(ub->ub_disk, p->dev_sectors); 3471 mutex_unlock(&ub->mutex); 3472 } 3473 3474 struct count_busy { 3475 const struct ublk_queue *ubq; 3476 unsigned int nr_busy; 3477 }; 3478 3479 static bool ublk_count_busy_req(struct request *rq, void *data) 3480 { 3481 struct count_busy *idle = data; 3482 3483 if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq) 3484 idle->nr_busy += 1; 3485 return true; 3486 } 3487 3488 /* uring_cmd is guaranteed to be active if the associated request is idle */ 3489 static bool ubq_has_idle_io(const struct ublk_queue *ubq) 3490 { 3491 struct count_busy data = { 3492 .ubq = ubq, 3493 }; 3494 3495 blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data); 3496 return data.nr_busy < ubq->q_depth; 3497 } 3498 3499 /* Wait until each hw queue has at least one idle IO */ 3500 static int ublk_wait_for_idle_io(struct ublk_device *ub, 3501 unsigned int timeout_ms) 3502 { 3503 unsigned int elapsed
= 0; 3504 int ret; 3505 3506 while (elapsed < timeout_ms && !signal_pending(current)) { 3507 unsigned int queues_cancelable = 0; 3508 int i; 3509 3510 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 3511 struct ublk_queue *ubq = ublk_get_queue(ub, i); 3512 3513 queues_cancelable += !!ubq_has_idle_io(ubq); 3514 } 3515 3516 /* 3517 * Each queue needs at least one active command for 3518 * notifying the ublk server 3519 */ 3520 if (queues_cancelable == ub->dev_info.nr_hw_queues) 3521 break; 3522 3523 msleep(UBLK_REQUEUE_DELAY_MS); 3524 elapsed += UBLK_REQUEUE_DELAY_MS; 3525 } 3526 3527 if (signal_pending(current)) 3528 ret = -EINTR; 3529 else if (elapsed >= timeout_ms) 3530 ret = -EBUSY; 3531 else 3532 ret = 0; 3533 3534 return ret; 3535 } 3536 3537 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, 3538 const struct ublksrv_ctrl_cmd *header) 3539 { 3540 /* zero means wait forever */ 3541 u64 timeout_ms = header->data[0]; 3542 struct gendisk *disk; 3543 int ret = -ENODEV; 3544 3545 if (!(ub->dev_info.flags & UBLK_F_QUIESCE)) 3546 return -EOPNOTSUPP; 3547 3548 mutex_lock(&ub->mutex); 3549 disk = ublk_get_disk(ub); 3550 if (!disk) 3551 goto unlock; 3552 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 3553 goto put_disk; 3554 3555 ret = 0; 3556 /* already in expected state */ 3557 if (ub->dev_info.state != UBLK_S_DEV_LIVE) 3558 goto put_disk; 3559 3560 /* Mark the device as canceling */ 3561 mutex_lock(&ub->cancel_mutex); 3562 blk_mq_quiesce_queue(disk->queue); 3563 ublk_set_canceling(ub, true); 3564 blk_mq_unquiesce_queue(disk->queue); 3565 mutex_unlock(&ub->cancel_mutex); 3566 3567 if (!timeout_ms) 3568 timeout_ms = UINT_MAX; 3569 ret = ublk_wait_for_idle_io(ub, timeout_ms); 3570 3571 put_disk: 3572 ublk_put_disk(disk); 3573 unlock: 3574 mutex_unlock(&ub->mutex); 3575 3576 /* Cancel pending uring_cmd */ 3577 if (!ret) 3578 ublk_cancel_dev(ub); 3579 return ret; 3580 } 3581 3582 /* 3583 * All control commands are sent via /dev/ublk-control, so we have to check 3584 * the destination device's permission 3585 */ 3586 static int ublk_char_dev_permission(struct ublk_device *ub, 3587 const char *dev_path, int mask) 3588 { 3589 int err; 3590 struct path path; 3591 struct kstat stat; 3592 3593 err = kern_path(dev_path, LOOKUP_FOLLOW, &path); 3594 if (err) 3595 return err; 3596 3597 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); 3598 if (err) 3599 goto exit; 3600 3601 err = -EPERM; 3602 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) 3603 goto exit; 3604 3605 err = inode_permission(&nop_mnt_idmap, 3606 d_backing_inode(path.dentry), mask); 3607 exit: 3608 path_put(&path); 3609 return err; 3610 } 3611 3612 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, 3613 struct io_uring_cmd *cmd) 3614 { 3615 struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe); 3616 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; 3617 void __user *argp = (void __user *)(unsigned long)header->addr; 3618 char *dev_path = NULL; 3619 int ret = 0; 3620 int mask; 3621 3622 if (!unprivileged) { 3623 if (!capable(CAP_SYS_ADMIN)) 3624 return -EPERM; 3625 /* 3626 * The newly added command UBLK_CMD_GET_DEV_INFO2 includes 3627 * char_dev_path in its payload too, since userspace may not 3628 * know whether the specified device was created in unprivileged 3629 * mode.
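 *
 * Hence GET_DEV_INFO2 unconditionally carries the path and falls
 * through to the same path parsing and permission check as commands
 * sent to an unprivileged device.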
3630 */ 3631 if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) 3632 return 0; 3633 } 3634 3635 /* 3636 * User has to provide the char device path for unprivileged ublk 3637 * 3638 * header->addr always points to the dev path buffer, and 3639 * header->dev_path_len records length of dev path buffer. 3640 */ 3641 if (!header->dev_path_len || header->dev_path_len > PATH_MAX) 3642 return -EINVAL; 3643 3644 if (header->len < header->dev_path_len) 3645 return -EINVAL; 3646 3647 dev_path = memdup_user_nul(argp, header->dev_path_len); 3648 if (IS_ERR(dev_path)) 3649 return PTR_ERR(dev_path); 3650 3651 ret = -EINVAL; 3652 switch (_IOC_NR(cmd->cmd_op)) { 3653 case UBLK_CMD_GET_DEV_INFO: 3654 case UBLK_CMD_GET_DEV_INFO2: 3655 case UBLK_CMD_GET_QUEUE_AFFINITY: 3656 case UBLK_CMD_GET_PARAMS: 3657 case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): 3658 mask = MAY_READ; 3659 break; 3660 case UBLK_CMD_START_DEV: 3661 case UBLK_CMD_STOP_DEV: 3662 case UBLK_CMD_ADD_DEV: 3663 case UBLK_CMD_DEL_DEV: 3664 case UBLK_CMD_SET_PARAMS: 3665 case UBLK_CMD_START_USER_RECOVERY: 3666 case UBLK_CMD_END_USER_RECOVERY: 3667 case UBLK_CMD_UPDATE_SIZE: 3668 case UBLK_CMD_QUIESCE_DEV: 3669 mask = MAY_READ | MAY_WRITE; 3670 break; 3671 default: 3672 goto exit; 3673 } 3674 3675 ret = ublk_char_dev_permission(ub, dev_path, mask); 3676 if (!ret) { 3677 header->len -= header->dev_path_len; 3678 header->addr += header->dev_path_len; 3679 } 3680 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", 3681 __func__, ub->ub_number, cmd->cmd_op, 3682 ub->dev_info.owner_uid, ub->dev_info.owner_gid, 3683 dev_path, ret); 3684 exit: 3685 kfree(dev_path); 3686 return ret; 3687 } 3688 3689 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, 3690 unsigned int issue_flags) 3691 { 3692 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 3693 struct ublk_device *ub = NULL; 3694 u32 cmd_op = cmd->cmd_op; 3695 int ret = -EINVAL; 3696 3697 if (issue_flags & IO_URING_F_NONBLOCK) 3698 return -EAGAIN; 3699 3700 ublk_ctrl_cmd_dump(cmd); 3701 3702 if (!(issue_flags & IO_URING_F_SQE128)) 3703 goto out; 3704 3705 ret = ublk_check_cmd_op(cmd_op); 3706 if (ret) 3707 goto out; 3708 3709 if (cmd_op == UBLK_U_CMD_GET_FEATURES) { 3710 ret = ublk_ctrl_get_features(header); 3711 goto out; 3712 } 3713 3714 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { 3715 ret = -ENODEV; 3716 ub = ublk_get_device_from_id(header->dev_id); 3717 if (!ub) 3718 goto out; 3719 3720 ret = ublk_ctrl_uring_cmd_permission(ub, cmd); 3721 if (ret) 3722 goto put_dev; 3723 } 3724 3725 switch (_IOC_NR(cmd_op)) { 3726 case UBLK_CMD_START_DEV: 3727 ret = ublk_ctrl_start_dev(ub, header); 3728 break; 3729 case UBLK_CMD_STOP_DEV: 3730 ret = ublk_ctrl_stop_dev(ub); 3731 break; 3732 case UBLK_CMD_GET_DEV_INFO: 3733 case UBLK_CMD_GET_DEV_INFO2: 3734 ret = ublk_ctrl_get_dev_info(ub, header); 3735 break; 3736 case UBLK_CMD_ADD_DEV: 3737 ret = ublk_ctrl_add_dev(header); 3738 break; 3739 case UBLK_CMD_DEL_DEV: 3740 ret = ublk_ctrl_del_dev(&ub, true); 3741 break; 3742 case UBLK_CMD_DEL_DEV_ASYNC: 3743 ret = ublk_ctrl_del_dev(&ub, false); 3744 break; 3745 case UBLK_CMD_GET_QUEUE_AFFINITY: 3746 ret = ublk_ctrl_get_queue_affinity(ub, header); 3747 break; 3748 case UBLK_CMD_GET_PARAMS: 3749 ret = ublk_ctrl_get_params(ub, header); 3750 break; 3751 case UBLK_CMD_SET_PARAMS: 3752 ret = ublk_ctrl_set_params(ub, header); 3753 break; 3754 case UBLK_CMD_START_USER_RECOVERY: 3755 ret = ublk_ctrl_start_recovery(ub, header); 3756 break; 3757 case UBLK_CMD_END_USER_RECOVERY: 3758 ret = 
ublk_ctrl_end_recovery(ub, header); 3759 break; 3760 case UBLK_CMD_UPDATE_SIZE: 3761 ublk_ctrl_set_size(ub, header); 3762 ret = 0; 3763 break; 3764 case UBLK_CMD_QUIESCE_DEV: 3765 ret = ublk_ctrl_quiesce_dev(ub, header); 3766 break; 3767 default: 3768 ret = -EOPNOTSUPP; 3769 break; 3770 } 3771 3772 put_dev: 3773 if (ub) 3774 ublk_put_device(ub); 3775 out: 3776 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", 3777 __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); 3778 return ret; 3779 } 3780 3781 static const struct file_operations ublk_ctl_fops = { 3782 .open = nonseekable_open, 3783 .uring_cmd = ublk_ctrl_uring_cmd, 3784 .owner = THIS_MODULE, 3785 .llseek = noop_llseek, 3786 }; 3787 3788 static struct miscdevice ublk_misc = { 3789 .minor = MISC_DYNAMIC_MINOR, 3790 .name = "ublk-control", 3791 .fops = &ublk_ctl_fops, 3792 }; 3793 3794 static int __init ublk_init(void) 3795 { 3796 int ret; 3797 3798 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + 3799 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); 3800 BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); 3801 3802 init_waitqueue_head(&ublk_idr_wq); 3803 3804 ret = misc_register(&ublk_misc); 3805 if (ret) 3806 return ret; 3807 3808 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char"); 3809 if (ret) 3810 goto unregister_mis; 3811 3812 ret = class_register(&ublk_chr_class); 3813 if (ret) 3814 goto free_chrdev_region; 3815 3816 return 0; 3817 3818 free_chrdev_region: 3819 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 3820 unregister_mis: 3821 misc_deregister(&ublk_misc); 3822 return ret; 3823 } 3824 3825 static void __exit ublk_exit(void) 3826 { 3827 struct ublk_device *ub; 3828 int id; 3829 3830 idr_for_each_entry(&ublk_index_idr, ub, id) 3831 ublk_remove(ub); 3832 3833 class_unregister(&ublk_chr_class); 3834 misc_deregister(&ublk_misc); 3835 3836 idr_destroy(&ublk_index_idr); 3837 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 3838 } 3839 3840 module_init(ublk_init); 3841 module_exit(ublk_exit); 3842 3843 static int ublk_set_max_unprivileged_ublks(const char *buf, 3844 const struct kernel_param *kp) 3845 { 3846 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); 3847 } 3848 3849 static int ublk_get_max_unprivileged_ublks(char *buf, 3850 const struct kernel_param *kp) 3851 { 3852 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max); 3853 } 3854 3855 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = { 3856 .set = ublk_set_max_unprivileged_ublks, 3857 .get = ublk_get_max_unprivileged_ublks, 3858 }; 3859 3860 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops, 3861 &unprivileged_ublks_max, 0644); 3862 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)"); 3863 3864 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); 3865 MODULE_DESCRIPTION("Userspace block device"); 3866 MODULE_LICENSE("GPL"); 3867