// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - block device whose IO is handled from userspace
 *
 * Takes full advantage of the io_uring passthrough command for communicating
 * with the ublk userspace daemon (ublksrvd) to handle basic IO requests.
 *
 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
 *
 * (part of code stolen from loop.c)
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
#include <linux/io_uring/cmd.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <linux/task_work.h>
#include <linux/namei.h>
#include <linux/kref.h>
#include <uapi/linux/ublk_cmd.h>

#define UBLK_MINORS		(1U << MINORBITS)

/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)

/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
		| UBLK_F_URING_CMD_COMP_IN_TASK \
		| UBLK_F_NEED_GET_DATA \
		| UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED)

/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL                                \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)

struct ublk_rq_data {
	struct llist_node node;

	struct kref ref;
	__u64 sector;
	__u32 operation;
	__u32 nr_zones;
};

struct ublk_uring_cmd_pdu {
	struct ublk_queue *ubq;
	u16 tag;
};

/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by the ublk driver, which is
 * waiting for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command will be completed, and owned by
 * the ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used for
 * cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * IO command is aborted, so this flag is set in case of
 * !UBLK_IO_FLAG_ACTIVE.
 *
 * After this flag is observed, any pending or new incoming request
 * associated with this io command will be failed immediately
 */
#define UBLK_IO_FLAG_ABORTED 0x04

/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires the
 * data buffer address to be fetched from ublksrv.
 *
 * Then, bio data can be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 */
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08

/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED	0x80000000

struct ublk_io {
	/* userspace buffer address from io cmd */
	__u64	addr;
	unsigned int flags;
	int res;

	struct io_uring_cmd *cmd;
};

struct ublk_queue {
	int q_id;
	int q_depth;

	unsigned long flags;
	struct task_struct	*ubq_daemon;
	char *io_cmd_buf;

	struct llist_head	io_cmds;

	unsigned long io_addr;	/* mapped vm address */
	unsigned int max_io_sz;
	bool force_abort;
	bool timeout;
	bool canceling;
	unsigned short nr_io_ready;	/* how many ios setup */
	spinlock_t		cancel_lock;
	struct ublk_device *dev;
	struct ublk_io ios[];
};

struct ublk_device {
	struct gendisk		*ub_disk;

	char	*__queues;

	unsigned int	queue_size;
	struct ublksrv_ctrl_dev_info	dev_info;

	struct blk_mq_tag_set	tag_set;

	struct cdev		cdev;
	struct device		cdev_dev;

#define UB_STATE_OPEN		0
#define UB_STATE_USED		1
#define UB_STATE_DELETED	2
	unsigned long		state;
	int			ub_number;

	struct mutex		mutex;

	spinlock_t		lock;
	struct mm_struct	*mm;

	struct ublk_params	params;

	struct completion	completion;
	unsigned int		nr_queues_ready;
	unsigned int		nr_privileged_daemon;

	struct work_struct	quiesce_work;
	struct work_struct	stop_work;
};

/* header of ublk_params */
struct ublk_params_header {
	__u32	len;
	__u32	types;
};

static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);

static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
						   int tag);
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_USER_COPY;
}

static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_ZONED;
}

static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_ZONED;
}

#ifdef CONFIG_BLK_DEV_ZONED

static int ublk_get_nr_zones(const struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	/* Zone size is a power of 2 */
	return p->dev_sectors >> ilog2(p->chunk_sectors);
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return blk_revalidate_disk_zones(ub->ub_disk);
}

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	const struct ublk_param_zoned *p = &ub->params.zoned;
	int nr_zones;

	if (!ublk_dev_is_zoned(ub))
		return -EINVAL;

	if (!p->max_zone_append_sectors)
		return -EINVAL;

	nr_zones = ublk_get_nr_zones(ub);

	if (p->max_active_zones > nr_zones)
		return -EINVAL;

	if (p->max_open_zones > nr_zones)
		return -EINVAL;

	return 0;
}

static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}

/* Based on virtblk_alloc_report_buffer */
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
				      unsigned int nr_zones, size_t *buflen)
{
	struct request_queue *q = ublk->ub_disk->queue;
	size_t bufsize;
	void *buf;

	nr_zones = min_t(unsigned int, nr_zones,
			 ublk->ub_disk->nr_zones);

	bufsize = nr_zones * sizeof(struct blk_zone);
	bufsize =
		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);

	while (bufsize >= sizeof(struct blk_zone)) {
		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
		if (buf) {
			*buflen = bufsize;
			return buf;
		}
		bufsize >>= 1;
	}

	*buflen = 0;
	return NULL;
}

static int ublk_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct ublk_device *ub = disk->private_data;
	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
	unsigned int done_zones = 0;
	unsigned int max_zones_per_request;
	int ret;
	struct blk_zone *buffer;
	size_t buffer_length;

	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
			 nr_zones);

	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
	if (!buffer)
		return -ENOMEM;

	max_zones_per_request = buffer_length / sizeof(struct blk_zone);

	while (done_zones < nr_zones) {
		unsigned int remaining_zones = nr_zones - done_zones;
		unsigned int zones_in_request =
			min_t(unsigned int, remaining_zones, max_zones_per_request);
		struct request *req;
		struct ublk_rq_data *pdu;
		blk_status_t status;

		memset(buffer, 0, buffer_length);

		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			goto out;
		}

		pdu = blk_mq_rq_to_pdu(req);
		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
		pdu->sector = sector;
		pdu->nr_zones = zones_in_request;

		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
					GFP_KERNEL);
		if (ret) {
			blk_mq_free_request(req);
			goto out;
		}

		status = blk_execute_rq(req, 0);
		ret = blk_status_to_errno(status);
		blk_mq_free_request(req);
		if (ret)
			goto out;

		for (unsigned int i = 0; i < zones_in_request; i++) {
			struct blk_zone *zone = buffer + i;

			/* A zero length zone means no more zones in this response */
			if (!zone->len)
				break;

			ret = cb(zone, i, data);
			if (ret)
				goto out;

			done_zones++;
			sector += zone_size_sectors;

		}
	}

	ret = done_zones;

out:
	kvfree(buffer);
	return ret;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_ZONE_OPEN:
		ublk_op = UBLK_IO_OP_ZONE_OPEN;
		break;
	case REQ_OP_ZONE_CLOSE:
		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
		break;
	case REQ_OP_ZONE_FINISH:
		ublk_op = UBLK_IO_OP_ZONE_FINISH;
		break;
	case REQ_OP_ZONE_RESET:
		ublk_op = UBLK_IO_OP_ZONE_RESET;
		break;
	case REQ_OP_ZONE_APPEND:
		ublk_op = UBLK_IO_OP_ZONE_APPEND;
		break;
	case REQ_OP_ZONE_RESET_ALL:
		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
		break;
	case REQ_OP_DRV_IN:
		ublk_op = pdu->operation;
		switch (ublk_op) {
		case UBLK_IO_OP_REPORT_ZONES:
			iod->op_flags = ublk_op | ublk_req_build_flags(req);
			iod->nr_zones = pdu->nr_zones;
			iod->start_sector = pdu->sector;
			return BLK_STS_OK;
		default:
			return BLK_STS_IOERR;
		}
	case REQ_OP_DRV_OUT:
		/* We do not support drv_out */
		return BLK_STS_NOTSUPP;
	default:
		return BLK_STS_IOERR;
	}

	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->addr;

	return BLK_STS_OK;
}

#else

#define ublk_report_zones (NULL)

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	return BLK_STS_NOTSUPP;
}

#endif

static inline void __ublk_complete_rq(struct request *req);
static void ublk_complete_rq(struct kref *ref);

static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
	.name = "ublk-char",
};

static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

static DEFINE_MUTEX(ublk_ctl_mutex);

/*
 * Max ublk devices allowed to add
 *
 * It can be extended to one per-user limit in future or even controlled
 * by cgroup.
 */
#define UBLK_MAX_UBLKS UBLK_MINORS
static unsigned int ublks_max = 64;
static unsigned int ublks_added;	/* protected by ublk_ctl_mutex */

static struct miscdevice ublk_misc;

static inline unsigned ublk_pos_to_hwq(loff_t pos)
{
	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
		UBLK_QID_BITS_MASK;
}

static inline unsigned ublk_pos_to_buf_off(loff_t pos)
{
	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
}

static inline unsigned ublk_pos_to_tag(loff_t pos)
{
	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
		UBLK_TAG_BITS_MASK;
}

static void ublk_dev_param_basic_apply(struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	if (p->attrs & UBLK_ATTR_READ_ONLY)
		set_disk_ro(ub->ub_disk, true);

	set_capacity(ub->ub_disk, p->dev_sectors);
}

static int ublk_validate_params(const struct ublk_device *ub)
{
	/* basic param is the only one which must be set */
	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
		const struct ublk_param_basic *p = &ub->params.basic;

		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
			return -EINVAL;

		if (p->logical_bs_shift > p->physical_bs_shift)
			return -EINVAL;

		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
			return -EINVAL;

		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
			return -EINVAL;
	} else
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *p = &ub->params.discard;

		/* So far, only support single segment discard */
		if (p->max_discard_sectors && p->max_discard_segments != 1)
			return -EINVAL;

		if (!p->discard_granularity)
			return -EINVAL;
	}

	/* dev_t is read-only */
	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		return ublk_dev_param_zoned_validate(ub);
	else if (ublk_dev_is_zoned(ub))
		return -EINVAL;

	return 0;
}

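/*
 * Worked example for ublk_validate_params() above (illustrative only): a
 * basic parameter set with logical_bs_shift = 9 (512-byte logical blocks)
 * and physical_bs_shift = 12 (4 KiB physical blocks) is accepted, provided
 * max_sectors does not exceed dev_info.max_io_buf_bytes >> 9, i.e. the
 * per-command buffer size expressed in 512-byte sectors.  A zoned device
 * (UBLK_F_ZONED) must additionally set a non-zero chunk_sectors, which
 * ublk_get_nr_zones() treats as the zone size.
 */
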
static void ublk_apply_params(struct ublk_device *ub)
{
	ublk_dev_param_basic_apply(ub);

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		ublk_dev_param_zoned_apply(ub);
}

static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_USER_COPY;
}

static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
	/*
	 * read()/write() is involved in user copy, so request reference
	 * has to be grabbed
	 */
	return ublk_support_user_copy(ubq);
}

static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
		struct request *req)
{
	if (ublk_need_req_ref(ubq)) {
		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);

		kref_init(&data->ref);
	}
}

static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
		struct request *req)
{
	if (ublk_need_req_ref(ubq)) {
		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);

		return kref_get_unless_zero(&data->ref);
	}

	return true;
}

static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
		struct request *req)
{
	if (ublk_need_req_ref(ubq)) {
		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);

		kref_put(&data->ref, ublk_complete_rq);
	} else {
		__ublk_complete_rq(req);
	}
}

static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_NEED_GET_DATA;
}

/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
		return ub;
	return NULL;
}

/* Called in slow path only, keep it noinline for trace purpose */
static noinline void ublk_put_device(struct ublk_device *ub)
{
	put_device(&ub->cdev_dev);
}

static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
		int qid)
{
	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
}

static inline bool ublk_rq_has_data(const struct request *rq)
{
	return bio_has_data(rq->bio);
}

static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
		int tag)
{
	return (struct ublksrv_io_desc *)
		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
}

static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
	return ublk_get_queue(ub, q_id)->io_cmd_buf;
}

static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
			PAGE_SIZE);
}

static inline bool ublk_queue_can_use_recovery_reissue(
		struct ublk_queue *ubq)
{
	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
			(ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
}

static inline bool ublk_queue_can_use_recovery(
		struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_USER_RECOVERY;
}

static inline bool ublk_can_use_recovery(struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
}

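/*
 * Behaviour selected by the recovery helpers above (implemented in
 * __ublk_abort_rq() and __ublk_fail_req() below): with UBLK_F_USER_RECOVERY,
 * requests that cannot reach the daemon are requeued instead of being failed
 * with BLK_STS_IOERR, so a restarted daemon can serve them later; with
 * UBLK_F_USER_RECOVERY_REISSUE on top, even requests that were already
 * dispatched to the dying daemon are requeued rather than completed with an
 * error.
 */
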
static void ublk_free_disk(struct gendisk *disk)
{
	struct ublk_device *ub = disk->private_data;

	clear_bit(UB_STATE_USED, &ub->state);
	ublk_put_device(ub);
}

static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
		unsigned int *owner_gid)
{
	kuid_t uid;
	kgid_t gid;

	current_uid_gid(&uid, &gid);

	*owner_uid = from_kuid(&init_user_ns, uid);
	*owner_gid = from_kgid(&init_user_ns, gid);
}

static int ublk_open(struct gendisk *disk, blk_mode_t mode)
{
	struct ublk_device *ub = disk->private_data;

	if (capable(CAP_SYS_ADMIN))
		return 0;

	/*
	 * If it is an unprivileged device, only the owner can open
	 * the disk.  Otherwise it could be a trap set up by a
	 * malicious user who deliberately grants this disk's
	 * privileges to other users.
	 *
	 * This is also reasonable given that anyone can create an
	 * unprivileged device without needing anyone else's grant.
	 */
	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		unsigned int curr_uid, curr_gid;

		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);

		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
		    ub->dev_info.owner_gid)
			return -EPERM;
	}

	return 0;
}

static const struct block_device_operations ub_fops = {
	.owner =	THIS_MODULE,
	.open =		ublk_open,
	.free_disk =	ublk_free_disk,
	.report_zones =	ublk_report_zones,
};

#define UBLK_MAX_PIN_PAGES	32

struct ublk_io_iter {
	struct page *pages[UBLK_MAX_PIN_PAGES];
	struct bio *bio;
	struct bvec_iter iter;
};

/* copy up to @total bytes between the pinned pages and the bio vecs */
static void ublk_copy_io_pages(struct ublk_io_iter *data,
		size_t total, size_t pg_off, int dir)
{
	unsigned done = 0;
	unsigned pg_idx = 0;

	while (done < total) {
		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
		unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
				(unsigned)(PAGE_SIZE - pg_off));
		void *bv_buf = bvec_kmap_local(&bv);
		void *pg_buf = kmap_local_page(data->pages[pg_idx]);

		if (dir == ITER_DEST)
			memcpy(pg_buf + pg_off, bv_buf, bytes);
		else
			memcpy(bv_buf, pg_buf + pg_off, bytes);

		kunmap_local(pg_buf);
		kunmap_local(bv_buf);

		/* advance page array */
		pg_off += bytes;
		if (pg_off == PAGE_SIZE) {
			pg_idx += 1;
			pg_off = 0;
		}

		done += bytes;

		/* advance bio */
		bio_advance_iter_single(data->bio, &data->iter, bytes);
		if (!data->iter.bi_size) {
			data->bio = data->bio->bi_next;
			if (data->bio == NULL)
				break;
			data->iter = data->bio->bi_iter;
		}
	}
}

static bool ublk_advance_io_iter(const struct request *req,
		struct ublk_io_iter *iter, unsigned int offset)
{
	struct bio *bio = req->bio;

	for_each_bio(bio) {
		if (bio->bi_iter.bi_size > offset) {
			iter->bio = bio;
			iter->iter = bio->bi_iter;
			bio_advance_iter(iter->bio, &iter->iter, offset);
			return true;
		}
		offset -= bio->bi_iter.bi_size;
	}
	return false;
}

/*
 * Copy data between request pages and io_iter; 'offset' is the linear
 * byte offset within the request at which the copy starts.
 */
static size_t ublk_copy_user_pages(const struct request *req,
		unsigned offset, struct iov_iter *uiter, int dir)
{
	struct ublk_io_iter iter;
	size_t done = 0;

	if (!ublk_advance_io_iter(req, &iter, offset))
		return 0;

	while (iov_iter_count(uiter) && iter.bio) {
		unsigned nr_pages;
		ssize_t len;
		size_t off;
		int i;

		len = iov_iter_get_pages2(uiter, iter.pages,
				iov_iter_count(uiter),
				UBLK_MAX_PIN_PAGES, &off);
		if (len <= 0)
			return done;

		ublk_copy_io_pages(&iter, len, off, dir);
		nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
		for (i = 0; i < nr_pages; i++) {
			if (dir == ITER_DEST)
				set_page_dirty(iter.pages[i]);
			put_page(iter.pages[i]);
		}
		done += len;
	}

	return done;
}

static inline bool ublk_need_map_req(const struct request *req)
{
	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
}

static inline bool ublk_need_unmap_req(const struct request *req)
{
	return ublk_rq_has_data(req) &&
	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}

static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (ublk_support_user_copy(ubq))
		return rq_bytes;

	/*
	 * no zero copy, we delay copying WRITE request data into the ublksrv
	 * context, and the big benefit is that pinning pages in the current
	 * context is pretty fast, see ublk_pin_user_pages
	 */
	if (ublk_need_map_req(req)) {
		struct iov_iter iter;
		const int dir = ITER_DEST;

		import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}

static int ublk_unmap_io(const struct ublk_queue *ubq,
		const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (ublk_support_user_copy(ubq))
		return rq_bytes;

	if (ublk_need_unmap_req(req)) {
		struct iov_iter iter;
		const int dir = ITER_SOURCE;

		WARN_ON_ONCE(io->res > rq_bytes);

		import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}

static inline unsigned int ublk_req_build_flags(struct request *req)
{
	unsigned flags = 0;

	if (req->cmd_flags & REQ_FAILFAST_DEV)
		flags |= UBLK_IO_F_FAILFAST_DEV;

	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;

	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
		flags |= UBLK_IO_F_FAILFAST_DRIVER;

	if (req->cmd_flags & REQ_META)
		flags |= UBLK_IO_F_META;

	if (req->cmd_flags & REQ_FUA)
		flags |= UBLK_IO_F_FUA;

	if (req->cmd_flags & REQ_NOUNMAP)
		flags |= UBLK_IO_F_NOUNMAP;

	if (req->cmd_flags & REQ_SWAP)
		flags |= UBLK_IO_F_SWAP;

	return flags;
}

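/*
 * Server-side view of the descriptors filled by ublk_setup_iod() below
 * (illustrative sketch, not driver code): the per-queue array written
 * through ublk_get_iod() is the same buffer that ublk_ch_mmap() maps
 * read-only into the daemon, so the descriptor for a given tag is simply
 *
 *	const struct ublksrv_io_desc *iod =
 *		(const struct ublksrv_io_desc *)cmd_buf + tag;
 *
 * where cmd_buf is the queue's mmap()ed command buffer.  iod->op_flags
 * carries the UBLK_IO_OP_* code OR'ed with the UBLK_IO_F_* flags built by
 * ublk_req_build_flags(), nr_sectors/start_sector describe the LBA range,
 * and iod->addr echoes the buffer address the server passed in (unused in
 * UBLK_F_USER_COPY mode).
 */
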
static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	enum req_op op = req_op(req);
	u32 ublk_op;

	if (!ublk_queue_is_zoned(ubq) &&
	    (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
		return BLK_STS_IOERR;

	switch (req_op(req)) {
	case REQ_OP_READ:
		ublk_op = UBLK_IO_OP_READ;
		break;
	case REQ_OP_WRITE:
		ublk_op = UBLK_IO_OP_WRITE;
		break;
	case REQ_OP_FLUSH:
		ublk_op = UBLK_IO_OP_FLUSH;
		break;
	case REQ_OP_DISCARD:
		ublk_op = UBLK_IO_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
		break;
	default:
		if (ublk_queue_is_zoned(ubq))
			return ublk_setup_iod_zoned(ubq, req);
		return BLK_STS_IOERR;
	}

	/* need to translate since kernel may change */
	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->addr;

	return BLK_STS_OK;
}

static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
}

static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
{
	return ubq->ubq_daemon->flags & PF_EXITING;
}

/* todo: handle partial completion */
static inline void __ublk_complete_rq(struct request *req)
{
	struct ublk_queue *ubq = req->mq_hctx->driver_data;
	struct ublk_io *io = &ubq->ios[req->tag];
	unsigned int unmapped_bytes;
	blk_status_t res = BLK_STS_OK;

	/* called from ublk_abort_queue() code path */
	if (io->flags & UBLK_IO_FLAG_ABORTED) {
		res = BLK_STS_IOERR;
		goto exit;
	}

	/* fail a READ IO if nothing has been read */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		res = errno_to_blk_status(io->res);
		goto exit;
	}

	/*
	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return a byte count,
	 * so end them directly.
	 *
	 * None of them needs to be unmapped.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
	    req_op(req) != REQ_OP_DRV_IN)
		goto exit;

	/* for a READ request, write the data in iod->addr to the rq buffers */
	unmapped_bytes = ublk_unmap_io(ubq, req, io);

	/*
	 * This should be nearly impossible since the data was filled in just
	 * before.
	 *
	 * Re-read simply for this unlikely case.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	if (blk_update_request(req, BLK_STS_OK, io->res))
		blk_mq_requeue_request(req, true);
	else
		__blk_mq_end_request(req, BLK_STS_OK);

	return;
exit:
	blk_mq_end_request(req, res);
}

static void ublk_complete_rq(struct kref *ref)
{
	struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
			ref);
	struct request *req = blk_mq_rq_from_pdu(data);

	__ublk_complete_rq(req);
}

/*
 * Since __ublk_rq_task_work always fails requests immediately during
 * exiting, __ublk_fail_req() is only called from abort context during
 * exiting. So no lock is necessary.
 *
 * Also, aborting may not have started yet; keep in mind that a failed
 * request may be issued by the block layer again.
 */
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
		struct request *req)
{
	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);

	if (ublk_queue_can_use_recovery_reissue(ubq))
		blk_mq_requeue_request(req, false);
	else
		ublk_put_req_ref(ubq, req);
}

static void ubq_complete_io_cmd(struct ublk_io *io, int res,
		unsigned issue_flags)
{
	/* mark this cmd owned by ublksrv */
	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;

	/*
	 * clear ACTIVE since we are done with this sqe/cmd slot.
	 * We can only accept an io cmd when it is not active.
	 */
	io->flags &= ~UBLK_IO_FLAG_ACTIVE;

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(io->cmd, res, 0, issue_flags);
}

#define UBLK_REQUEUE_DELAY_MS	3

static inline void __ublk_abort_rq(struct ublk_queue *ubq,
		struct request *rq)
{
	/* We cannot process this rq so just requeue it. */
	if (ublk_queue_can_use_recovery(ubq))
		blk_mq_requeue_request(rq, false);
	else
		blk_mq_end_request(rq, BLK_STS_IOERR);
}

static inline void __ublk_rq_task_work(struct request *req,
		unsigned issue_flags)
{
	struct ublk_queue *ubq = req->mq_hctx->driver_data;
	int tag = req->tag;
	struct ublk_io *io = &ubq->ios[tag];
	unsigned int mapped_bytes;

	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
			ublk_get_iod(ubq, req->tag)->addr);

	/*
	 * Task is exiting if either:
	 *
	 * (1) current != ubq_daemon.
	 * io_uring_cmd_complete_in_task() tries to run task_work
	 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
	 *
	 * (2) current->flags & PF_EXITING.
	 */
	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
		__ublk_abort_rq(ubq, req);
		return;
	}

	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
		/*
		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
		 * and notify it.
		 */
		if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
			io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
			pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
					__func__, io->cmd->cmd_op, ubq->q_id,
					req->tag, io->flags);
			ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
			return;
		}
		/*
		 * We have handled UBLK_IO_NEED_GET_DATA command,
		 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
		 * do the copy work.
		 */
		io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
		/* update iod->addr because ublksrv may have passed a new io buffer */
		ublk_get_iod(ubq, req->tag)->addr = io->addr;
		pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
				__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
				ublk_get_iod(ubq, req->tag)->addr);
	}

	mapped_bytes = ublk_map_io(ubq, req, io);

	/* partially mapped, update io descriptor */
	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
		/*
		 * Nothing mapped, retry until we succeed.
		 *
		 * We may never succeed in mapping any bytes here because
		 * of OOM. TODO: reserve one buffer with single page pinned
		 * for providing forward progress guarantee.
		 */
		if (unlikely(!mapped_bytes)) {
			blk_mq_requeue_request(req, false);
			blk_mq_delay_kick_requeue_list(req->q,
					UBLK_REQUEUE_DELAY_MS);
			return;
		}

		ublk_get_iod(ubq, req->tag)->nr_sectors =
			mapped_bytes >> 9;
	}

	ublk_init_req_ref(ubq, req);
	ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
}

static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
		unsigned issue_flags)
{
	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
	struct ublk_rq_data *data, *tmp;

	io_cmds = llist_reverse_order(io_cmds);
	llist_for_each_entry_safe(data, tmp, io_cmds, node)
		__ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
}

static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct ublk_queue *ubq = pdu->ubq;

	ublk_forward_io_cmds(ubq, issue_flags);
}

static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);

	if (llist_add(&data->node, &ubq->io_cmds)) {
		struct ublk_io *io = &ubq->ios[rq->tag];

		io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
	}
}

static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
	unsigned int nr_inflight = 0;
	int i;

	if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
		if (!ubq->timeout) {
			send_sig(SIGKILL, ubq->ubq_daemon, 0);
			ubq->timeout = true;
		}

		return BLK_EH_DONE;
	}

	if (!ubq_daemon_is_dying(ubq))
		return BLK_EH_RESET_TIMER;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
			nr_inflight++;
	}

	/* cancelable uring_cmd can't help us if all commands are in-flight */
	if (nr_inflight == ubq->q_depth) {
		struct ublk_device *ub = ubq->dev;

		if (ublk_abort_requests(ub, ubq)) {
			if (ublk_can_use_recovery(ub))
				schedule_work(&ub->quiesce_work);
			else
				schedule_work(&ub->stop_work);
		}
		return BLK_EH_DONE;
	}

	return BLK_EH_RESET_TIMER;
}

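/*
 * Note on the dispatch path used by ublk_queue_rq() below: ublk_queue_cmd()
 * pushes the request's pdu onto the lockless ubq->io_cmds list, and only the
 * push that finds the list empty schedules ublk_rq_task_work_cb() via
 * io_uring_cmd_complete_in_task(), so a single task-work run in the
 * ubq_daemon context can drain a whole batch.  ublk_forward_io_cmds() then
 * uses llist_del_all() + llist_reverse_order() to process the batch in
 * submission order through __ublk_rq_task_work().
 */
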
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct ublk_queue *ubq = hctx->driver_data;
	struct request *rq = bd->rq;
	blk_status_t res;

	/* fill iod to slot in io cmd buffer */
	res = ublk_setup_iod(ubq, rq);
	if (unlikely(res != BLK_STS_OK))
		return BLK_STS_IOERR;

	/* With recovery feature enabled, force_abort is set in
	 * ublk_stop_dev() before calling del_gendisk(). We have to
	 * abort all requeued and new rqs here to let del_gendisk()
	 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
	 * here, to avoid UAF on the io_uring ctx.
	 *
	 * Note: force_abort is guaranteed to be seen because it is set
	 * before the request queue is unquiesced.
	 */
	if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
		return BLK_STS_IOERR;

	if (unlikely(ubq->canceling)) {
		__ublk_abort_rq(ubq, rq);
		return BLK_STS_OK;
	}

	blk_mq_start_request(bd->rq);
	ublk_queue_cmd(ubq, rq);

	return BLK_STS_OK;
}

static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
		unsigned int hctx_idx)
{
	struct ublk_device *ub = driver_data;
	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);

	hctx->driver_data = ubq;
	return 0;
}

static const struct blk_mq_ops ublk_mq_ops = {
	.queue_rq       = ublk_queue_rq,
	.init_hctx	= ublk_init_hctx,
	.timeout	= ublk_timeout,
};

static int ublk_ch_open(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = container_of(inode->i_cdev,
			struct ublk_device, cdev);

	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
		return -EBUSY;
	filp->private_data = ub;
	return 0;
}

static int ublk_ch_release(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = filp->private_data;

	clear_bit(UB_STATE_OPEN, &ub->state);
	return 0;
}

/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct ublk_device *ub = filp->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
	int q_id, ret = 0;

	spin_lock(&ub->lock);
	if (!ub->mm)
		ub->mm = current->mm;
	if (current->mm != ub->mm)
		ret = -EINVAL;
	spin_unlock(&ub->lock);

	if (ret)
		return ret;

	if (vma->vm_flags & VM_WRITE)
		return -EPERM;

	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
		return -EINVAL;

	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
			__func__, q_id, current->pid, vma->vm_start,
			phys_off, (unsigned long)sz);

	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
		return -EINVAL;

	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

static void ublk_commit_completion(struct ublk_device *ub,
		const struct ublksrv_io_cmd *ub_cmd)
{
	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
	struct ublk_io *io = &ubq->ios[tag];
	struct request *req;

	/* now this cmd slot is owned by the ublk driver */
	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
	io->res = ub_cmd->result;

	/* find the io request and complete */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
	if (WARN_ON_ONCE(unlikely(!req)))
		return;

	if (req_op(req) == REQ_OP_ZONE_APPEND)
		req->__sector = ub_cmd->zone_append_lba;

	if (likely(!blk_should_fake_timeout(req->q)))
		ublk_put_req_ref(ubq, req);
}

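/*
 * Zone append note (see ublk_commit_completion() above): when committing a
 * UBLK_IO_OP_ZONE_APPEND request, the server reports the LBA it actually
 * wrote to in ublksrv_io_cmd.zone_append_lba, and the driver stores it in
 * req->__sector so the block layer can return the append location to the
 * submitter.
 */
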
/*
 * Called from ubq_daemon context via the cancel fn, while the ublk blk-mq
 * queue is quiesced; so we are called exclusively with respect to both
 * blk-mq and the ubq_daemon context, and everything is serialized.
 */
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int i;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
			struct request *rq;

			/*
			 * Either we fail the request or ublk_rq_task_work_fn
			 * will do it
			 */
			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
			if (rq && blk_mq_request_started(rq)) {
				io->flags |= UBLK_IO_FLAG_ABORTED;
				__ublk_fail_req(ubq, io, rq);
			}
		}
	}
}

static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
{
	struct gendisk *disk;

	spin_lock(&ubq->cancel_lock);
	if (ubq->canceling) {
		spin_unlock(&ubq->cancel_lock);
		return false;
	}
	ubq->canceling = true;
	spin_unlock(&ubq->cancel_lock);

	spin_lock(&ub->lock);
	disk = ub->ub_disk;
	if (disk)
		get_device(disk_to_dev(disk));
	spin_unlock(&ub->lock);

	/* The disk is already dead */
	if (!disk)
		return false;

	/* Now we are serialized with ublk_queue_rq() */
	blk_mq_quiesce_queue(disk->queue);
	/* abort queue is for making forward progress */
	ublk_abort_queue(ub, ubq);
	blk_mq_unquiesce_queue(disk->queue);
	put_device(disk_to_dev(disk));

	return true;
}

static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
		unsigned int issue_flags)
{
	bool done;

	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
		return;

	spin_lock(&ubq->cancel_lock);
	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
	if (!done)
		io->flags |= UBLK_IO_FLAG_CANCELED;
	spin_unlock(&ubq->cancel_lock);

	if (!done)
		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
}

/*
 * The ublk char device won't be closed when calling cancel fn, so both
 * ublk device and queue are guaranteed to be live
 */
static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct ublk_queue *ubq = pdu->ubq;
	struct task_struct *task;
	struct ublk_device *ub;
	bool need_schedule;
	struct ublk_io *io;

	if (WARN_ON_ONCE(!ubq))
		return;

	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
		return;

	task = io_uring_cmd_get_task(cmd);
	if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
		return;

	ub = ubq->dev;
	need_schedule = ublk_abort_requests(ub, ubq);

	io = &ubq->ios[pdu->tag];
	WARN_ON_ONCE(io->cmd != cmd);
	ublk_cancel_cmd(ubq, io, issue_flags);

	if (need_schedule) {
		if (ublk_can_use_recovery(ub))
			schedule_work(&ub->quiesce_work);
		else
			schedule_work(&ub->stop_work);
	}
}

static inline bool ublk_queue_ready(struct ublk_queue *ubq)
{
	return ubq->nr_io_ready == ubq->q_depth;
}

static void ublk_cancel_queue(struct ublk_queue *ubq)
{
	int i;

	for (i = 0; i < ubq->q_depth; i++)
		ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
}

/* Cancel all pending commands, must be called after del_gendisk() returns */
static void ublk_cancel_dev(struct ublk_device *ub)
{
	int i;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_cancel_queue(ublk_get_queue(ub, i));
}

static bool ublk_check_inflight_rq(struct request *rq, void *data)
{
	bool *idle = data;

	if (blk_mq_request_started(rq)) {
		*idle = false;
		return false;
	}
	return true;
}

static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
{
	bool idle;

	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
	while (true) {
		idle = true;
		blk_mq_tagset_busy_iter(&ub->tag_set,
				ublk_check_inflight_rq, &idle);
		if (idle)
			break;
		msleep(UBLK_REQUEUE_DELAY_MS);
	}
}

static void __ublk_quiesce_dev(struct ublk_device *ub)
{
	pr_devel("%s: quiesce ub: dev_id %d state %s\n",
			__func__, ub->dev_info.dev_id,
			ub->dev_info.state == UBLK_S_DEV_LIVE ?
			"LIVE" : "QUIESCED");
	blk_mq_quiesce_queue(ub->ub_disk->queue);
	ublk_wait_tagset_rqs_idle(ub);
	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
}

static void ublk_quiesce_work_fn(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, quiesce_work);

	mutex_lock(&ub->mutex);
	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
		goto unlock;
	__ublk_quiesce_dev(ub);
unlock:
	mutex_unlock(&ub->mutex);
	ublk_cancel_dev(ub);
}

static void ublk_unquiesce_dev(struct ublk_device *ub)
{
	int i;

	pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
			__func__, ub->dev_info.dev_id,
			ub->dev_info.state == UBLK_S_DEV_LIVE ?
			"LIVE" : "QUIESCED");
	/* quiesce_work has run. We let requeued rqs be aborted
	 * before running fallback_wq. "force_abort" must be seen
	 * after the request queue is unquiesced. Then del_gendisk()
	 * can move on.
	 */
	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_get_queue(ub, i)->force_abort = true;

	blk_mq_unquiesce_queue(ub->ub_disk->queue);
	/* We may have requeued some rqs in ublk_quiesce_queue() */
	blk_mq_kick_requeue_list(ub->ub_disk->queue);
}

static void ublk_stop_dev(struct ublk_device *ub)
{
	struct gendisk *disk;

	mutex_lock(&ub->mutex);
	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		goto unlock;
	if (ublk_can_use_recovery(ub)) {
		if (ub->dev_info.state == UBLK_S_DEV_LIVE)
			__ublk_quiesce_dev(ub);
		ublk_unquiesce_dev(ub);
	}
	del_gendisk(ub->ub_disk);

	/* Sync with ublk_abort_queue() by holding the lock */
	spin_lock(&ub->lock);
	disk = ub->ub_disk;
	ub->dev_info.state = UBLK_S_DEV_DEAD;
	ub->dev_info.ublksrv_pid = -1;
	ub->ub_disk = NULL;
	spin_unlock(&ub->lock);
	put_disk(disk);
unlock:
	mutex_unlock(&ub->mutex);
	ublk_cancel_dev(ub);
}

/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
{
	mutex_lock(&ub->mutex);
	ubq->nr_io_ready++;
	if (ublk_queue_ready(ubq)) {
		ubq->ubq_daemon = current;
		get_task_struct(ubq->ubq_daemon);
		ub->nr_queues_ready++;

		if (capable(CAP_SYS_ADMIN))
			ub->nr_privileged_daemon++;
	}
	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
		complete_all(&ub->completion);
	mutex_unlock(&ub->mutex);
}

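/*
 * Per-tag command lifecycle as driven by the server (illustrative summary,
 * not driver code; the uring_cmd opcodes below are the _IOC_NR values
 * handled by __ublk_ch_uring_cmd()):
 *
 *  1. For every tag of every queue, issue one UBLK_IO_FETCH_REQ uring_cmd on
 *     the /dev/ublkcN char device.  Once all tags are fetched,
 *     ublk_mark_io_ready() above completes ub->completion and the START_DEV
 *     control command (ublk_ctrl_start_dev()) can proceed.
 *  2. A cqe with res == UBLK_IO_RES_OK means the tag's iod now describes a
 *     new request: handle it, then issue UBLK_IO_COMMIT_AND_FETCH_REQ with
 *     ublksrv_io_cmd.result set to the byte count transferred or a negative
 *     errno (see ublk_commit_completion()).
 *  3. With UBLK_F_NEED_GET_DATA, a cqe with res == UBLK_IO_RES_NEED_GET_DATA
 *     asks the server to supply a buffer address via UBLK_IO_NEED_GET_DATA
 *     before the WRITE payload is copied (handled right below).
 */
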
static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
		int tag)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);

	ublk_queue_cmd(ubq, req);
}

static inline int ublk_check_cmd_op(u32 cmd_op)
{
	u32 ioc_type = _IOC_TYPE(cmd_op);

	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
		return -EOPNOTSUPP;

	if (ioc_type != 'u' && ioc_type != 0)
		return -EOPNOTSUPP;

	return 0;
}

static inline void ublk_fill_io_cmd(struct ublk_io *io,
		struct io_uring_cmd *cmd, unsigned long buf_addr)
{
	io->cmd = cmd;
	io->flags |= UBLK_IO_FLAG_ACTIVE;
	io->addr = buf_addr;
}

static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
		unsigned int issue_flags,
		struct ublk_queue *ubq, unsigned int tag)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

	/*
	 * Safe to refer to @ubq since the ublk_queue won't be freed until
	 * its commands are completed
	 */
	pdu->ubq = ubq;
	pdu->tag = tag;
	io_uring_cmd_mark_cancelable(cmd, issue_flags);
}

static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags,
		const struct ublksrv_io_cmd *ub_cmd)
{
	struct ublk_device *ub = cmd->file->private_data;
	struct ublk_queue *ubq;
	struct ublk_io *io;
	u32 cmd_op = cmd->cmd_op;
	unsigned tag = ub_cmd->tag;
	int ret = -EINVAL;
	struct request *req;

	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
			ub_cmd->result);

	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
		goto out;

	ubq = ublk_get_queue(ub, ub_cmd->q_id);
	if (!ubq || ub_cmd->q_id != ubq->q_id)
		goto out;

	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
		goto out;

	if (tag >= ubq->q_depth)
		goto out;

	io = &ubq->ios[tag];

	/* there is pending io cmd, something must be wrong */
	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
		ret = -EBUSY;
		goto out;
	}

	/*
	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
	 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
	 */
	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
		goto out;

	ret = ublk_check_cmd_op(cmd_op);
	if (ret)
		goto out;

	ret = -EINVAL;
	switch (_IOC_NR(cmd_op)) {
	case UBLK_IO_FETCH_REQ:
		/* UBLK_IO_FETCH_REQ is only allowed before the queue is set up */
		if (ublk_queue_ready(ubq)) {
			ret = -EBUSY;
			goto out;
		}
		/*
		 * The io is being handled by server, so COMMIT_RQ is expected
		 * instead of FETCH_REQ
		 */
		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
			goto out;

		if (!ublk_support_user_copy(ubq)) {
			/*
			 * FETCH_RQ has to provide IO buffer if NEED GET
			 * DATA is not enabled
			 */
			if (!ub_cmd->addr && !ublk_need_get_data(ubq))
				goto out;
		} else if (ub_cmd->addr) {
			/* User copy requires addr to be unset */
			ret = -EINVAL;
			goto out;
		}

		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
		ublk_mark_io_ready(ub, ubq);
		break;
	case UBLK_IO_COMMIT_AND_FETCH_REQ:
		req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);

		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			goto out;

		if (!ublk_support_user_copy(ubq)) {
			/*
			 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
			 * NEED GET DATA is not enabled or it is Read IO.
			 */
			if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
						req_op(req) == REQ_OP_READ))
				goto out;
		} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
			/*
			 * User copy requires addr to be unset when command is
			 * not zone append
			 */
			ret = -EINVAL;
			goto out;
		}

		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
		ublk_commit_completion(ub, ub_cmd);
		break;
	case UBLK_IO_NEED_GET_DATA:
		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			goto out;
		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
		ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
		break;
	default:
		goto out;
	}
	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
	return -EIOCBQUEUED;

out:
	io_uring_cmd_done(cmd, ret, 0, issue_flags);
	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
			__func__, cmd_op, tag, ret, io->flags);
	return -EIOCBQUEUED;
}

static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		struct ublk_queue *ubq, int tag, size_t offset)
{
	struct request *req;

	if (!ublk_need_req_ref(ubq))
		return NULL;

	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
	if (!req)
		return NULL;

	if (!ublk_get_req_ref(ubq, req))
		return NULL;

	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
		goto fail_put;

	if (!ublk_rq_has_data(req))
		goto fail_put;

	if (offset > blk_rq_bytes(req))
		goto fail_put;

	return req;
fail_put:
	ublk_put_req_ref(ubq, req);
	return NULL;
}

static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	/*
	 * Not necessary for async retry, but let's keep it simple and always
	 * copy the values to avoid any potential reuse.
	 */
	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
	const struct ublksrv_io_cmd ub_cmd = {
		.q_id = READ_ONCE(ub_src->q_id),
		.tag = READ_ONCE(ub_src->tag),
		.result = READ_ONCE(ub_src->result),
		.addr = READ_ONCE(ub_src->addr)
	};

	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);

	return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
}

static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	ublk_ch_uring_cmd_local(cmd, issue_flags);
}

static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
		return 0;
	}

	/* well-implemented server won't run into unlocked */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
		return -EIOCBQUEUED;
	}

	return ublk_ch_uring_cmd_local(cmd, issue_flags);
}

static inline bool ublk_check_ubuf_dir(const struct request *req,
		int ubuf_dir)
{
	/* copy ubuf to request pages */
	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
	    ubuf_dir == ITER_SOURCE)
		return true;

	/* copy request pages to ubuf */
	if ((req_op(req) == REQ_OP_WRITE ||
	     req_op(req) == REQ_OP_ZONE_APPEND) &&
	    ubuf_dir == ITER_DEST)
		return true;

	return false;
}

static struct request *ublk_check_and_get_req(struct kiocb *iocb,
		struct iov_iter *iter, size_t *off, int dir)
{
	struct ublk_device *ub = iocb->ki_filp->private_data;
	struct ublk_queue *ubq;
	struct request *req;
	size_t buf_off;
	u16 tag, q_id;

	if (!ub)
		return ERR_PTR(-EACCES);

	if (!user_backed_iter(iter))
		return ERR_PTR(-EACCES);

	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		return ERR_PTR(-EACCES);

	tag = ublk_pos_to_tag(iocb->ki_pos);
	q_id = ublk_pos_to_hwq(iocb->ki_pos);
	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);

	if (q_id >= ub->dev_info.nr_hw_queues)
		return ERR_PTR(-EINVAL);

	ubq = ublk_get_queue(ub, q_id);
	if (!ubq)
		return ERR_PTR(-EINVAL);

	if (tag >= ubq->q_depth)
		return ERR_PTR(-EINVAL);

	req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
	if (!req)
		return ERR_PTR(-EINVAL);

	if (!req->mq_hctx || !req->mq_hctx->driver_data)
		goto fail;

	if (!ublk_check_ubuf_dir(req, dir))
		goto fail;

	*off = buf_off;
	return req;
fail:
	ublk_put_req_ref(ubq, req);
	return ERR_PTR(-EACCES);
}

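/*
 * UBLK_F_USER_COPY data path: the server moves request payload through
 * pread()/pwrite() on the char device instead of a pre-registered buffer.
 * Illustrative userspace sketch (not driver code; it simply inverts the
 * ublk_pos_to_*() decoding above using the same uapi constants):
 *
 *	__u64 pos = UBLKSRV_IO_BUF_OFFSET +
 *		    ((__u64)q_id << UBLK_QID_OFF) +
 *		    ((__u64)tag << UBLK_TAG_OFF) + byte_offset;
 *	pread(ublkc_fd, buf, len, pos);     // read a WRITE request's payload
 *	pwrite(ublkc_fd, buf, len, pos);    // fill a READ request's payload
 *
 * ublk_ch_read_iter()/ublk_ch_write_iter() below resolve such a position
 * back to (queue, tag, offset) and copy against the request's bio pages,
 * holding a request reference for the duration of the copy.
 */
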
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct ublk_queue *ubq;
	struct request *req;
	size_t buf_off;
	size_t ret;

	req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
	if (IS_ERR(req))
		return PTR_ERR(req);

	ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
	ubq = req->mq_hctx->driver_data;
	ublk_put_req_ref(ubq, req);

	return ret;
}

static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct ublk_queue *ubq;
	struct request *req;
	size_t buf_off;
	size_t ret;

	req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
	if (IS_ERR(req))
		return PTR_ERR(req);

	ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
	ubq = req->mq_hctx->driver_data;
	ublk_put_req_ref(ubq, req);

	return ret;
}

static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.llseek = no_llseek,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
	int size = ublk_queue_cmd_buf_size(ub, q_id);
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	if (ubq->ubq_daemon)
		put_task_struct(ubq->ubq_daemon);
	if (ubq->io_cmd_buf)
		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
}

static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
	void *ptr;
	int size;

	spin_lock_init(&ubq->cancel_lock);
	ubq->flags = ub->dev_info.flags;
	ubq->q_id = q_id;
	ubq->q_depth = ub->dev_info.queue_depth;
	size = ublk_queue_cmd_buf_size(ub, q_id);

	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
	if (!ptr)
		return -ENOMEM;

	ubq->io_cmd_buf = ptr;
	ubq->dev = ub;
	return 0;
}

static void ublk_deinit_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int i;

	if (!ub->__queues)
		return;

	for (i = 0; i < nr_queues; i++)
		ublk_deinit_queue(ub, i);
	kfree(ub->__queues);
}

static int ublk_init_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int depth = ub->dev_info.queue_depth;
	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
	int i, ret = -ENOMEM;

	ub->queue_size = ubq_size;
	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
	if (!ub->__queues)
		return ret;

	for (i = 0; i < nr_queues; i++) {
		if (ublk_init_queue(ub, i))
			goto fail;
	}

	init_completion(&ub->completion);
	return 0;

fail:
	ublk_deinit_queues(ub);
	return ret;
}

static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
{
	int i = idx;
	int err;

	spin_lock(&ublk_idr_lock);
	/* allocate id, if @id >= 0, we're requesting that specific id */
	if (i >= 0) {
		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
				GFP_NOWAIT);
	}
	spin_unlock(&ublk_idr_lock);

	if (err >= 0)
		ub->ub_number = err;

	return err;
}

static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}

static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	kfree(ub);
}

static int ublk_add_chdev(struct ublk_device *ub)
{
device *dev = &ub->cdev_dev; 2079 int minor = ub->ub_number; 2080 int ret; 2081 2082 dev->parent = ublk_misc.this_device; 2083 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); 2084 dev->class = &ublk_chr_class; 2085 dev->release = ublk_cdev_rel; 2086 device_initialize(dev); 2087 2088 ret = dev_set_name(dev, "ublkc%d", minor); 2089 if (ret) 2090 goto fail; 2091 2092 cdev_init(&ub->cdev, &ublk_ch_fops); 2093 ret = cdev_device_add(&ub->cdev, dev); 2094 if (ret) 2095 goto fail; 2096 2097 ublks_added++; 2098 return 0; 2099 fail: 2100 put_device(dev); 2101 return ret; 2102 } 2103 2104 static void ublk_stop_work_fn(struct work_struct *work) 2105 { 2106 struct ublk_device *ub = 2107 container_of(work, struct ublk_device, stop_work); 2108 2109 ublk_stop_dev(ub); 2110 } 2111 2112 /* align max io buffer size with PAGE_SIZE */ 2113 static void ublk_align_max_io_size(struct ublk_device *ub) 2114 { 2115 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes; 2116 2117 ub->dev_info.max_io_buf_bytes = 2118 round_down(max_io_bytes, PAGE_SIZE); 2119 } 2120 2121 static int ublk_add_tag_set(struct ublk_device *ub) 2122 { 2123 ub->tag_set.ops = &ublk_mq_ops; 2124 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; 2125 ub->tag_set.queue_depth = ub->dev_info.queue_depth; 2126 ub->tag_set.numa_node = NUMA_NO_NODE; 2127 ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); 2128 ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 2129 ub->tag_set.driver_data = ub; 2130 return blk_mq_alloc_tag_set(&ub->tag_set); 2131 } 2132 2133 static void ublk_remove(struct ublk_device *ub) 2134 { 2135 ublk_stop_dev(ub); 2136 cancel_work_sync(&ub->stop_work); 2137 cancel_work_sync(&ub->quiesce_work); 2138 cdev_device_del(&ub->cdev, &ub->cdev_dev); 2139 ublk_put_device(ub); 2140 ublks_added--; 2141 } 2142 2143 static struct ublk_device *ublk_get_device_from_id(int idx) 2144 { 2145 struct ublk_device *ub = NULL; 2146 2147 if (idx < 0) 2148 return NULL; 2149 2150 spin_lock(&ublk_idr_lock); 2151 ub = idr_find(&ublk_index_idr, idx); 2152 if (ub) 2153 ub = ublk_get_device(ub); 2154 spin_unlock(&ublk_idr_lock); 2155 2156 return ub; 2157 } 2158 2159 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) 2160 { 2161 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2162 const struct ublk_param_basic *p = &ub->params.basic; 2163 int ublksrv_pid = (int)header->data[0]; 2164 struct queue_limits lim = { 2165 .logical_block_size = 1 << p->logical_bs_shift, 2166 .physical_block_size = 1 << p->physical_bs_shift, 2167 .io_min = 1 << p->io_min_shift, 2168 .io_opt = 1 << p->io_opt_shift, 2169 .max_hw_sectors = p->max_sectors, 2170 .chunk_sectors = p->chunk_sectors, 2171 .virt_boundary_mask = p->virt_boundary_mask, 2172 .max_segments = USHRT_MAX, 2173 .max_segment_size = UINT_MAX, 2174 .dma_alignment = 3, 2175 }; 2176 struct gendisk *disk; 2177 int ret = -EINVAL; 2178 2179 if (ublksrv_pid <= 0) 2180 return -EINVAL; 2181 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) 2182 return -EINVAL; 2183 2184 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 2185 const struct ublk_param_discard *pd = &ub->params.discard; 2186 2187 lim.discard_alignment = pd->discard_alignment; 2188 lim.discard_granularity = pd->discard_granularity; 2189 lim.max_hw_discard_sectors = pd->max_discard_sectors; 2190 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; 2191 lim.max_discard_segments = pd->max_discard_segments; 2192 } 2193 2194 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { 2195 const struct ublk_param_zoned *p = 
&ub->params.zoned; 2196 2197 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) 2198 return -EOPNOTSUPP; 2199 2200 lim.features |= BLK_FEAT_ZONED; 2201 lim.max_active_zones = p->max_active_zones; 2202 lim.max_open_zones = p->max_open_zones; 2203 lim.max_zone_append_sectors = p->max_zone_append_sectors; 2204 } 2205 2206 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { 2207 lim.features |= BLK_FEAT_WRITE_CACHE; 2208 if (ub->params.basic.attrs & UBLK_ATTR_FUA) 2209 lim.features |= BLK_FEAT_FUA; 2210 } 2211 2212 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) 2213 lim.features |= BLK_FEAT_ROTATIONAL; 2214 2215 if (wait_for_completion_interruptible(&ub->completion) != 0) 2216 return -EINTR; 2217 2218 mutex_lock(&ub->mutex); 2219 if (ub->dev_info.state == UBLK_S_DEV_LIVE || 2220 test_bit(UB_STATE_USED, &ub->state)) { 2221 ret = -EEXIST; 2222 goto out_unlock; 2223 } 2224 2225 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); 2226 if (IS_ERR(disk)) { 2227 ret = PTR_ERR(disk); 2228 goto out_unlock; 2229 } 2230 sprintf(disk->disk_name, "ublkb%d", ub->ub_number); 2231 disk->fops = &ub_fops; 2232 disk->private_data = ub; 2233 2234 ub->dev_info.ublksrv_pid = ublksrv_pid; 2235 ub->ub_disk = disk; 2236 2237 ublk_apply_params(ub); 2238 2239 /* don't probe partitions if any one ubq daemon is un-trusted */ 2240 if (ub->nr_privileged_daemon != ub->nr_queues_ready) 2241 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 2242 2243 ublk_get_device(ub); 2244 ub->dev_info.state = UBLK_S_DEV_LIVE; 2245 2246 if (ublk_dev_is_zoned(ub)) { 2247 ret = ublk_revalidate_disk_zones(ub); 2248 if (ret) 2249 goto out_put_cdev; 2250 } 2251 2252 ret = add_disk(disk); 2253 if (ret) 2254 goto out_put_cdev; 2255 2256 set_bit(UB_STATE_USED, &ub->state); 2257 2258 out_put_cdev: 2259 if (ret) { 2260 ub->dev_info.state = UBLK_S_DEV_DEAD; 2261 ublk_put_device(ub); 2262 } 2263 if (ret) 2264 put_disk(disk); 2265 out_unlock: 2266 mutex_unlock(&ub->mutex); 2267 return ret; 2268 } 2269 2270 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, 2271 struct io_uring_cmd *cmd) 2272 { 2273 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2274 void __user *argp = (void __user *)(unsigned long)header->addr; 2275 cpumask_var_t cpumask; 2276 unsigned long queue; 2277 unsigned int retlen; 2278 unsigned int i; 2279 int ret; 2280 2281 if (header->len * BITS_PER_BYTE < nr_cpu_ids) 2282 return -EINVAL; 2283 if (header->len & (sizeof(unsigned long)-1)) 2284 return -EINVAL; 2285 if (!header->addr) 2286 return -EINVAL; 2287 2288 queue = header->data[0]; 2289 if (queue >= ub->dev_info.nr_hw_queues) 2290 return -EINVAL; 2291 2292 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 2293 return -ENOMEM; 2294 2295 for_each_possible_cpu(i) { 2296 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) 2297 cpumask_set_cpu(i, cpumask); 2298 } 2299 2300 ret = -EFAULT; 2301 retlen = min_t(unsigned short, header->len, cpumask_size()); 2302 if (copy_to_user(argp, cpumask, retlen)) 2303 goto out_free_cpumask; 2304 if (retlen != header->len && 2305 clear_user(argp + retlen, header->len - retlen)) 2306 goto out_free_cpumask; 2307 2308 ret = 0; 2309 out_free_cpumask: 2310 free_cpumask_var(cpumask); 2311 return ret; 2312 } 2313 2314 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) 2315 { 2316 pr_devel("%s: dev id %d flags %llx\n", __func__, 2317 info->dev_id, info->flags); 2318 pr_devel("\t nr_hw_queues %d queue_depth %d\n", 2319 info->nr_hw_queues, info->queue_depth); 2320 } 2321 2322 static int ublk_ctrl_add_dev(struct 
io_uring_cmd *cmd) 2323 { 2324 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2325 void __user *argp = (void __user *)(unsigned long)header->addr; 2326 struct ublksrv_ctrl_dev_info info; 2327 struct ublk_device *ub; 2328 int ret = -EINVAL; 2329 2330 if (header->len < sizeof(info) || !header->addr) 2331 return -EINVAL; 2332 if (header->queue_id != (u16)-1) { 2333 pr_warn("%s: queue_id is wrong %x\n", 2334 __func__, header->queue_id); 2335 return -EINVAL; 2336 } 2337 2338 if (copy_from_user(&info, argp, sizeof(info))) 2339 return -EFAULT; 2340 2341 if (capable(CAP_SYS_ADMIN)) 2342 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; 2343 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) 2344 return -EPERM; 2345 2346 /* 2347 * unprivileged device can't be trusted, but RECOVERY and 2348 * RECOVERY_REISSUE still may hang error handling, so can't 2349 * support recovery features for unprivileged ublk now 2350 * 2351 * TODO: provide forward progress for RECOVERY handler, so that 2352 * unprivileged device can benefit from it 2353 */ 2354 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) 2355 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | 2356 UBLK_F_USER_RECOVERY); 2357 2358 /* the created device is always owned by current user */ 2359 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); 2360 2361 if (header->dev_id != info.dev_id) { 2362 pr_warn("%s: dev id not match %u %u\n", 2363 __func__, header->dev_id, info.dev_id); 2364 return -EINVAL; 2365 } 2366 2367 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) { 2368 pr_warn("%s: dev id is too large. Max supported is %d\n", 2369 __func__, UBLK_MAX_UBLKS - 1); 2370 return -EINVAL; 2371 } 2372 2373 ublk_dump_dev_info(&info); 2374 2375 ret = mutex_lock_killable(&ublk_ctl_mutex); 2376 if (ret) 2377 return ret; 2378 2379 ret = -EACCES; 2380 if (ublks_added >= ublks_max) 2381 goto out_unlock; 2382 2383 ret = -ENOMEM; 2384 ub = kzalloc(sizeof(*ub), GFP_KERNEL); 2385 if (!ub) 2386 goto out_unlock; 2387 mutex_init(&ub->mutex); 2388 spin_lock_init(&ub->lock); 2389 INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); 2390 INIT_WORK(&ub->stop_work, ublk_stop_work_fn); 2391 2392 ret = ublk_alloc_dev_number(ub, header->dev_id); 2393 if (ret < 0) 2394 goto out_free_ub; 2395 2396 memcpy(&ub->dev_info, &info, sizeof(info)); 2397 2398 /* update device id */ 2399 ub->dev_info.dev_id = ub->ub_number; 2400 2401 /* 2402 * 64bit flags will be copied back to userspace as feature 2403 * negotiation result, so have to clear flags which driver 2404 * doesn't support yet, then userspace can get correct flags 2405 * (features) to handle. 
2406 */ 2407 ub->dev_info.flags &= UBLK_F_ALL; 2408 2409 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | 2410 UBLK_F_URING_CMD_COMP_IN_TASK; 2411 2412 /* GET_DATA isn't needed any more with USER_COPY */ 2413 if (ublk_dev_is_user_copy(ub)) 2414 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; 2415 2416 /* Zoned storage support requires user copy feature */ 2417 if (ublk_dev_is_zoned(ub) && 2418 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { 2419 ret = -EINVAL; 2420 goto out_free_dev_number; 2421 } 2422 2423 /* We are not ready to support zero copy */ 2424 ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; 2425 2426 ub->dev_info.nr_hw_queues = min_t(unsigned int, 2427 ub->dev_info.nr_hw_queues, nr_cpu_ids); 2428 ublk_align_max_io_size(ub); 2429 2430 ret = ublk_init_queues(ub); 2431 if (ret) 2432 goto out_free_dev_number; 2433 2434 ret = ublk_add_tag_set(ub); 2435 if (ret) 2436 goto out_deinit_queues; 2437 2438 ret = -EFAULT; 2439 if (copy_to_user(argp, &ub->dev_info, sizeof(info))) 2440 goto out_free_tag_set; 2441 2442 /* 2443 * Add the char dev so that ublksrv daemon can be setup. 2444 * ublk_add_chdev() will cleanup everything if it fails. 2445 */ 2446 ret = ublk_add_chdev(ub); 2447 goto out_unlock; 2448 2449 out_free_tag_set: 2450 blk_mq_free_tag_set(&ub->tag_set); 2451 out_deinit_queues: 2452 ublk_deinit_queues(ub); 2453 out_free_dev_number: 2454 ublk_free_dev_number(ub); 2455 out_free_ub: 2456 mutex_destroy(&ub->mutex); 2457 kfree(ub); 2458 out_unlock: 2459 mutex_unlock(&ublk_ctl_mutex); 2460 return ret; 2461 } 2462 2463 static inline bool ublk_idr_freed(int id) 2464 { 2465 void *ptr; 2466 2467 spin_lock(&ublk_idr_lock); 2468 ptr = idr_find(&ublk_index_idr, id); 2469 spin_unlock(&ublk_idr_lock); 2470 2471 return ptr == NULL; 2472 } 2473 2474 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) 2475 { 2476 struct ublk_device *ub = *p_ub; 2477 int idx = ub->ub_number; 2478 int ret; 2479 2480 ret = mutex_lock_killable(&ublk_ctl_mutex); 2481 if (ret) 2482 return ret; 2483 2484 if (!test_bit(UB_STATE_DELETED, &ub->state)) { 2485 ublk_remove(ub); 2486 set_bit(UB_STATE_DELETED, &ub->state); 2487 } 2488 2489 /* Mark the reference as consumed */ 2490 *p_ub = NULL; 2491 ublk_put_device(ub); 2492 mutex_unlock(&ublk_ctl_mutex); 2493 2494 /* 2495 * Wait until the idr is removed, then it can be reused after 2496 * DEL_DEV command is returned. 
2497 * 2498 * If we returns because of user interrupt, future delete command 2499 * may come: 2500 * 2501 * - the device number isn't freed, this device won't or needn't 2502 * be deleted again, since UB_STATE_DELETED is set, and device 2503 * will be released after the last reference is dropped 2504 * 2505 * - the device number is freed already, we will not find this 2506 * device via ublk_get_device_from_id() 2507 */ 2508 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) 2509 return -EINTR; 2510 return 0; 2511 } 2512 2513 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) 2514 { 2515 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2516 2517 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", 2518 __func__, cmd->cmd_op, header->dev_id, header->queue_id, 2519 header->data[0], header->addr, header->len); 2520 } 2521 2522 static int ublk_ctrl_stop_dev(struct ublk_device *ub) 2523 { 2524 ublk_stop_dev(ub); 2525 cancel_work_sync(&ub->stop_work); 2526 cancel_work_sync(&ub->quiesce_work); 2527 2528 return 0; 2529 } 2530 2531 static int ublk_ctrl_get_dev_info(struct ublk_device *ub, 2532 struct io_uring_cmd *cmd) 2533 { 2534 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2535 void __user *argp = (void __user *)(unsigned long)header->addr; 2536 2537 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) 2538 return -EINVAL; 2539 2540 if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) 2541 return -EFAULT; 2542 2543 return 0; 2544 } 2545 2546 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */ 2547 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) 2548 { 2549 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); 2550 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); 2551 2552 if (ub->ub_disk) { 2553 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); 2554 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); 2555 } else { 2556 ub->params.devt.disk_major = 0; 2557 ub->params.devt.disk_minor = 0; 2558 } 2559 ub->params.types |= UBLK_PARAM_TYPE_DEVT; 2560 } 2561 2562 static int ublk_ctrl_get_params(struct ublk_device *ub, 2563 struct io_uring_cmd *cmd) 2564 { 2565 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2566 void __user *argp = (void __user *)(unsigned long)header->addr; 2567 struct ublk_params_header ph; 2568 int ret; 2569 2570 if (header->len <= sizeof(ph) || !header->addr) 2571 return -EINVAL; 2572 2573 if (copy_from_user(&ph, argp, sizeof(ph))) 2574 return -EFAULT; 2575 2576 if (ph.len > header->len || !ph.len) 2577 return -EINVAL; 2578 2579 if (ph.len > sizeof(struct ublk_params)) 2580 ph.len = sizeof(struct ublk_params); 2581 2582 mutex_lock(&ub->mutex); 2583 ublk_ctrl_fill_params_devt(ub); 2584 if (copy_to_user(argp, &ub->params, ph.len)) 2585 ret = -EFAULT; 2586 else 2587 ret = 0; 2588 mutex_unlock(&ub->mutex); 2589 2590 return ret; 2591 } 2592 2593 static int ublk_ctrl_set_params(struct ublk_device *ub, 2594 struct io_uring_cmd *cmd) 2595 { 2596 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2597 void __user *argp = (void __user *)(unsigned long)header->addr; 2598 struct ublk_params_header ph; 2599 int ret = -EFAULT; 2600 2601 if (header->len <= sizeof(ph) || !header->addr) 2602 return -EINVAL; 2603 2604 if (copy_from_user(&ph, argp, sizeof(ph))) 2605 return -EFAULT; 2606 2607 if (ph.len > header->len || !ph.len || !ph.types) 2608 return -EINVAL; 2609 2610 if (ph.len > 
sizeof(struct ublk_params)) 2611 ph.len = sizeof(struct ublk_params); 2612 2613 /* parameters can only be changed when device isn't live */ 2614 mutex_lock(&ub->mutex); 2615 if (ub->dev_info.state == UBLK_S_DEV_LIVE) { 2616 ret = -EACCES; 2617 } else if (copy_from_user(&ub->params, argp, ph.len)) { 2618 ret = -EFAULT; 2619 } else { 2620 /* clear all we don't support yet */ 2621 ub->params.types &= UBLK_PARAM_TYPE_ALL; 2622 ret = ublk_validate_params(ub); 2623 if (ret) 2624 ub->params.types = 0; 2625 } 2626 mutex_unlock(&ub->mutex); 2627 2628 return ret; 2629 } 2630 2631 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) 2632 { 2633 int i; 2634 2635 WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); 2636 2637 /* All old ioucmds have to be completed */ 2638 ubq->nr_io_ready = 0; 2639 /* old daemon is PF_EXITING, put it now */ 2640 put_task_struct(ubq->ubq_daemon); 2641 /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ 2642 ubq->ubq_daemon = NULL; 2643 ubq->timeout = false; 2644 ubq->canceling = false; 2645 2646 for (i = 0; i < ubq->q_depth; i++) { 2647 struct ublk_io *io = &ubq->ios[i]; 2648 2649 /* forget everything now and be ready for new FETCH_REQ */ 2650 io->flags = 0; 2651 io->cmd = NULL; 2652 io->addr = 0; 2653 } 2654 } 2655 2656 static int ublk_ctrl_start_recovery(struct ublk_device *ub, 2657 struct io_uring_cmd *cmd) 2658 { 2659 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2660 int ret = -EINVAL; 2661 int i; 2662 2663 mutex_lock(&ub->mutex); 2664 if (!ublk_can_use_recovery(ub)) 2665 goto out_unlock; 2666 if (!ub->nr_queues_ready) 2667 goto out_unlock; 2668 /* 2669 * START_RECOVERY is only allowed after: 2670 * 2671 * (1) UB_STATE_OPEN is not set, which means the dying process has exited 2672 * and related io_uring ctx is freed so file struct of /dev/ublkcX is 2673 * released.
2674 * 2675 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: 2676 * (a) has quiesced the request queue 2677 * (b) has requeued every inflight rq whose io_flags is ACTIVE 2678 * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE 2679 * (d) has completed/canceled all ioucmds owned by the dying process 2680 */ 2681 if (test_bit(UB_STATE_OPEN, &ub->state) || 2682 ub->dev_info.state != UBLK_S_DEV_QUIESCED) { 2683 ret = -EBUSY; 2684 goto out_unlock; 2685 } 2686 pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); 2687 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2688 ublk_queue_reinit(ub, ublk_get_queue(ub, i)); 2689 /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ 2690 ub->mm = NULL; 2691 ub->nr_queues_ready = 0; 2692 ub->nr_privileged_daemon = 0; 2693 init_completion(&ub->completion); 2694 ret = 0; 2695 out_unlock: 2696 mutex_unlock(&ub->mutex); 2697 return ret; 2698 } 2699 2700 static int ublk_ctrl_end_recovery(struct ublk_device *ub, 2701 struct io_uring_cmd *cmd) 2702 { 2703 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2704 int ublksrv_pid = (int)header->data[0]; 2705 int ret = -EINVAL; 2706 2707 pr_devel("%s: Waiting for new ubq_daemons(nr: %d) to be ready, dev id %d...\n", 2708 __func__, ub->dev_info.nr_hw_queues, header->dev_id); 2709 /* wait until the new ubq_daemon has sent all FETCH_REQs */ 2710 if (wait_for_completion_interruptible(&ub->completion)) 2711 return -EINTR; 2712 2713 pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", 2714 __func__, ub->dev_info.nr_hw_queues, header->dev_id); 2715 2716 mutex_lock(&ub->mutex); 2717 if (!ublk_can_use_recovery(ub)) 2718 goto out_unlock; 2719 2720 if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { 2721 ret = -EBUSY; 2722 goto out_unlock; 2723 } 2724 ub->dev_info.ublksrv_pid = ublksrv_pid; 2725 pr_devel("%s: new ublksrv_pid %d, dev id %d\n", 2726 __func__, ublksrv_pid, header->dev_id); 2727 blk_mq_unquiesce_queue(ub->ub_disk->queue); 2728 pr_devel("%s: queue unquiesced, dev id %d.\n", 2729 __func__, header->dev_id); 2730 blk_mq_kick_requeue_list(ub->ub_disk->queue); 2731 ub->dev_info.state = UBLK_S_DEV_LIVE; 2732 ret = 0; 2733 out_unlock: 2734 mutex_unlock(&ub->mutex); 2735 return ret; 2736 } 2737 2738 static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) 2739 { 2740 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2741 void __user *argp = (void __user *)(unsigned long)header->addr; 2742 u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; 2743 2744 if (header->len != UBLK_FEATURES_LEN || !header->addr) 2745 return -EINVAL; 2746 2747 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) 2748 return -EFAULT; 2749 2750 return 0; 2751 } 2752 2753 /* 2754 * All control commands are sent via /dev/ublk-control, so we have to check 2755 * the destination device's permission 2756 */ 2757 static int ublk_char_dev_permission(struct ublk_device *ub, 2758 const char *dev_path, int mask) 2759 { 2760 int err; 2761 struct path path; 2762 struct kstat stat; 2763 2764 err = kern_path(dev_path, LOOKUP_FOLLOW, &path); 2765 if (err) 2766 return err; 2767 2768 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); 2769 if (err) 2770 goto exit; 2771 2772 err = -EPERM; 2773 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) 2774 goto exit; 2775 2776 err = inode_permission(&nop_mnt_idmap, 2777 d_backing_inode(path.dentry), mask); 2778 exit: 2779 path_put(&path); 2780 return err; 2781 } 2782 2783 static int
ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, 2784 struct io_uring_cmd *cmd) 2785 { 2786 struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe); 2787 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; 2788 void __user *argp = (void __user *)(unsigned long)header->addr; 2789 char *dev_path = NULL; 2790 int ret = 0; 2791 int mask; 2792 2793 if (!unprivileged) { 2794 if (!capable(CAP_SYS_ADMIN)) 2795 return -EPERM; 2796 /* 2797 * The newly added command UBLK_CMD_GET_DEV_INFO2 includes 2798 * char_dev_path in its payload too, since userspace may not 2799 * know whether the specified device was created in 2800 * unprivileged mode. 2801 */ 2802 if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) 2803 return 0; 2804 } 2805 2806 /* 2807 * The user has to provide the char device path for unprivileged ublk devices 2808 * 2809 * header->addr always points to the dev path buffer, and 2810 * header->dev_path_len records the length of the dev path buffer. 2811 */ 2812 if (!header->dev_path_len || header->dev_path_len > PATH_MAX) 2813 return -EINVAL; 2814 2815 if (header->len < header->dev_path_len) 2816 return -EINVAL; 2817 2818 dev_path = memdup_user_nul(argp, header->dev_path_len); 2819 if (IS_ERR(dev_path)) 2820 return PTR_ERR(dev_path); 2821 2822 ret = -EINVAL; 2823 switch (_IOC_NR(cmd->cmd_op)) { 2824 case UBLK_CMD_GET_DEV_INFO: 2825 case UBLK_CMD_GET_DEV_INFO2: 2826 case UBLK_CMD_GET_QUEUE_AFFINITY: 2827 case UBLK_CMD_GET_PARAMS: 2828 case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): 2829 mask = MAY_READ; 2830 break; 2831 case UBLK_CMD_START_DEV: 2832 case UBLK_CMD_STOP_DEV: 2833 case UBLK_CMD_ADD_DEV: 2834 case UBLK_CMD_DEL_DEV: 2835 case UBLK_CMD_SET_PARAMS: 2836 case UBLK_CMD_START_USER_RECOVERY: 2837 case UBLK_CMD_END_USER_RECOVERY: 2838 mask = MAY_READ | MAY_WRITE; 2839 break; 2840 default: 2841 goto exit; 2842 } 2843 2844 ret = ublk_char_dev_permission(ub, dev_path, mask); 2845 if (!ret) { 2846 header->len -= header->dev_path_len; 2847 header->addr += header->dev_path_len; 2848 } 2849 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", 2850 __func__, ub->ub_number, cmd->cmd_op, 2851 ub->dev_info.owner_uid, ub->dev_info.owner_gid, 2852 dev_path, ret); 2853 exit: 2854 kfree(dev_path); 2855 return ret; 2856 } 2857 2858 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, 2859 unsigned int issue_flags) 2860 { 2861 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2862 struct ublk_device *ub = NULL; 2863 u32 cmd_op = cmd->cmd_op; 2864 int ret = -EINVAL; 2865 2866 if (issue_flags & IO_URING_F_NONBLOCK) 2867 return -EAGAIN; 2868 2869 ublk_ctrl_cmd_dump(cmd); 2870 2871 if (!(issue_flags & IO_URING_F_SQE128)) 2872 goto out; 2873 2874 ret = ublk_check_cmd_op(cmd_op); 2875 if (ret) 2876 goto out; 2877 2878 if (cmd_op == UBLK_U_CMD_GET_FEATURES) { 2879 ret = ublk_ctrl_get_features(cmd); 2880 goto out; 2881 } 2882 2883 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { 2884 ret = -ENODEV; 2885 ub = ublk_get_device_from_id(header->dev_id); 2886 if (!ub) 2887 goto out; 2888 2889 ret = ublk_ctrl_uring_cmd_permission(ub, cmd); 2890 if (ret) 2891 goto put_dev; 2892 } 2893 2894 switch (_IOC_NR(cmd_op)) { 2895 case UBLK_CMD_START_DEV: 2896 ret = ublk_ctrl_start_dev(ub, cmd); 2897 break; 2898 case UBLK_CMD_STOP_DEV: 2899 ret = ublk_ctrl_stop_dev(ub); 2900 break; 2901 case UBLK_CMD_GET_DEV_INFO: 2902 case UBLK_CMD_GET_DEV_INFO2: 2903 ret = ublk_ctrl_get_dev_info(ub, cmd); 2904 break; 2905 case UBLK_CMD_ADD_DEV: 2906 ret = ublk_ctrl_add_dev(cmd); 2907 break; 2908
case UBLK_CMD_DEL_DEV: 2909 ret = ublk_ctrl_del_dev(&ub, true); 2910 break; 2911 case UBLK_CMD_DEL_DEV_ASYNC: 2912 ret = ublk_ctrl_del_dev(&ub, false); 2913 break; 2914 case UBLK_CMD_GET_QUEUE_AFFINITY: 2915 ret = ublk_ctrl_get_queue_affinity(ub, cmd); 2916 break; 2917 case UBLK_CMD_GET_PARAMS: 2918 ret = ublk_ctrl_get_params(ub, cmd); 2919 break; 2920 case UBLK_CMD_SET_PARAMS: 2921 ret = ublk_ctrl_set_params(ub, cmd); 2922 break; 2923 case UBLK_CMD_START_USER_RECOVERY: 2924 ret = ublk_ctrl_start_recovery(ub, cmd); 2925 break; 2926 case UBLK_CMD_END_USER_RECOVERY: 2927 ret = ublk_ctrl_end_recovery(ub, cmd); 2928 break; 2929 default: 2930 ret = -ENOTSUPP; 2931 break; 2932 } 2933 2934 put_dev: 2935 if (ub) 2936 ublk_put_device(ub); 2937 out: 2938 io_uring_cmd_done(cmd, ret, 0, issue_flags); 2939 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", 2940 __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); 2941 return -EIOCBQUEUED; 2942 } 2943 2944 static const struct file_operations ublk_ctl_fops = { 2945 .open = nonseekable_open, 2946 .uring_cmd = ublk_ctrl_uring_cmd, 2947 .owner = THIS_MODULE, 2948 .llseek = noop_llseek, 2949 }; 2950 2951 static struct miscdevice ublk_misc = { 2952 .minor = MISC_DYNAMIC_MINOR, 2953 .name = "ublk-control", 2954 .fops = &ublk_ctl_fops, 2955 }; 2956 2957 static int __init ublk_init(void) 2958 { 2959 int ret; 2960 2961 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + 2962 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); 2963 2964 init_waitqueue_head(&ublk_idr_wq); 2965 2966 ret = misc_register(&ublk_misc); 2967 if (ret) 2968 return ret; 2969 2970 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char"); 2971 if (ret) 2972 goto unregister_mis; 2973 2974 ret = class_register(&ublk_chr_class); 2975 if (ret) 2976 goto free_chrdev_region; 2977 2978 return 0; 2979 2980 free_chrdev_region: 2981 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 2982 unregister_mis: 2983 misc_deregister(&ublk_misc); 2984 return ret; 2985 } 2986 2987 static void __exit ublk_exit(void) 2988 { 2989 struct ublk_device *ub; 2990 int id; 2991 2992 idr_for_each_entry(&ublk_index_idr, ub, id) 2993 ublk_remove(ub); 2994 2995 class_unregister(&ublk_chr_class); 2996 misc_deregister(&ublk_misc); 2997 2998 idr_destroy(&ublk_index_idr); 2999 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 3000 } 3001 3002 module_init(ublk_init); 3003 module_exit(ublk_exit); 3004 3005 static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp) 3006 { 3007 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); 3008 } 3009 3010 static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp) 3011 { 3012 return sysfs_emit(buf, "%u\n", ublks_max); 3013 } 3014 3015 static const struct kernel_param_ops ublk_max_ublks_ops = { 3016 .set = ublk_set_max_ublks, 3017 .get = ublk_get_max_ublks, 3018 }; 3019 3020 module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644); 3021 MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); 3022 3023 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); 3024 MODULE_DESCRIPTION("Userspace block device"); 3025 MODULE_LICENSE("GPL"); 3026
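/*
 * Editor's illustration, not part of the driver build (hence the #if 0
 * guard): a minimal userspace sketch of exercising the control path
 * implemented by ublk_ctrl_uring_cmd() above. It opens /dev/ublk-control
 * and issues UBLK_U_CMD_GET_FEATURES through an IORING_OP_URING_CMD SQE,
 * matching the checks in ublk_ctrl_get_features() (SQE128 ring,
 * header->len == UBLK_FEATURES_LEN, header->addr pointing at a u64
 * buffer). It assumes liburing and the uapi header <linux/ublk_cmd.h> are
 * available; the function name is arbitrary and error handling is trimmed.
 */
#if 0
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>

static int ublk_query_features(void)
{
	struct io_uring_params p = { .flags = IORING_SETUP_SQE128 };
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct ublksrv_ctrl_cmd *cmd;
	uint64_t features = 0;
	int fd, ret;

	fd = open("/dev/ublk-control", O_RDWR);
	if (fd < 0)
		return -1;

	/* control commands require 128-byte SQEs for the inline payload */
	ret = io_uring_queue_init_params(4, &ring, &p);
	if (ret) {
		close(fd);
		return ret;
	}

	sqe = io_uring_get_sqe(&ring);
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = UBLK_U_CMD_GET_FEATURES;

	/* payload validated by ublk_ctrl_get_features() in the driver */
	cmd = (struct ublksrv_ctrl_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(*cmd));
	cmd->addr = (uint64_t)(uintptr_t)&features;
	cmd->len = UBLK_FEATURES_LEN;

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	ret = cqe->res;
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	close(fd);

	if (!ret)
		printf("ublk features: 0x%llx\n", (unsigned long long)features);
	return ret;
}
#endif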