1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Userspace block device - a block device whose IO is handled from userspace
4 *
5 * Make full use of io_uring passthrough commands for communicating with
6 * the ublk userspace daemon (ublksrvd) for handling basic IO requests.
7 *
8 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9 *
10 * (part of code stolen from loop.c)
11 */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <uapi/linux/ublk_cmd.h>
48
49 #define UBLK_MINORS (1U << MINORBITS)
50
51 #define UBLK_INVALID_BUF_IDX ((u16)-1)
52
53 /* private ioctl command mirror */
54 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
55 #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
56 #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
57
58 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
59 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
60
61 /* All UBLK_F_* have to be included into UBLK_F_ALL */
62 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
63 | UBLK_F_URING_CMD_COMP_IN_TASK \
64 | UBLK_F_NEED_GET_DATA \
65 | UBLK_F_USER_RECOVERY \
66 | UBLK_F_USER_RECOVERY_REISSUE \
67 | UBLK_F_UNPRIVILEGED_DEV \
68 | UBLK_F_CMD_IOCTL_ENCODE \
69 | UBLK_F_USER_COPY \
70 | UBLK_F_ZONED \
71 | UBLK_F_USER_RECOVERY_FAIL_IO \
72 | UBLK_F_UPDATE_SIZE \
73 | UBLK_F_AUTO_BUF_REG \
74 | UBLK_F_QUIESCE \
75 | UBLK_F_PER_IO_DAEMON \
76 | UBLK_F_BUF_REG_OFF_DAEMON)
77
78 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
79 | UBLK_F_USER_RECOVERY_REISSUE \
80 | UBLK_F_USER_RECOVERY_FAIL_IO)
81
82 /* All UBLK_PARAM_TYPE_* should be included here */
83 #define UBLK_PARAM_TYPE_ALL \
84 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
85 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
86 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
87
88 struct ublk_uring_cmd_pdu {
89 /*
90 * Temporarily store requests belonging to the same batch for queuing
91 * them to the daemon context.
92 *
93 * They could have been stored in the request payload, but we want to
94 * avoid the extra pre-allocation, and the uring_cmd payload is always
95 * free for us to use
96 */
97 union {
98 struct request *req;
99 struct request *req_list;
100 };
101
102 /*
103 * The following two fields are valid for this cmd's whole lifetime, and
104 * are set up in the ublk uring_cmd handler
105 */
106 struct ublk_queue *ubq;
107
108 u16 tag;
109 };
110
111 /*
112 * io command is active: the sqe cmd has been received, and its cqe isn't done
113 *
114 * If the flag is set, the io command is owned by the ublk driver and is
115 * waiting for an incoming blk-mq request from the ublk block device.
116 *
117 * If the flag is cleared, the io command will be completed and owned by
118 * the ublk server.
119 */
120 #define UBLK_IO_FLAG_ACTIVE 0x01
121
122 /*
123 * IO command is completed via cqe, is being handled by ublksrv, and
124 * has not been committed yet
125 *
126 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
127 * for cross verification
128 */
129 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
130
131 /*
132 * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires
133 * getting the data buffer address from ublksrv.
134 *
135 * Then, bio data could be copied into this data buffer for a WRITE request
136 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
137 */
138 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
139
140 /*
141 * The request buffer is registered automatically, so we have to unregister
142 * it before completing this request.
143 *
144 * io_uring will unregister the buffer automatically for us on exit.
145 */
146 #define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
147
148 /* atomic RW with ubq->cancel_lock */
149 #define UBLK_IO_FLAG_CANCELED 0x80000000
150
151 /*
152 * Initialize refcount to a large number to include any registered buffers.
153 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
154 * any buffers registered on the io daemon task.
155 */
156 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
157
158 union ublk_io_buf {
159 __u64 addr;
160 struct ublk_auto_buf_reg auto_reg;
161 };
162
163 struct ublk_io {
164 union ublk_io_buf buf;
165 unsigned int flags;
166 int res;
167
168 union {
169 /* valid if UBLK_IO_FLAG_ACTIVE is set */
170 struct io_uring_cmd *cmd;
171 /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
172 struct request *req;
173 };
174
175 struct task_struct *task;
176
177 /*
178 * The number of uses of this I/O by the ublk server
179 * if user copy or zero copy are enabled:
180 * - UBLK_REFCOUNT_INIT from dispatch to the server
181 * until UBLK_IO_COMMIT_AND_FETCH_REQ
182 * - 1 for each inflight ublk_ch_{read,write}_iter() call
183 * - 1 for each io_uring registered buffer not registered on task
184 * The I/O can only be completed once all references are dropped.
185 * User copy and buffer registration operations are only permitted
186 * if the reference count is nonzero.
187 */
188 refcount_t ref;
189 /* Count of buffers registered on task and not yet unregistered */
190 unsigned task_registered_buffers;
191
192 void *buf_ctx_handle;
193 } ____cacheline_aligned_in_smp;
194
195 struct ublk_queue {
196 int q_id;
197 int q_depth;
198
199 unsigned long flags;
200 struct ublksrv_io_desc *io_cmd_buf;
201
202 bool force_abort;
203 bool canceling;
204 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
205 spinlock_t cancel_lock;
206 struct ublk_device *dev;
207 struct ublk_io ios[] __counted_by(q_depth);
208 };
209
210 struct ublk_device {
211 struct gendisk *ub_disk;
212
213 struct ublksrv_ctrl_dev_info dev_info;
214
215 struct blk_mq_tag_set tag_set;
216
217 struct cdev cdev;
218 struct device cdev_dev;
219
220 #define UB_STATE_OPEN 0
221 #define UB_STATE_USED 1
222 #define UB_STATE_DELETED 2
223 unsigned long state;
224 int ub_number;
225
226 struct mutex mutex;
227
228 spinlock_t lock;
229 struct mm_struct *mm;
230
231 struct ublk_params params;
232
233 struct completion completion;
234 u32 nr_io_ready;
235 bool unprivileged_daemons;
236 struct mutex cancel_mutex;
237 bool canceling;
238 pid_t ublksrv_tgid;
239 struct delayed_work exit_work;
240 struct work_struct partition_scan_work;
241
242 struct ublk_queue *queues[];
243 };
244
245 /* header of ublk_params */
246 struct ublk_params_header {
247 __u32 len;
248 __u32 types;
249 };
250
251 static void ublk_io_release(void *priv);
252 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
253 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
254 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
255 u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
256 static inline unsigned int ublk_req_build_flags(struct request *req);
257
258 static void ublk_partition_scan_work(struct work_struct *work)
259 {
260 struct ublk_device *ub =
261 container_of(work, struct ublk_device, partition_scan_work);
262
263 if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
264 &ub->ub_disk->state)))
265 return;
266
267 mutex_lock(&ub->ub_disk->open_mutex);
268 bdev_disk_changed(ub->ub_disk, false);
269 mutex_unlock(&ub->ub_disk->open_mutex);
270 }
271
272 static inline struct ublksrv_io_desc *
273 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
274 {
275 return &ubq->io_cmd_buf[tag];
276 }
277
278 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
279 {
280 return ub->dev_info.flags & UBLK_F_ZONED;
281 }
282
283 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
284 {
285 return ubq->flags & UBLK_F_ZONED;
286 }
287
288 #ifdef CONFIG_BLK_DEV_ZONED
289
290 struct ublk_zoned_report_desc {
291 __u64 sector;
292 __u32 operation;
293 __u32 nr_zones;
294 };
295
296 static DEFINE_XARRAY(ublk_zoned_report_descs);
297
298 static int ublk_zoned_insert_report_desc(const struct request *req,
299 struct ublk_zoned_report_desc *desc)
300 {
301 return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
302 desc, GFP_KERNEL);
303 }
304
305 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
306 const struct request *req)
307 {
308 return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
309 }
310
311 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
312 const struct request *req)
313 {
314 return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
315 }
316
317 static int ublk_get_nr_zones(const struct ublk_device *ub)
318 {
319 const struct ublk_param_basic *p = &ub->params.basic;
320
321 /* Zone size is a power of 2 */
322 return p->dev_sectors >> ilog2(p->chunk_sectors);
323 }
324
325 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
326 {
327 return blk_revalidate_disk_zones(ub->ub_disk);
328 }
329
330 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
331 {
332 const struct ublk_param_zoned *p = &ub->params.zoned;
333 int nr_zones;
334
335 if (!ublk_dev_is_zoned(ub))
336 return -EINVAL;
337
338 if (!p->max_zone_append_sectors)
339 return -EINVAL;
340
341 nr_zones = ublk_get_nr_zones(ub);
342
343 if (p->max_active_zones > nr_zones)
344 return -EINVAL;
345
346 if (p->max_open_zones > nr_zones)
347 return -EINVAL;
348
349 return 0;
350 }
351
352 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
353 {
354 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
355 }
356
357 /* Based on virtblk_alloc_report_buffer */
358 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
359 unsigned int nr_zones, size_t *buflen)
360 {
361 struct request_queue *q = ublk->ub_disk->queue;
362 size_t bufsize;
363 void *buf;
364
365 nr_zones = min_t(unsigned int, nr_zones,
366 ublk->ub_disk->nr_zones);
367
368 bufsize = nr_zones * sizeof(struct blk_zone);
369 bufsize =
370 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
371
372 while (bufsize >= sizeof(struct blk_zone)) {
373 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
374 if (buf) {
375 *buflen = bufsize;
376 return buf;
377 }
378 bufsize >>= 1;
379 }
380
381 *buflen = 0;
382 return NULL;
383 }
384
385 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
386 unsigned int nr_zones, struct blk_report_zones_args *args)
387 {
388 struct ublk_device *ub = disk->private_data;
389 unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
390 unsigned int first_zone = sector >> ilog2(zone_size_sectors);
391 unsigned int done_zones = 0;
392 unsigned int max_zones_per_request;
393 int ret;
394 struct blk_zone *buffer;
395 size_t buffer_length;
396
397 nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
398 nr_zones);
399
400 buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
401 if (!buffer)
402 return -ENOMEM;
403
404 max_zones_per_request = buffer_length / sizeof(struct blk_zone);
405
406 while (done_zones < nr_zones) {
407 unsigned int remaining_zones = nr_zones - done_zones;
408 unsigned int zones_in_request =
409 min_t(unsigned int, remaining_zones, max_zones_per_request);
410 struct request *req;
411 struct ublk_zoned_report_desc desc;
412 blk_status_t status;
413
414 memset(buffer, 0, buffer_length);
415
416 req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
417 if (IS_ERR(req)) {
418 ret = PTR_ERR(req);
419 goto out;
420 }
421
422 desc.operation = UBLK_IO_OP_REPORT_ZONES;
423 desc.sector = sector;
424 desc.nr_zones = zones_in_request;
425 ret = ublk_zoned_insert_report_desc(req, &desc);
426 if (ret)
427 goto free_req;
428
429 ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
430 if (ret)
431 goto erase_desc;
432
433 status = blk_execute_rq(req, 0);
434 ret = blk_status_to_errno(status);
435 erase_desc:
436 ublk_zoned_erase_report_desc(req);
437 free_req:
438 blk_mq_free_request(req);
439 if (ret)
440 goto out;
441
442 for (unsigned int i = 0; i < zones_in_request; i++) {
443 struct blk_zone *zone = buffer + i;
444
445 /* A zero length zone means no more zones in this response */
446 if (!zone->len)
447 break;
448
449 ret = disk_report_zone(disk, zone, i, args);
450 if (ret)
451 goto out;
452
453 done_zones++;
454 sector += zone_size_sectors;
455
456 }
457 }
458
459 ret = done_zones;
460
461 out:
462 kvfree(buffer);
463 return ret;
464 }
465
466 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
467 struct request *req)
468 {
469 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
470 struct ublk_io *io = &ubq->ios[req->tag];
471 struct ublk_zoned_report_desc *desc;
472 u32 ublk_op;
473
474 switch (req_op(req)) {
475 case REQ_OP_ZONE_OPEN:
476 ublk_op = UBLK_IO_OP_ZONE_OPEN;
477 break;
478 case REQ_OP_ZONE_CLOSE:
479 ublk_op = UBLK_IO_OP_ZONE_CLOSE;
480 break;
481 case REQ_OP_ZONE_FINISH:
482 ublk_op = UBLK_IO_OP_ZONE_FINISH;
483 break;
484 case REQ_OP_ZONE_RESET:
485 ublk_op = UBLK_IO_OP_ZONE_RESET;
486 break;
487 case REQ_OP_ZONE_APPEND:
488 ublk_op = UBLK_IO_OP_ZONE_APPEND;
489 break;
490 case REQ_OP_ZONE_RESET_ALL:
491 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
492 break;
493 case REQ_OP_DRV_IN:
494 desc = ublk_zoned_get_report_desc(req);
495 if (!desc)
496 return BLK_STS_IOERR;
497 ublk_op = desc->operation;
498 switch (ublk_op) {
499 case UBLK_IO_OP_REPORT_ZONES:
500 iod->op_flags = ublk_op | ublk_req_build_flags(req);
501 iod->nr_zones = desc->nr_zones;
502 iod->start_sector = desc->sector;
503 return BLK_STS_OK;
504 default:
505 return BLK_STS_IOERR;
506 }
507 case REQ_OP_DRV_OUT:
508 /* We do not support drv_out */
509 return BLK_STS_NOTSUPP;
510 default:
511 return BLK_STS_IOERR;
512 }
513
514 iod->op_flags = ublk_op | ublk_req_build_flags(req);
515 iod->nr_sectors = blk_rq_sectors(req);
516 iod->start_sector = blk_rq_pos(req);
517 iod->addr = io->buf.addr;
518
519 return BLK_STS_OK;
520 }
521
522 #else
523
524 #define ublk_report_zones (NULL)
525
526 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
527 {
528 return -EOPNOTSUPP;
529 }
530
531 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
532 {
533 }
534
535 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
536 {
537 return 0;
538 }
539
540 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
541 struct request *req)
542 {
543 return BLK_STS_NOTSUPP;
544 }
545
546 #endif
547
548 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
549 bool need_map);
550
551 static dev_t ublk_chr_devt;
552 static const struct class ublk_chr_class = {
553 .name = "ublk-char",
554 };
555
556 static DEFINE_IDR(ublk_index_idr);
557 static DEFINE_SPINLOCK(ublk_idr_lock);
558 static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
559
560 static DEFINE_MUTEX(ublk_ctl_mutex);
561
562
563 #define UBLK_MAX_UBLKS UBLK_MINORS
564
565 /*
566 * Max number of unprivileged ublk devices allowed to be added
567 *
568 * It can be extended to a per-user limit in the future, or even controlled
569 * by cgroup.
570 */
571 static unsigned int unprivileged_ublks_max = 64;
572 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
573
574 static struct miscdevice ublk_misc;
575
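/*
 * The char device read/write offset (relative to UBLKSRV_IO_BUF_OFFSET)
 * encodes the hardware queue id, the request tag and the offset within the
 * io buffer; the helpers below decode those fields from 'pos'.
 */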
576 static inline unsigned ublk_pos_to_hwq(loff_t pos)
577 {
578 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
579 UBLK_QID_BITS_MASK;
580 }
581
582 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
583 {
584 return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
585 }
586
587 static inline unsigned ublk_pos_to_tag(loff_t pos)
588 {
589 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
590 UBLK_TAG_BITS_MASK;
591 }
592
593 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
594 {
595 const struct ublk_param_basic *p = &ub->params.basic;
596
597 if (p->attrs & UBLK_ATTR_READ_ONLY)
598 set_disk_ro(ub->ub_disk, true);
599
600 set_capacity(ub->ub_disk, p->dev_sectors);
601 }
602
603 static int ublk_validate_params(const struct ublk_device *ub)
604 {
605 /* basic param is the only one which must be set */
606 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
607 const struct ublk_param_basic *p = &ub->params.basic;
608
609 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
610 return -EINVAL;
611
612 if (p->logical_bs_shift > p->physical_bs_shift)
613 return -EINVAL;
614
615 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
616 return -EINVAL;
617
618 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
619 return -EINVAL;
620 } else
621 return -EINVAL;
622
623 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
624 const struct ublk_param_discard *p = &ub->params.discard;
625
626 /* So far, only support single segment discard */
627 if (p->max_discard_sectors && p->max_discard_segments != 1)
628 return -EINVAL;
629
630 if (!p->discard_granularity)
631 return -EINVAL;
632 }
633
634 /* dev_t is read-only */
635 if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
636 return -EINVAL;
637
638 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
639 return ublk_dev_param_zoned_validate(ub);
640 else if (ublk_dev_is_zoned(ub))
641 return -EINVAL;
642
643 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
644 const struct ublk_param_dma_align *p = &ub->params.dma;
645
646 if (p->alignment >= PAGE_SIZE)
647 return -EINVAL;
648
649 if (!is_power_of_2(p->alignment + 1))
650 return -EINVAL;
651 }
652
653 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
654 const struct ublk_param_segment *p = &ub->params.seg;
655
656 if (!is_power_of_2(p->seg_boundary_mask + 1))
657 return -EINVAL;
658
659 if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
660 return -EINVAL;
661 if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
662 return -EINVAL;
663 }
664
665 return 0;
666 }
667
668 static void ublk_apply_params(struct ublk_device *ub)
669 {
670 ublk_dev_param_basic_apply(ub);
671
672 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
673 ublk_dev_param_zoned_apply(ub);
674 }
675
676 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
677 {
678 return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
679 }
680
681 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
682 {
683 return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
684 }
685
686 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
687 {
688 return ubq->flags & UBLK_F_AUTO_BUF_REG;
689 }
690
691 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
692 {
693 return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
694 }
695
696 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
697 {
698 return ubq->flags & UBLK_F_USER_COPY;
699 }
700
701 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
702 {
703 return ub->dev_info.flags & UBLK_F_USER_COPY;
704 }
705
706 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
707 {
708 return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
709 !ublk_support_auto_buf_reg(ubq);
710 }
711
712 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
713 {
714 return !ublk_dev_support_user_copy(ub) &&
715 !ublk_dev_support_zero_copy(ub) &&
716 !ublk_dev_support_auto_buf_reg(ub);
717 }
718
719 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
720 {
721 /*
722 * read()/write() is involved in user copy, so a request reference
723 * has to be grabbed
724 *
725 * For zero copy, the request buffer needs to be registered in the io_uring
726 * buffer table, so a reference is needed
727 *
728 * For auto buffer register, the ublk server may still issue
729 * UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
730 * so a reference is required too.
731 */
732 return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
733 ublk_support_auto_buf_reg(ubq);
734 }
735
736 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
737 {
738 return ublk_dev_support_user_copy(ub) ||
739 ublk_dev_support_zero_copy(ub) ||
740 ublk_dev_support_auto_buf_reg(ub);
741 }
742
743 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
744 struct ublk_io *io)
745 {
746 if (ublk_need_req_ref(ubq))
747 refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
748 }
749
750 static inline bool ublk_get_req_ref(struct ublk_io *io)
751 {
752 return refcount_inc_not_zero(&io->ref);
753 }
754
755 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
756 {
757 if (!refcount_dec_and_test(&io->ref))
758 return;
759
760 /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
761 __ublk_complete_rq(req, io, false);
762 }
763
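/*
 * Drop the initial UBLK_REFCOUNT_INIT references, except those still held
 * for buffers registered on the daemon task; returns true if this was the
 * last reference.
 */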
764 static inline bool ublk_sub_req_ref(struct ublk_io *io)
765 {
766 unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
767
768 io->task_registered_buffers = 0;
769 return refcount_sub_and_test(sub_refs, &io->ref);
770 }
771
772 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
773 {
774 return ubq->flags & UBLK_F_NEED_GET_DATA;
775 }
776
777 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
778 {
779 return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
780 }
781
782 /* Called in slow path only, keep it noinline for tracing purposes */
783 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
784 {
785 if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
786 return ub;
787 return NULL;
788 }
789
790 /* Called in slow path only, keep it noinline for tracing purposes */
791 static noinline void ublk_put_device(struct ublk_device *ub)
792 {
793 put_device(&ub->cdev_dev);
794 }
795
796 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
797 int qid)
798 {
799 return dev->queues[qid];
800 }
801
802 static inline bool ublk_rq_has_data(const struct request *rq)
803 {
804 return bio_has_data(rq->bio);
805 }
806
807 static inline struct ublksrv_io_desc *
808 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
809 {
810 return ublk_get_queue(ub, q_id)->io_cmd_buf;
811 }
812
813 static inline int __ublk_queue_cmd_buf_size(int depth)
814 {
815 return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
816 }
817
818 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
819 {
820 return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
821 }
822
823 static int ublk_max_cmd_buf_size(void)
824 {
825 return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
826 }
827
828 /*
829 * Should I/O outstanding to the ublk server be reissued when it exits?
830 * If not, outstanding I/O will get errors.
831 */
832 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
833 {
834 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
835 (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
836 }
837
838 /*
839 * Should I/O issued while there is no ublk server be queued? If not, I/O
840 * issued while there is no ublk server will get errors.
841 */
842 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
843 {
844 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
845 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
846 }
847
848 /*
849 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
850 * of the device flags for smaller cache footprint - better for fast
851 * paths.
852 */
853 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
854 {
855 return (ubq->flags & UBLK_F_USER_RECOVERY) &&
856 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
857 }
858
859 /*
860 * Should ublk devices be stopped (i.e. no recovery possible) when the
861 * ublk server exits? If not, devices can be used again by a future
862 * incarnation of a ublk server via the start_recovery/end_recovery
863 * commands.
864 */
865 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
866 {
867 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
868 }
869
870 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
871 {
872 return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
873 ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
874 }
875
876 static void ublk_free_disk(struct gendisk *disk)
877 {
878 struct ublk_device *ub = disk->private_data;
879
880 clear_bit(UB_STATE_USED, &ub->state);
881 ublk_put_device(ub);
882 }
883
884 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
885 unsigned int *owner_gid)
886 {
887 kuid_t uid;
888 kgid_t gid;
889
890 current_uid_gid(&uid, &gid);
891
892 *owner_uid = from_kuid(&init_user_ns, uid);
893 *owner_gid = from_kgid(&init_user_ns, gid);
894 }
895
896 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
897 {
898 struct ublk_device *ub = disk->private_data;
899
900 if (capable(CAP_SYS_ADMIN))
901 return 0;
902
903 /*
904 * If it is an unprivileged device, only the owner can open
905 * the disk. Otherwise it could be a trap set by a malicious
906 * user who deliberately grants this disk's privileges to
907 * other users.
908 *
909 * This restriction is reasonable too, given that anyone can
910 * create an unprivileged device without needing anyone else's grant.
911 */
912 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
913 unsigned int curr_uid, curr_gid;
914
915 ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
916
917 if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
918 ub->dev_info.owner_gid)
919 return -EPERM;
920 }
921
922 return 0;
923 }
924
925 static const struct block_device_operations ub_fops = {
926 .owner = THIS_MODULE,
927 .open = ublk_open,
928 .free_disk = ublk_free_disk,
929 .report_zones = ublk_report_zones,
930 };
931
932 /*
933 * Copy data between request pages and the io_iter; 'offset'
934 * is the starting linear offset within the request data.
935 */
936 static size_t ublk_copy_user_pages(const struct request *req,
937 unsigned offset, struct iov_iter *uiter, int dir)
938 {
939 struct req_iterator iter;
940 struct bio_vec bv;
941 size_t done = 0;
942
943 rq_for_each_segment(bv, req, iter) {
944 unsigned len;
945 void *bv_buf;
946 size_t copied;
947
948 if (offset >= bv.bv_len) {
949 offset -= bv.bv_len;
950 continue;
951 }
952
953 len = bv.bv_len - offset;
954 bv_buf = kmap_local_page(bv.bv_page) + bv.bv_offset + offset;
955 if (dir == ITER_DEST)
956 copied = copy_to_iter(bv_buf, len, uiter);
957 else
958 copied = copy_from_iter(bv_buf, len, uiter);
959
960 kunmap_local(bv_buf);
961
962 done += copied;
963 if (copied < len)
964 break;
965
966 offset = 0;
967 }
968 return done;
969 }
970
971 static inline bool ublk_need_map_req(const struct request *req)
972 {
973 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
974 }
975
976 static inline bool ublk_need_unmap_req(const struct request *req)
977 {
978 return ublk_rq_has_data(req) &&
979 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
980 }
981
982 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
983 const struct request *req,
984 const struct ublk_io *io)
985 {
986 const unsigned int rq_bytes = blk_rq_bytes(req);
987
988 if (!ublk_need_map_io(ubq))
989 return rq_bytes;
990
991 /*
992 * No zero copy, so we delay copying WRITE request data into the ublksrv
993 * context; the big benefit is that pinning pages in the current
994 * context is pretty fast, see ublk_pin_user_pages
995 */
996 if (ublk_need_map_req(req)) {
997 struct iov_iter iter;
998 const int dir = ITER_DEST;
999
1000 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1001 return ublk_copy_user_pages(req, 0, &iter, dir);
1002 }
1003 return rq_bytes;
1004 }
1005
1006 static unsigned int ublk_unmap_io(bool need_map,
1007 const struct request *req,
1008 const struct ublk_io *io)
1009 {
1010 const unsigned int rq_bytes = blk_rq_bytes(req);
1011
1012 if (!need_map)
1013 return rq_bytes;
1014
1015 if (ublk_need_unmap_req(req)) {
1016 struct iov_iter iter;
1017 const int dir = ITER_SOURCE;
1018
1019 WARN_ON_ONCE(io->res > rq_bytes);
1020
1021 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1022 return ublk_copy_user_pages(req, 0, &iter, dir);
1023 }
1024 return rq_bytes;
1025 }
1026
1027 static inline unsigned int ublk_req_build_flags(struct request *req)
1028 {
1029 unsigned flags = 0;
1030
1031 if (req->cmd_flags & REQ_FAILFAST_DEV)
1032 flags |= UBLK_IO_F_FAILFAST_DEV;
1033
1034 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1035 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1036
1037 if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1038 flags |= UBLK_IO_F_FAILFAST_DRIVER;
1039
1040 if (req->cmd_flags & REQ_META)
1041 flags |= UBLK_IO_F_META;
1042
1043 if (req->cmd_flags & REQ_FUA)
1044 flags |= UBLK_IO_F_FUA;
1045
1046 if (req->cmd_flags & REQ_NOUNMAP)
1047 flags |= UBLK_IO_F_NOUNMAP;
1048
1049 if (req->cmd_flags & REQ_SWAP)
1050 flags |= UBLK_IO_F_SWAP;
1051
1052 return flags;
1053 }
1054
1055 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1056 {
1057 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1058 struct ublk_io *io = &ubq->ios[req->tag];
1059 u32 ublk_op;
1060
1061 switch (req_op(req)) {
1062 case REQ_OP_READ:
1063 ublk_op = UBLK_IO_OP_READ;
1064 break;
1065 case REQ_OP_WRITE:
1066 ublk_op = UBLK_IO_OP_WRITE;
1067 break;
1068 case REQ_OP_FLUSH:
1069 ublk_op = UBLK_IO_OP_FLUSH;
1070 break;
1071 case REQ_OP_DISCARD:
1072 ublk_op = UBLK_IO_OP_DISCARD;
1073 break;
1074 case REQ_OP_WRITE_ZEROES:
1075 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1076 break;
1077 default:
1078 if (ublk_queue_is_zoned(ubq))
1079 return ublk_setup_iod_zoned(ubq, req);
1080 return BLK_STS_IOERR;
1081 }
1082
1083 /* need to translate since kernel may change */
1084 iod->op_flags = ublk_op | ublk_req_build_flags(req);
1085 iod->nr_sectors = blk_rq_sectors(req);
1086 iod->start_sector = blk_rq_pos(req);
1087 iod->addr = io->buf.addr;
1088
1089 return BLK_STS_OK;
1090 }
1091
1092 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1093 struct io_uring_cmd *ioucmd)
1094 {
1095 return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1096 }
1097
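/*
 * End a request with softirqs disabled; see the comment about fput() and
 * blkdev_release() in __ublk_complete_rq() for why this matters.
 */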
1098 static void ublk_end_request(struct request *req, blk_status_t error)
1099 {
1100 local_bh_disable();
1101 blk_mq_end_request(req, error);
1102 local_bh_enable();
1103 }
1104
1105 /* todo: handle partial completion */
1106 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1107 bool need_map)
1108 {
1109 unsigned int unmapped_bytes;
1110 blk_status_t res = BLK_STS_OK;
1111 bool requeue;
1112
1113 /* fail the read IO if nothing was read */
1114 if (!io->res && req_op(req) == REQ_OP_READ)
1115 io->res = -EIO;
1116
1117 if (io->res < 0) {
1118 res = errno_to_blk_status(io->res);
1119 goto exit;
1120 }
1121
1122 /*
1123 * FLUSH, DISCARD or WRITE_ZEROES usually won't return a meaningful byte
1124 * count, so end them directly.
1125 *
1126 * None of them needs unmapping.
1127 */
1128 if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1129 req_op(req) != REQ_OP_DRV_IN)
1130 goto exit;
1131
1132 /* for READ request, writing data in iod->addr to rq buffers */
1133 unmapped_bytes = ublk_unmap_io(need_map, req, io);
1134
1135 /*
1136 * Extremely unlikely since we got the data filled in just before
1137 *
1138 * Re-read simply for this unlikely case.
1139 */
1140 if (unlikely(unmapped_bytes < io->res))
1141 io->res = unmapped_bytes;
1142
1143 /*
1144 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1145 * happens off this path, then that will prevent ublk's blkdev_release()
1146 * from being called on current's task work, see fput() implementation.
1147 *
1148 * Otherwise, the ublk server may not provide forward progress when
1149 * reading the partition table from bdev_open() with disk->open_mutex
1150 * held, which causes a deadlock as we could already be holding
1151 * disk->open_mutex here.
1152 *
1153 * Preferably we would not be doing IO with a mutex held that is also
1154 * used for release, but this work-around will suffice for now.
1155 */
1156 local_bh_disable();
1157 requeue = blk_update_request(req, BLK_STS_OK, io->res);
1158 local_bh_enable();
1159 if (requeue)
1160 blk_mq_requeue_request(req, true);
1161 else if (likely(!blk_should_fake_timeout(req->q)))
1162 __blk_mq_end_request(req, BLK_STS_OK);
1163
1164 return;
1165 exit:
1166 ublk_end_request(req, res);
1167 }
1168
1169 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1170 struct request *req)
1171 {
1172 /* read cmd first because req will overwrite it */
1173 struct io_uring_cmd *cmd = io->cmd;
1174
1175 /* mark this cmd owned by ublksrv */
1176 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1177
1178 /*
1179 * clear ACTIVE since we are done with this sqe/cmd slot
1180 * We can only accept an io cmd when it is not active.
1181 */
1182 io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1183
1184 io->req = req;
1185 return cmd;
1186 }
1187
1188 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1189 int res, unsigned issue_flags)
1190 {
1191 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1192
1193 /* tell ublksrv one io request is coming */
1194 io_uring_cmd_done(cmd, res, issue_flags);
1195 }
1196
1197 #define UBLK_REQUEUE_DELAY_MS 3
1198
1199 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1200 struct request *rq)
1201 {
1202 /* We cannot process this rq so just requeue it. */
1203 if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1204 blk_mq_requeue_request(rq, false);
1205 else
1206 ublk_end_request(rq, BLK_STS_IOERR);
1207 }
1208
1209 static void
1210 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1211 {
1212 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1213
1214 iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1215 }
1216
1217 enum auto_buf_reg_res {
1218 AUTO_BUF_REG_FAIL,
1219 AUTO_BUF_REG_FALLBACK,
1220 AUTO_BUF_REG_OK,
1221 };
1222
1223 static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq,
1224 struct request *req, struct ublk_io *io,
1225 struct io_uring_cmd *cmd,
1226 enum auto_buf_reg_res res)
1227 {
1228 if (res == AUTO_BUF_REG_OK) {
1229 io->task_registered_buffers = 1;
1230 io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1231 io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1232 }
1233 ublk_init_req_ref(ubq, io);
1234 __ublk_prep_compl_io_cmd(io, req);
1235 }
1236
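/*
 * Try to register the request's bvec buffer in the daemon's io_uring buffer
 * table at the server-provided index. On failure, either ask the server to
 * register the buffer itself (UBLK_AUTO_BUF_REG_FALLBACK) or fail the
 * request.
 */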
1237 static enum auto_buf_reg_res
1238 __ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
1239 struct ublk_io *io, struct io_uring_cmd *cmd,
1240 unsigned int issue_flags)
1241 {
1242 int ret;
1243
1244 ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1245 io->buf.auto_reg.index, issue_flags);
1246 if (ret) {
1247 if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1248 ublk_auto_buf_reg_fallback(ubq, req->tag);
1249 return AUTO_BUF_REG_FALLBACK;
1250 }
1251 ublk_end_request(req, BLK_STS_IOERR);
1252 return AUTO_BUF_REG_FAIL;
1253 }
1254
1255 return AUTO_BUF_REG_OK;
1256 }
1257
1258 static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
1259 struct ublk_io *io, struct io_uring_cmd *cmd,
1260 unsigned int issue_flags)
1261 {
1262 enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd,
1263 issue_flags);
1264
1265 if (res != AUTO_BUF_REG_FAIL) {
1266 ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res);
1267 io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1268 }
1269 }
1270
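/*
 * Map request data for the ublk server. On a partial map, shrink the io
 * descriptor accordingly; if nothing could be mapped, requeue the request
 * with a small delay and return false.
 */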
1271 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1272 struct ublk_io *io)
1273 {
1274 unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1275
1276 /* partially mapped, update io descriptor */
1277 if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1278 /*
1279 * Nothing mapped, retry until we succeed.
1280 *
1281 * We may never succeed in mapping any bytes here because
1282 * of OOM. TODO: reserve one buffer with single page pinned
1283 * for providing forward progress guarantee.
1284 */
1285 if (unlikely(!mapped_bytes)) {
1286 blk_mq_requeue_request(req, false);
1287 blk_mq_delay_kick_requeue_list(req->q,
1288 UBLK_REQUEUE_DELAY_MS);
1289 return false;
1290 }
1291
1292 ublk_get_iod(ubq, req->tag)->nr_sectors =
1293 mapped_bytes >> 9;
1294 }
1295
1296 return true;
1297 }
1298
1299 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1300 {
1301 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1302 int tag = req->tag;
1303 struct ublk_io *io = &ubq->ios[tag];
1304
1305 pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1306 __func__, ubq->q_id, req->tag, io->flags,
1307 ublk_get_iod(ubq, req->tag)->addr);
1308
1309 /*
1310 * Task is exiting if either:
1311 *
1312 * (1) current != io->task.
1313 * io_uring_cmd_complete_in_task() tries to run task_work
1314 * in a workqueue if cmd's task is PF_EXITING.
1315 *
1316 * (2) current->flags & PF_EXITING.
1317 */
1318 if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1319 __ublk_abort_rq(ubq, req);
1320 return;
1321 }
1322
1323 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1324 /*
1325 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1326 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1327 * and notify it.
1328 */
1329 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1330 pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1331 __func__, ubq->q_id, req->tag, io->flags);
1332 ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1333 issue_flags);
1334 return;
1335 }
1336
1337 if (!ublk_start_io(ubq, req, io))
1338 return;
1339
1340 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1341 ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags);
1342 } else {
1343 ublk_init_req_ref(ubq, io);
1344 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1345 }
1346 }
1347
1348 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1349 {
1350 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1351 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1352 struct ublk_queue *ubq = pdu->ubq;
1353
1354 ublk_dispatch_req(ubq, pdu->req);
1355 }
1356
1357 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1358 {
1359 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
1360 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1361
1362 pdu->req = rq;
1363 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
1364 }
1365
1366 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1367 {
1368 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1369 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1370 struct request *rq = pdu->req_list;
1371 struct request *next;
1372
1373 do {
1374 next = rq->rq_next;
1375 rq->rq_next = NULL;
1376 ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
1377 rq = next;
1378 } while (rq);
1379 }
1380
1381 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
1382 {
1383 struct io_uring_cmd *cmd = io->cmd;
1384 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1385
1386 pdu->req_list = rq_list_peek(l);
1387 rq_list_init(l);
1388 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
1389 }
1390
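/*
 * On I/O timeout, an unprivileged ublk daemon is not trusted to make
 * progress, so kill its task group; for other devices simply reset the
 * timer and keep waiting.
 */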
1391 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1392 {
1393 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1394 pid_t tgid = ubq->dev->ublksrv_tgid;
1395 struct task_struct *p;
1396 struct pid *pid;
1397
1398 if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
1399 return BLK_EH_RESET_TIMER;
1400
1401 if (unlikely(!tgid))
1402 return BLK_EH_RESET_TIMER;
1403
1404 rcu_read_lock();
1405 pid = find_vpid(tgid);
1406 p = pid_task(pid, PIDTYPE_PID);
1407 if (p)
1408 send_sig(SIGKILL, p, 0);
1409 rcu_read_unlock();
1410 return BLK_EH_DONE;
1411 }
1412
1413 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
1414 bool check_cancel)
1415 {
1416 blk_status_t res;
1417
1418 if (unlikely(READ_ONCE(ubq->fail_io)))
1419 return BLK_STS_TARGET;
1420
1421 /* With recovery feature enabled, force_abort is set in
1422 * ublk_stop_dev() before calling del_gendisk(). We have to
1423 * abort all requeued and new rqs here to let del_gendisk()
1424 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
1425 * to avoid UAF on io_uring ctx.
1426 *
1427 * Note: force_abort is guaranteed to be seen because it is set
1428 * before the request queue is unquiesced.
1429 */
1430 if (ublk_nosrv_should_queue_io(ubq) &&
1431 unlikely(READ_ONCE(ubq->force_abort)))
1432 return BLK_STS_IOERR;
1433
1434 if (check_cancel && unlikely(ubq->canceling))
1435 return BLK_STS_IOERR;
1436
1437 /* fill iod to slot in io cmd buffer */
1438 res = ublk_setup_iod(ubq, rq);
1439 if (unlikely(res != BLK_STS_OK))
1440 return BLK_STS_IOERR;
1441
1442 blk_mq_start_request(rq);
1443 return BLK_STS_OK;
1444 }
1445
1446 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
1447 const struct blk_mq_queue_data *bd)
1448 {
1449 struct ublk_queue *ubq = hctx->driver_data;
1450 struct request *rq = bd->rq;
1451 blk_status_t res;
1452
1453 res = ublk_prep_req(ubq, rq, false);
1454 if (res != BLK_STS_OK)
1455 return res;
1456
1457 /*
1458 * ->canceling has to be handled after ->force_abort and ->fail_io
1459 * are dealt with, otherwise this request may not be failed in case
1460 * of recovery, and may cause a hang when deleting the disk
1461 */
1462 if (unlikely(ubq->canceling)) {
1463 __ublk_abort_rq(ubq, rq);
1464 return BLK_STS_OK;
1465 }
1466
1467 ublk_queue_cmd(ubq, rq);
1468 return BLK_STS_OK;
1469 }
1470
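/*
 * Two I/Os may be queued to the daemon in one batch only if they are handled
 * by the same task and their uring_cmds belong to the same io_uring context.
 */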
1471 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
1472 const struct ublk_io *io2)
1473 {
1474 return (io_uring_cmd_ctx_handle(io->cmd) ==
1475 io_uring_cmd_ctx_handle(io2->cmd)) &&
1476 (io->task == io2->task);
1477 }
1478
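/*
 * Batched version of ->queue_rq(): prepare each request, group consecutive
 * requests that belong to the same daemon task/io_uring context, and push
 * each group to the daemon via a single task_work; requests that fail
 * preparation are put on the requeue list.
 */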
1479 static void ublk_queue_rqs(struct rq_list *rqlist)
1480 {
1481 struct rq_list requeue_list = { };
1482 struct rq_list submit_list = { };
1483 struct ublk_io *io = NULL;
1484 struct request *req;
1485
1486 while ((req = rq_list_pop(rqlist))) {
1487 struct ublk_queue *this_q = req->mq_hctx->driver_data;
1488 struct ublk_io *this_io = &this_q->ios[req->tag];
1489
1490 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
1491 rq_list_add_tail(&requeue_list, req);
1492 continue;
1493 }
1494
1495 if (io && !ublk_belong_to_same_batch(io, this_io) &&
1496 !rq_list_empty(&submit_list))
1497 ublk_queue_cmd_list(io, &submit_list);
1498 io = this_io;
1499 rq_list_add_tail(&submit_list, req);
1500 }
1501
1502 if (!rq_list_empty(&submit_list))
1503 ublk_queue_cmd_list(io, &submit_list);
1504 *rqlist = requeue_list;
1505 }
1506
1507 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1508 unsigned int hctx_idx)
1509 {
1510 struct ublk_device *ub = driver_data;
1511 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
1512
1513 hctx->driver_data = ubq;
1514 return 0;
1515 }
1516
1517 static const struct blk_mq_ops ublk_mq_ops = {
1518 .queue_rq = ublk_queue_rq,
1519 .queue_rqs = ublk_queue_rqs,
1520 .init_hctx = ublk_init_hctx,
1521 .timeout = ublk_timeout,
1522 };
1523
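/*
 * Reset per-io state (keeping only UBLK_IO_FLAG_CANCELED) after the ublk
 * server has gone, so that a new server incarnation can fetch these I/Os
 * again.
 */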
1524 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
1525 {
1526 int i;
1527
1528 for (i = 0; i < ubq->q_depth; i++) {
1529 struct ublk_io *io = &ubq->ios[i];
1530
1531 /*
1532 * UBLK_IO_FLAG_CANCELED is kept to avoid touching
1533 * io->cmd
1534 */
1535 io->flags &= UBLK_IO_FLAG_CANCELED;
1536 io->cmd = NULL;
1537 io->buf.addr = 0;
1538
1539 /*
1540 * old task is PF_EXITING, put it now
1541 *
1542 * It could be NULL when closing a quiesced
1543 * device.
1544 */
1545 if (io->task) {
1546 put_task_struct(io->task);
1547 io->task = NULL;
1548 }
1549
1550 WARN_ON_ONCE(refcount_read(&io->ref));
1551 WARN_ON_ONCE(io->task_registered_buffers);
1552 }
1553 }
1554
1555 static int ublk_ch_open(struct inode *inode, struct file *filp)
1556 {
1557 struct ublk_device *ub = container_of(inode->i_cdev,
1558 struct ublk_device, cdev);
1559
1560 if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
1561 return -EBUSY;
1562 filp->private_data = ub;
1563 ub->ublksrv_tgid = current->tgid;
1564 return 0;
1565 }
1566
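/*
 * Reset the char-device side state once the ublk server has closed the
 * device: reinit every queue and clear the mm/ready/daemon bookkeeping so
 * that a new server can attach.
 */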
1567 static void ublk_reset_ch_dev(struct ublk_device *ub)
1568 {
1569 int i;
1570
1571 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1572 ublk_queue_reinit(ub, ublk_get_queue(ub, i));
1573
1574 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
1575 ub->mm = NULL;
1576 ub->nr_io_ready = 0;
1577 ub->unprivileged_daemons = false;
1578 ub->ublksrv_tgid = -1;
1579 }
1580
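/*
 * Grab a reference on the gendisk under ub->lock so it stays valid even if
 * the disk is being torn down concurrently; paired with ublk_put_disk().
 */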
1581 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
1582 {
1583 struct gendisk *disk;
1584
1585 spin_lock(&ub->lock);
1586 disk = ub->ub_disk;
1587 if (disk)
1588 get_device(disk_to_dev(disk));
1589 spin_unlock(&ub->lock);
1590
1591 return disk;
1592 }
1593
1594 static void ublk_put_disk(struct gendisk *disk)
1595 {
1596 if (disk)
1597 put_device(disk_to_dev(disk));
1598 }
1599
1600 /*
1601 * Use this function to ensure that ->canceling is consistently set for
1602 * the device and all queues. Do not set these flags directly.
1603 *
1604 * Caller must ensure that:
1605 * - cancel_mutex is held. This ensures that there is no concurrent
1606 * access to ub->canceling and no concurrent writes to ubq->canceling.
1607 * - there are no concurrent reads of ubq->canceling from the queue_rq
1608 * path. This can be done by quiescing the queue, or through other
1609 * means.
1610 */
1611 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
1612 __must_hold(&ub->cancel_mutex)
1613 {
1614 int i;
1615
1616 ub->canceling = canceling;
1617 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1618 ublk_get_queue(ub, i)->canceling = canceling;
1619 }
1620
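/*
 * Return true if any io still holds an active reference (its refcount is
 * neither UBLK_REFCOUNT_INIT nor zero), e.g. from user copy or a registered
 * buffer; otherwise reset all refcounts to zero so teardown can proceed.
 */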
1621 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
1622 {
1623 int i, j;
1624
1625 if (!ublk_dev_need_req_ref(ub))
1626 return false;
1627
1628 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
1629 struct ublk_queue *ubq = ublk_get_queue(ub, i);
1630
1631 for (j = 0; j < ubq->q_depth; j++) {
1632 struct ublk_io *io = &ubq->ios[j];
1633 unsigned int refs = refcount_read(&io->ref) +
1634 io->task_registered_buffers;
1635
1636 /*
1637 * UBLK_REFCOUNT_INIT or zero means no active
1638 * reference
1639 */
1640 if (refs != UBLK_REFCOUNT_INIT && refs != 0)
1641 return true;
1642
1643 /* reset to zero if the io has no active references */
1644 refcount_set(&io->ref, 0);
1645 io->task_registered_buffers = 0;
1646 }
1647 }
1648 return false;
1649 }
1650
1651 static void ublk_ch_release_work_fn(struct work_struct *work)
1652 {
1653 struct ublk_device *ub =
1654 container_of(work, struct ublk_device, exit_work.work);
1655 struct gendisk *disk;
1656 int i;
1657
1658 /*
1659 * For zero-copy and auto buffer register modes, I/O references
1660 * might not be dropped naturally when the daemon is killed, but
1661 * io_uring guarantees that registered bvec kernel buffers are
1662 * finally unregistered when the io_uring context is freed, at which
1663 * point the active references are dropped.
1664 *
1665 * Wait until the active references are dropped to avoid use-after-free.
1666 *
1667 * A registered buffer may be unregistered in io_uring's release handler,
1668 * so we have to wait by scheduling this work function to avoid a
1669 * dependency between the two file releases.
1670 */
1671 if (ublk_check_and_reset_active_ref(ub)) {
1672 schedule_delayed_work(&ub->exit_work, 1);
1673 return;
1674 }
1675
1676 /*
1677 * disk isn't attached: either the device isn't live yet, or it has
1678 * been removed already, so we needn't do anything
1679 */
1680 disk = ublk_get_disk(ub);
1681 if (!disk)
1682 goto out;
1683
1684 /*
1685 * All uring_cmds are done now, so abort any request outstanding to
1686 * the ublk server
1687 *
1688 * This can be done in a lockless way because the ublk server is
1689 * gone
1690 *
1691 * More importantly, we have to provide a forward progress guarantee
1692 * without holding ub->mutex, otherwise a control task grabbing
1693 * ub->mutex would trigger a deadlock
1694 *
1695 * All requests may be inflight, so ->canceling may not be set; set
1696 * it now.
1697 */
1698 mutex_lock(&ub->cancel_mutex);
1699 ublk_set_canceling(ub, true);
1700 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1701 ublk_abort_queue(ub, ublk_get_queue(ub, i));
1702 mutex_unlock(&ub->cancel_mutex);
1703 blk_mq_kick_requeue_list(disk->queue);
1704
1705 /*
1706 * All inflight requests have been completed or requeued and any new
1707 * request will be failed or requeued via `->canceling` now, so it is
1708 * fine to grab ub->mutex now.
1709 */
1710 mutex_lock(&ub->mutex);
1711
1712 /* double check after grabbing lock */
1713 if (!ub->ub_disk)
1714 goto unlock;
1715
1716 /*
1717 * Transition the device to the nosrv state. What exactly this
1718 * means depends on the recovery flags
1719 */
1720 if (ublk_nosrv_should_stop_dev(ub)) {
1721 /*
1722 * Allow any pending/future I/O to pass through quickly
1723 * with an error. This is needed because del_gendisk
1724 * waits for all pending I/O to complete
1725 */
1726 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1727 WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
1728
1729 ublk_stop_dev_unlocked(ub);
1730 } else {
1731 if (ublk_nosrv_dev_should_queue_io(ub)) {
1732 /* ->canceling is set and all requests are aborted */
1733 ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1734 } else {
1735 ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
1736 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1737 WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
1738 }
1739 }
1740 unlock:
1741 mutex_unlock(&ub->mutex);
1742 ublk_put_disk(disk);
1743
1744 /* all uring_cmds have been done now, reset device & ubq */
1745 ublk_reset_ch_dev(ub);
1746 out:
1747 clear_bit(UB_STATE_OPEN, &ub->state);
1748
1749 /* put the reference grabbed in ublk_ch_release() */
1750 ublk_put_device(ub);
1751 }
1752
1753 static int ublk_ch_release(struct inode *inode, struct file *filp)
1754 {
1755 struct ublk_device *ub = filp->private_data;
1756
1757 /*
1758 * Grab a ublk device reference, so it won't be gone until we are
1759 * really released from the work function.
1760 */
1761 ublk_get_device(ub);
1762
1763 INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
1764 schedule_delayed_work(&ub->exit_work, 0);
1765 return 0;
1766 }
1767
1768 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
1769 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
1770 {
1771 struct ublk_device *ub = filp->private_data;
1772 size_t sz = vma->vm_end - vma->vm_start;
1773 unsigned max_sz = ublk_max_cmd_buf_size();
1774 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
1775 int q_id, ret = 0;
1776
1777 spin_lock(&ub->lock);
1778 if (!ub->mm)
1779 ub->mm = current->mm;
1780 if (current->mm != ub->mm)
1781 ret = -EINVAL;
1782 spin_unlock(&ub->lock);
1783
1784 if (ret)
1785 return ret;
1786
1787 if (vma->vm_flags & VM_WRITE)
1788 return -EPERM;
1789
1790 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
1791 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
1792 return -EINVAL;
1793
1794 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
1795 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
1796 __func__, q_id, current->pid, vma->vm_start,
1797 phys_off, (unsigned long)sz);
1798
1799 if (sz != ublk_queue_cmd_buf_size(ub))
1800 return -EINVAL;
1801
1802 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
1803 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1804 }
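
/*
 * Illustrative userspace sketch, not driver code: a ublk server maps queue
 * q_id's descriptor array read-only with the same offset/stride scheme
 * checked above.  struct ublksrv_io_desc and UBLKSRV_CMD_BUF_OFFSET come
 * from <linux/ublk_cmd.h>; the stride below mirrors ublk_max_cmd_buf_size()
 * and is an assumption of this sketch:
 *
 *	size_t stride = round_up(max_queue_depth *
 *				 sizeof(struct ublksrv_io_desc), page_size);
 *	off_t off = UBLKSRV_CMD_BUF_OFFSET + (off_t)q_id * stride;
 *	size_t len = round_up(queue_depth *
 *			      sizeof(struct ublksrv_io_desc), page_size);
 *	struct ublksrv_io_desc *iods = mmap(NULL, len, PROT_READ, MAP_SHARED,
 *					    ublkc_fd, off);
 *
 * A writable mapping is rejected with -EPERM, and len has to match the
 * kernel's ublk_queue_cmd_buf_size() exactly or the mmap fails with -EINVAL.
 */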
1805
1806 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
1807 struct request *req)
1808 {
1809 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
1810
1811 if (ublk_nosrv_should_reissue_outstanding(ub))
1812 blk_mq_requeue_request(req, false);
1813 else {
1814 io->res = -EIO;
1815 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
1816 }
1817 }
1818
1819 /*
1820 * Called from the ublk char device release handler, once every uring_cmd
1821 * is done; meanwhile the request queue is effectively "quiesced" since
1822 * inflight requests can't be completed while the ublk server is dead.
1823 *
1824 * So no one can hold our request IO reference any more; simply ignore the
1825 * reference and complete the request immediately
1826 */
1827 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
1828 {
1829 int i;
1830
1831 for (i = 0; i < ubq->q_depth; i++) {
1832 struct ublk_io *io = &ubq->ios[i];
1833
1834 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1835 __ublk_fail_req(ub, io, io->req);
1836 }
1837 }
1838
1839 static void ublk_start_cancel(struct ublk_device *ub)
1840 {
1841 struct gendisk *disk = ublk_get_disk(ub);
1842
1843 /* the disk is already gone */
1844 if (!disk)
1845 return;
1846
1847 mutex_lock(&ub->cancel_mutex);
1848 if (ub->canceling)
1849 goto out;
1850 /*
1851 * Now we are serialized with ublk_queue_rq()
1852 *
1853 * Make sure that ubq->canceling is set while the queue is quiesced,
1854 * because ublk_queue_rq() has to rely on this flag to avoid touching
1855 * a completed uring_cmd
1856 */
1857 blk_mq_quiesce_queue(disk->queue);
1858 ublk_set_canceling(ub, true);
1859 blk_mq_unquiesce_queue(disk->queue);
1860 out:
1861 mutex_unlock(&ub->cancel_mutex);
1862 ublk_put_disk(disk);
1863 }
1864
1865 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
1866 unsigned int issue_flags)
1867 {
1868 struct ublk_io *io = &ubq->ios[tag];
1869 struct ublk_device *ub = ubq->dev;
1870 struct request *req;
1871 bool done;
1872
1873 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1874 return;
1875
1876 /*
1877 * Don't try to cancel this command if the request has been started,
1878 * to avoid a race between io_uring_cmd_done() and
1879 * io_uring_cmd_complete_in_task().
1880 *
1881 * Either the started request will be aborted via __ublk_abort_rq()
1882 * and this uring_cmd gets canceled next time, or it will be done in
1883 * the task work function ublk_dispatch_req(), because io_uring
1884 * guarantees that ublk_dispatch_req() is always called
1885 */
1886 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1887 if (req && blk_mq_request_started(req) && req->tag == tag)
1888 return;
1889
1890 spin_lock(&ubq->cancel_lock);
1891 done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
1892 if (!done)
1893 io->flags |= UBLK_IO_FLAG_CANCELED;
1894 spin_unlock(&ubq->cancel_lock);
1895
1896 if (!done)
1897 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
1898 }
1899
1900 /*
1901 * The ublk char device won't be closed while the cancel fn is running, so
1902 * both the ublk device and the queue are guaranteed to be live
1903 *
1904 * Two-stage cancel:
1905 *
1906 * - complete every active uring_cmd in ->cancel_fn()
1907 *
1908 * - abort inflight ublk IO requests in the ublk char device release
1909 *   handler, which depends on the 1st stage because the device can only
1910 *   be closed after all uring_cmds are done
1911 *
1912 * Do _not_ try to acquire ub->mutex before all inflight requests are
1913 * aborted, otherwise a deadlock may be caused.
1914 */
1915 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
1916 unsigned int issue_flags)
1917 {
1918 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1919 struct ublk_queue *ubq = pdu->ubq;
1920 struct task_struct *task;
1921 struct ublk_io *io;
1922
1923 if (WARN_ON_ONCE(!ubq))
1924 return;
1925
1926 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
1927 return;
1928
1929 task = io_uring_cmd_get_task(cmd);
1930 io = &ubq->ios[pdu->tag];
1931 if (WARN_ON_ONCE(task && task != io->task))
1932 return;
1933
1934 ublk_start_cancel(ubq->dev);
1935
1936 WARN_ON_ONCE(io->cmd != cmd);
1937 ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
1938 }
1939
1940 static inline bool ublk_dev_ready(const struct ublk_device *ub)
1941 {
1942 u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth;
1943
1944 return ub->nr_io_ready == total;
1945 }
1946
1947 static void ublk_cancel_queue(struct ublk_queue *ubq)
1948 {
1949 int i;
1950
1951 for (i = 0; i < ubq->q_depth; i++)
1952 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
1953 }
1954
1955 /* Cancel all pending commands, must be called after del_gendisk() returns */
1956 static void ublk_cancel_dev(struct ublk_device *ub)
1957 {
1958 int i;
1959
1960 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1961 ublk_cancel_queue(ublk_get_queue(ub, i));
1962 }
1963
1964 static bool ublk_check_inflight_rq(struct request *rq, void *data)
1965 {
1966 bool *idle = data;
1967
1968 if (blk_mq_request_started(rq)) {
1969 *idle = false;
1970 return false;
1971 }
1972 return true;
1973 }
1974
1975 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1976 {
1977 bool idle;
1978
1979 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1980 while (true) {
1981 idle = true;
1982 blk_mq_tagset_busy_iter(&ub->tag_set,
1983 ublk_check_inflight_rq, &idle);
1984 if (idle)
1985 break;
1986 msleep(UBLK_REQUEUE_DELAY_MS);
1987 }
1988 }
1989
1990 static void ublk_force_abort_dev(struct ublk_device *ub)
1991 {
1992 int i;
1993
1994 pr_devel("%s: force abort ub: dev_id %d state %s\n",
1995 __func__, ub->dev_info.dev_id,
1996 ub->dev_info.state == UBLK_S_DEV_LIVE ?
1997 "LIVE" : "QUIESCED");
1998 blk_mq_quiesce_queue(ub->ub_disk->queue);
1999 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2000 ublk_wait_tagset_rqs_idle(ub);
2001
2002 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2003 ublk_get_queue(ub, i)->force_abort = true;
2004 blk_mq_unquiesce_queue(ub->ub_disk->queue);
2005 /* We may have requeued some rqs in ublk_quiesce_queue() */
2006 blk_mq_kick_requeue_list(ub->ub_disk->queue);
2007 }
2008
2009 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2010 {
2011 struct gendisk *disk;
2012
2013 /* Sync with ublk_abort_queue() by holding the lock */
2014 spin_lock(&ub->lock);
2015 disk = ub->ub_disk;
2016 ub->dev_info.state = UBLK_S_DEV_DEAD;
2017 ub->dev_info.ublksrv_pid = -1;
2018 ub->ub_disk = NULL;
2019 spin_unlock(&ub->lock);
2020
2021 return disk;
2022 }
2023
2024 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2025 __must_hold(&ub->mutex)
2026 {
2027 struct gendisk *disk;
2028
2029 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2030 return;
2031
2032 if (ublk_nosrv_dev_should_queue_io(ub))
2033 ublk_force_abort_dev(ub);
2034 del_gendisk(ub->ub_disk);
2035 disk = ublk_detach_disk(ub);
2036 put_disk(disk);
2037 }
2038
2039 static void ublk_stop_dev(struct ublk_device *ub)
2040 {
2041 mutex_lock(&ub->mutex);
2042 ublk_stop_dev_unlocked(ub);
2043 mutex_unlock(&ub->mutex);
2044 flush_work(&ub->partition_scan_work);
2045 ublk_cancel_dev(ub);
2046 }
2047
2048 /* reset ublk io_uring queue & io flags */
2049 static void ublk_reset_io_flags(struct ublk_device *ub)
2050 {
2051 int i, j;
2052
2053 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2054 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2055
2056 /* UBLK_IO_FLAG_CANCELED can be cleared now */
2057 spin_lock(&ubq->cancel_lock);
2058 for (j = 0; j < ubq->q_depth; j++)
2059 ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2060 spin_unlock(&ubq->cancel_lock);
2061 ubq->fail_io = false;
2062 }
2063 mutex_lock(&ub->cancel_mutex);
2064 ublk_set_canceling(ub, false);
2065 mutex_unlock(&ub->cancel_mutex);
2066 }
2067
2068 /* device can only be started after all IOs are ready */
2069 static void ublk_mark_io_ready(struct ublk_device *ub)
2070 __must_hold(&ub->mutex)
2071 {
2072 if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2073 ub->unprivileged_daemons = true;
2074
2075 ub->nr_io_ready++;
2076 if (ublk_dev_ready(ub)) {
2077 /* now we are ready for handling ublk io request */
2078 ublk_reset_io_flags(ub);
2079 complete_all(&ub->completion);
2080 }
2081 }
2082
2083 static inline int ublk_check_cmd_op(u32 cmd_op)
2084 {
2085 u32 ioc_type = _IOC_TYPE(cmd_op);
2086
2087 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2088 return -EOPNOTSUPP;
2089
2090 if (ioc_type != 'u' && ioc_type != 0)
2091 return -EOPNOTSUPP;
2092
2093 return 0;
2094 }
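
/*
 * For reference, a sketch of the opcode encoding this check relies on (based
 * on the uapi header, shown here only as an example): the ioctl-encoded
 * opcodes carry 'u' in their _IOC_TYPE field, e.g.
 *
 *	#define UBLK_U_IO_FETCH_REQ \
 *		_IOWR('u', UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd)
 *
 * so _IOC_TYPE(cmd_op) == 'u' for them, while the legacy opcodes are plain
 * small integers whose _IOC_TYPE() is 0 and are only accepted when
 * CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is enabled.
 */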
2095
2096 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2097 {
2098 struct ublk_auto_buf_reg buf;
2099
2100 buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2101
2102 if (buf.reserved0 || buf.reserved1)
2103 return -EINVAL;
2104
2105 if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2106 return -EINVAL;
2107 io->buf.auto_reg = buf;
2108 return 0;
2109 }
2110
2111 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
2112 struct io_uring_cmd *cmd,
2113 u16 *buf_idx)
2114 {
2115 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
2116 io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
2117
2118 /*
2119 * `UBLK_F_AUTO_BUF_REG` only works if `UBLK_IO_FETCH_REQ`
2120 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
2121 * `io_ring_ctx`.
2122 *
2123 * If this uring_cmd's io_ring_ctx isn't the same as the
2124 * one used for registering the buffer, it is the ublk server's
2125 * responsibility to unregister the buffer, otherwise
2126 * this ublk request gets stuck.
2127 */
2128 if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
2129 *buf_idx = io->buf.auto_reg.index;
2130 }
2131
2132 return ublk_set_auto_buf_reg(io, cmd);
2133 }
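
/*
 * Userspace side, as a hedged sketch: with UBLK_F_AUTO_BUF_REG the server
 * packs the target fixed-buffer slot into sqe->addr instead of passing a
 * buffer address.  struct ublk_auto_buf_reg comes from <linux/ublk_cmd.h>;
 * the packing helper named below is an assumption of this sketch (it is the
 * inverse of ublk_sqe_addr_to_auto_buf_reg() used above):
 *
 *	struct ublk_auto_buf_reg reg = {
 *		.index = buf_slot,                   // fixed buffer table slot
 *		.flags = UBLK_AUTO_BUF_REG_FALLBACK, // optional
 *	};
 *	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&reg);
 */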
2134
2135 /* Once we return, `io->req` can't be used any more */
2136 static inline struct request *
2137 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
2138 {
2139 struct request *req = io->req;
2140
2141 io->cmd = cmd;
2142 io->flags |= UBLK_IO_FLAG_ACTIVE;
2143 /* now this cmd slot is owned by ublk driver */
2144 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
2145
2146 return req;
2147 }
2148
2149 static inline int
2150 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
2151 struct io_uring_cmd *cmd, unsigned long buf_addr,
2152 u16 *buf_idx)
2153 {
2154 if (ublk_dev_support_auto_buf_reg(ub))
2155 return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
2156
2157 io->buf.addr = buf_addr;
2158 return 0;
2159 }
2160
2161 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
2162 unsigned int issue_flags,
2163 struct ublk_queue *ubq, unsigned int tag)
2164 {
2165 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2166
2167 /*
2168 * Safe to refer to @ubq since the ublk_queue won't be freed until its
2169 * commands are completed
2170 */
2171 pdu->ubq = ubq;
2172 pdu->tag = tag;
2173 io_uring_cmd_mark_cancelable(cmd, issue_flags);
2174 }
2175
2176 static void ublk_io_release(void *priv)
2177 {
2178 struct request *rq = priv;
2179 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2180 struct ublk_io *io = &ubq->ios[rq->tag];
2181
2182 /*
2183 * task_registered_buffers may be 0 if buffers were registered off task
2184 * but unregistered on task, or after UBLK_IO_COMMIT_AND_FETCH_REQ.
2185 */
2186 if (current == io->task && io->task_registered_buffers)
2187 io->task_registered_buffers--;
2188 else
2189 ublk_put_req_ref(io, rq);
2190 }
2191
2192 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
2193 struct ublk_device *ub,
2194 u16 q_id, u16 tag,
2195 struct ublk_io *io,
2196 unsigned int index, unsigned int issue_flags)
2197 {
2198 struct request *req;
2199 int ret;
2200
2201 if (!ublk_dev_support_zero_copy(ub))
2202 return -EINVAL;
2203
2204 req = __ublk_check_and_get_req(ub, q_id, tag, io, 0);
2205 if (!req)
2206 return -EINVAL;
2207
2208 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2209 issue_flags);
2210 if (ret) {
2211 ublk_put_req_ref(io, req);
2212 return ret;
2213 }
2214
2215 return 0;
2216 }
2217
2218 static int
2219 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
2220 struct ublk_device *ub,
2221 u16 q_id, u16 tag, struct ublk_io *io,
2222 unsigned index, unsigned issue_flags)
2223 {
2224 unsigned new_registered_buffers;
2225 struct request *req = io->req;
2226 int ret;
2227
2228 /*
2229 * Ensure there are still references for ublk_sub_req_ref() to release.
2230 * If not, fall back on the thread-safe buffer registration.
2231 */
2232 new_registered_buffers = io->task_registered_buffers + 1;
2233 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
2234 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
2235 issue_flags);
2236
2237 if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
2238 return -EINVAL;
2239
2240 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2241 issue_flags);
2242 if (ret)
2243 return ret;
2244
2245 io->task_registered_buffers = new_registered_buffers;
2246 return 0;
2247 }
2248
2249 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
2250 const struct ublk_device *ub,
2251 unsigned int index, unsigned int issue_flags)
2252 {
2253 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
2254 return -EINVAL;
2255
2256 return io_buffer_unregister_bvec(cmd, index, issue_flags);
2257 }
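
/*
 * Zero-copy flow from the server side, as a hedged sketch (the linking
 * policy and the backing I/O are illustrative, not mandated by the driver):
 *
 *	sqe0: UBLK_U_IO_REGISTER_IO_BUF   { q_id, tag }, addr = slot
 *	sqe1: IORING_OP_WRITE_FIXED to the backing file, buf_index = slot
 *	sqe2: UBLK_U_IO_UNREGISTER_IO_BUF { q_id, tag }, addr = slot
 *
 * The register step pins the request pages into the ring's fixed buffer
 * table via io_buffer_register_bvec(); the backend I/O then references them
 * by buf_index, and the final unregister drops the per-buffer reference
 * taken above.
 */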
2258
2259 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
2260 {
2261 if (ublk_dev_need_map_io(ub)) {
2262 /*
2263 * FETCH_RQ has to provide an IO buffer if NEED GET
2264 * DATA is not enabled
2265 */
2266 if (!buf_addr && !ublk_dev_need_get_data(ub))
2267 return -EINVAL;
2268 } else if (buf_addr) {
2269 /* User copy requires addr to be unset */
2270 return -EINVAL;
2271 }
2272 return 0;
2273 }
2274
2275 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
2276 struct ublk_io *io)
2277 {
2278 /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
2279 if (ublk_dev_ready(ub))
2280 return -EBUSY;
2281
2282 /* allow each command to be FETCHed at most once */
2283 if (io->flags & UBLK_IO_FLAG_ACTIVE)
2284 return -EINVAL;
2285
2286 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
2287
2288 ublk_fill_io_cmd(io, cmd);
2289
2290 WRITE_ONCE(io->task, get_task_struct(current));
2291 ublk_mark_io_ready(ub);
2292
2293 return 0;
2294 }
2295
2296 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
2297 struct ublk_io *io, __u64 buf_addr)
2298 {
2299 int ret;
2300
2301 /*
2302 * When handling FETCH command for setting up ublk uring queue,
2303 * ub->mutex is the innermost lock, and we won't block for handling
2304 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
2305 */
2306 mutex_lock(&ub->mutex);
2307 ret = __ublk_fetch(cmd, ub, io);
2308 if (!ret)
2309 ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
2310 mutex_unlock(&ub->mutex);
2311 return ret;
2312 }
2313
2314 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
2315 struct ublk_io *io, __u64 buf_addr)
2316 {
2317 struct request *req = io->req;
2318
2319 if (ublk_dev_need_map_io(ub)) {
2320 /*
2321 * COMMIT_AND_FETCH_REQ has to provide an IO buffer if
2322 * NEED GET DATA is not enabled or this is a read IO.
2323 */
2324 if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
2325 req_op(req) == REQ_OP_READ))
2326 return -EINVAL;
2327 } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
2328 /*
2329 * User copy requires addr to be unset when command is
2330 * not zone append
2331 */
2332 return -EINVAL;
2333 }
2334
2335 return 0;
2336 }
2337
2338 static bool ublk_need_complete_req(const struct ublk_device *ub,
2339 struct ublk_io *io)
2340 {
2341 if (ublk_dev_need_req_ref(ub))
2342 return ublk_sub_req_ref(io);
2343 return true;
2344 }
2345
2346 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
2347 struct request *req)
2348 {
2349 /*
2350 * We have handled UBLK_IO_NEED_GET_DATA command,
2351 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
2352 * do the copy work.
2353 */
2354 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
2355 /* update iod->addr because ublksrv may have passed a new io buffer */
2356 ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
2357 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
2358 __func__, ubq->q_id, req->tag, io->flags,
2359 ublk_get_iod(ubq, req->tag)->addr);
2360
2361 return ublk_start_io(ubq, req, io);
2362 }
2363
2364 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
2365 unsigned int issue_flags)
2366 {
2367 /* May point to userspace-mapped memory */
2368 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
2369 u16 buf_idx = UBLK_INVALID_BUF_IDX;
2370 struct ublk_device *ub = cmd->file->private_data;
2371 struct ublk_queue *ubq;
2372 struct ublk_io *io = NULL;
2373 u32 cmd_op = cmd->cmd_op;
2374 u16 q_id = READ_ONCE(ub_src->q_id);
2375 u16 tag = READ_ONCE(ub_src->tag);
2376 s32 result = READ_ONCE(ub_src->result);
2377 u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
2378 struct request *req;
2379 int ret;
2380 bool compl;
2381
2382 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
2383
2384 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
2385 __func__, cmd->cmd_op, q_id, tag, result);
2386
2387 ret = ublk_check_cmd_op(cmd_op);
2388 if (ret)
2389 goto out;
2390
2391 /*
2392 * io_buffer_unregister_bvec() doesn't access the ubq or io,
2393 * so no need to validate the q_id, tag, or task
2394 */
2395 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
2396 return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
2397
2398 ret = -EINVAL;
2399 if (q_id >= ub->dev_info.nr_hw_queues)
2400 goto out;
2401
2402 ubq = ublk_get_queue(ub, q_id);
2403
2404 if (tag >= ub->dev_info.queue_depth)
2405 goto out;
2406
2407 io = &ubq->ios[tag];
2408 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
2409 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
2410 ret = ublk_check_fetch_buf(ub, addr);
2411 if (ret)
2412 goto out;
2413 ret = ublk_fetch(cmd, ub, io, addr);
2414 if (ret)
2415 goto out;
2416
2417 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2418 return -EIOCBQUEUED;
2419 }
2420
2421 if (READ_ONCE(io->task) != current) {
2422 /*
2423 * ublk_register_io_buf() accesses only the io's refcount,
2424 * so can be handled on any task
2425 */
2426 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
2427 return ublk_register_io_buf(cmd, ub, q_id, tag, io,
2428 addr, issue_flags);
2429
2430 goto out;
2431 }
2432
2433 /* there is a pending io cmd, something must be wrong */
2434 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
2435 ret = -EBUSY;
2436 goto out;
2437 }
2438
2439 /*
2440 * ensure that the user issues UBLK_IO_NEED_GET_DATA
2441 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
2442 */
2443 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
2444 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
2445 goto out;
2446
2447 switch (_IOC_NR(cmd_op)) {
2448 case UBLK_IO_REGISTER_IO_BUF:
2449 return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
2450 issue_flags);
2451 case UBLK_IO_COMMIT_AND_FETCH_REQ:
2452 ret = ublk_check_commit_and_fetch(ub, io, addr);
2453 if (ret)
2454 goto out;
2455 io->res = result;
2456 req = ublk_fill_io_cmd(io, cmd);
2457 ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
2458 compl = ublk_need_complete_req(ub, io);
2459
2460 /* can't touch 'ublk_io' any more */
2461 if (buf_idx != UBLK_INVALID_BUF_IDX)
2462 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
2463 if (req_op(req) == REQ_OP_ZONE_APPEND)
2464 req->__sector = addr;
2465 if (compl)
2466 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
2467
2468 if (ret)
2469 goto out;
2470 break;
2471 case UBLK_IO_NEED_GET_DATA:
2472 /*
2473 * ublk_get_data() may fail and fall back to requeueing, so keep the
2474 * uring_cmd active first and prepare for handling the requeued
2475 * request
2476 */
2477 req = ublk_fill_io_cmd(io, cmd);
2478 ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
2479 WARN_ON_ONCE(ret);
2480 if (likely(ublk_get_data(ubq, io, req))) {
2481 __ublk_prep_compl_io_cmd(io, req);
2482 return UBLK_IO_RES_OK;
2483 }
2484 break;
2485 default:
2486 goto out;
2487 }
2488 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2489 return -EIOCBQUEUED;
2490
2491 out:
2492 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
2493 __func__, cmd_op, tag, ret, io ? io->flags : 0);
2494 return ret;
2495 }
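
/*
 * The per-tag daemon loop this dispatcher expects, as a hedged userspace
 * sketch (everything other than the uapi opcodes and structs is
 * illustrative):
 *
 *	1) queue UBLK_U_IO_FETCH_REQ once per tag, with
 *	   struct ublksrv_io_cmd { .q_id, .tag, .addr } in the sqe cmd area;
 *	2) on completion, read the request details from the mmap'ed
 *	   struct ublksrv_io_desc for (q_id, tag);
 *	3) handle the I/O, then queue UBLK_U_IO_COMMIT_AND_FETCH_REQ with
 *	   .result set to the byte count or a negative errno, which both
 *	   completes the block request and re-arms the tag.
 *
 * FETCH_REQ may be issued from any task; later commands for the tag must
 * come from the task recorded in io->task, except for the buffer
 * register/unregister cases special-cased above.
 */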
2496
2497 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
2498 u16 q_id, u16 tag, struct ublk_io *io, size_t offset)
2499 {
2500 struct request *req;
2501
2502 /*
2503 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
2504 * which would overwrite it with io->cmd
2505 */
2506 req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
2507 if (!req)
2508 return NULL;
2509
2510 if (!ublk_get_req_ref(io))
2511 return NULL;
2512
2513 if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
2514 goto fail_put;
2515
2516 if (!ublk_rq_has_data(req))
2517 goto fail_put;
2518
2519 if (offset > blk_rq_bytes(req))
2520 goto fail_put;
2521
2522 return req;
2523 fail_put:
2524 ublk_put_req_ref(io, req);
2525 return NULL;
2526 }
2527
2528 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2529 {
2530 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
2531 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2532 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
2533
2534 if (ret != -EIOCBQUEUED)
2535 io_uring_cmd_done(cmd, ret, issue_flags);
2536 }
2537
2538 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
2539 {
2540 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
2541 ublk_uring_cmd_cancel_fn(cmd, issue_flags);
2542 return 0;
2543 }
2544
2545 /* a well-implemented server won't hit the unlocked path */
2546 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
2547 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
2548 return -EIOCBQUEUED;
2549 }
2550
2551 return ublk_ch_uring_cmd_local(cmd, issue_flags);
2552 }
2553
2554 static inline bool ublk_check_ubuf_dir(const struct request *req,
2555 int ubuf_dir)
2556 {
2557 /* copy ubuf to request pages */
2558 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
2559 ubuf_dir == ITER_SOURCE)
2560 return true;
2561
2562 /* copy request pages to ubuf */
2563 if ((req_op(req) == REQ_OP_WRITE ||
2564 req_op(req) == REQ_OP_ZONE_APPEND) &&
2565 ubuf_dir == ITER_DEST)
2566 return true;
2567
2568 return false;
2569 }
2570
2571 static struct request *ublk_check_and_get_req(struct kiocb *iocb,
2572 struct iov_iter *iter, size_t *off, int dir,
2573 struct ublk_io **io)
2574 {
2575 struct ublk_device *ub = iocb->ki_filp->private_data;
2576 struct ublk_queue *ubq;
2577 struct request *req;
2578 size_t buf_off;
2579 u16 tag, q_id;
2580
2581 if (!user_backed_iter(iter))
2582 return ERR_PTR(-EACCES);
2583
2584 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2585 return ERR_PTR(-EACCES);
2586
2587 tag = ublk_pos_to_tag(iocb->ki_pos);
2588 q_id = ublk_pos_to_hwq(iocb->ki_pos);
2589 buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
2590
2591 if (q_id >= ub->dev_info.nr_hw_queues)
2592 return ERR_PTR(-EINVAL);
2593
2594 ubq = ublk_get_queue(ub, q_id);
2595 if (!ublk_dev_support_user_copy(ub))
2596 return ERR_PTR(-EACCES);
2597
2598 if (tag >= ub->dev_info.queue_depth)
2599 return ERR_PTR(-EINVAL);
2600
2601 *io = &ubq->ios[tag];
2602 req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off);
2603 if (!req)
2604 return ERR_PTR(-EINVAL);
2605
2606 if (!ublk_check_ubuf_dir(req, dir))
2607 goto fail;
2608
2609 *off = buf_off;
2610 return req;
2611 fail:
2612 ublk_put_req_ref(*io, req);
2613 return ERR_PTR(-EACCES);
2614 }
2615
2616 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
2617 {
2618 struct request *req;
2619 struct ublk_io *io;
2620 size_t buf_off;
2621 size_t ret;
2622
2623 req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io);
2624 if (IS_ERR(req))
2625 return PTR_ERR(req);
2626
2627 ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
2628 ublk_put_req_ref(io, req);
2629
2630 return ret;
2631 }
2632
2633 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
2634 {
2635 struct request *req;
2636 struct ublk_io *io;
2637 size_t buf_off;
2638 size_t ret;
2639
2640 req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io);
2641 if (IS_ERR(req))
2642 return PTR_ERR(req);
2643
2644 ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
2645 ublk_put_req_ref(io, req);
2646
2647 return ret;
2648 }
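
/*
 * With UBLK_F_USER_COPY the server moves data with pread()/pwrite() on
 * /dev/ublkcN instead of a pre-mapped buffer.  Hedged sketch: the offset is
 * decoded by ublk_pos_to_hwq()/ublk_pos_to_tag()/ublk_pos_to_buf_off()
 * above, and the matching encoding constants live in <linux/ublk_cmd.h>
 * (the exact macro names below are an assumption of this sketch):
 *
 *	off_t pos = UBLKSRV_IO_BUF_OFFSET |
 *		    ((__u64)q_id << UBLK_QID_OFF) |
 *		    ((__u64)tag << UBLK_TAG_OFF);
 *	// WRITE request: fetch the payload from the kernel
 *	pread(ublkc_fd, buf, iod->nr_sectors << 9, pos);
 *	// READ request: hand the payload back to the kernel
 *	pwrite(ublkc_fd, buf, iod->nr_sectors << 9, pos);
 */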
2649
2650 static const struct file_operations ublk_ch_fops = {
2651 .owner = THIS_MODULE,
2652 .open = ublk_ch_open,
2653 .release = ublk_ch_release,
2654 .read_iter = ublk_ch_read_iter,
2655 .write_iter = ublk_ch_write_iter,
2656 .uring_cmd = ublk_ch_uring_cmd,
2657 .mmap = ublk_ch_mmap,
2658 };
2659
2660 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
2661 {
2662 struct ublk_queue *ubq = ub->queues[q_id];
2663 int size, i;
2664
2665 if (!ubq)
2666 return;
2667
2668 size = ublk_queue_cmd_buf_size(ub);
2669
2670 for (i = 0; i < ubq->q_depth; i++) {
2671 struct ublk_io *io = &ubq->ios[i];
2672 if (io->task)
2673 put_task_struct(io->task);
2674 WARN_ON_ONCE(refcount_read(&io->ref));
2675 WARN_ON_ONCE(io->task_registered_buffers);
2676 }
2677
2678 if (ubq->io_cmd_buf)
2679 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
2680
2681 kvfree(ubq);
2682 ub->queues[q_id] = NULL;
2683 }
2684
2685 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
2686 {
2687 unsigned int cpu;
2688
2689 /* Find first CPU mapped to this queue */
2690 for_each_possible_cpu(cpu) {
2691 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
2692 return cpu_to_node(cpu);
2693 }
2694
2695 return NUMA_NO_NODE;
2696 }
2697
2698 static int ublk_init_queue(struct ublk_device *ub, int q_id)
2699 {
2700 int depth = ub->dev_info.queue_depth;
2701 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
2702 struct ublk_queue *ubq;
2703 struct page *page;
2704 int numa_node;
2705 int size;
2706
2707 /* Determine NUMA node based on queue's CPU affinity */
2708 numa_node = ublk_get_queue_numa_node(ub, q_id);
2709
2710 /* Allocate queue structure on local NUMA node */
2711 ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
2712 numa_node);
2713 if (!ubq)
2714 return -ENOMEM;
2715
2716 spin_lock_init(&ubq->cancel_lock);
2717 ubq->flags = ub->dev_info.flags;
2718 ubq->q_id = q_id;
2719 ubq->q_depth = depth;
2720 size = ublk_queue_cmd_buf_size(ub);
2721
2722 /* Allocate I/O command buffer on local NUMA node */
2723 page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
2724 if (!page) {
2725 kvfree(ubq);
2726 return -ENOMEM;
2727 }
2728 ubq->io_cmd_buf = page_address(page);
2729
2730 ub->queues[q_id] = ubq;
2731 ubq->dev = ub;
2732 return 0;
2733 }
2734
2735 static void ublk_deinit_queues(struct ublk_device *ub)
2736 {
2737 int i;
2738
2739 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2740 ublk_deinit_queue(ub, i);
2741 }
2742
2743 static int ublk_init_queues(struct ublk_device *ub)
2744 {
2745 int i, ret;
2746
2747 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2748 ret = ublk_init_queue(ub, i);
2749 if (ret)
2750 goto fail;
2751 }
2752
2753 init_completion(&ub->completion);
2754 return 0;
2755
2756 fail:
2757 ublk_deinit_queues(ub);
2758 return ret;
2759 }
2760
2761 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
2762 {
2763 int i = idx;
2764 int err;
2765
2766 spin_lock(&ublk_idr_lock);
2767 /* allocate id; if @idx >= 0, we're requesting that specific id */
2768 if (i >= 0) {
2769 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
2770 if (err == -ENOSPC)
2771 err = -EEXIST;
2772 } else {
2773 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
2774 GFP_NOWAIT);
2775 }
2776 spin_unlock(&ublk_idr_lock);
2777
2778 if (err >= 0)
2779 ub->ub_number = err;
2780
2781 return err;
2782 }
2783
2784 static void ublk_free_dev_number(struct ublk_device *ub)
2785 {
2786 spin_lock(&ublk_idr_lock);
2787 idr_remove(&ublk_index_idr, ub->ub_number);
2788 wake_up_all(&ublk_idr_wq);
2789 spin_unlock(&ublk_idr_lock);
2790 }
2791
2792 static void ublk_cdev_rel(struct device *dev)
2793 {
2794 struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
2795
2796 blk_mq_free_tag_set(&ub->tag_set);
2797 ublk_deinit_queues(ub);
2798 ublk_free_dev_number(ub);
2799 mutex_destroy(&ub->mutex);
2800 mutex_destroy(&ub->cancel_mutex);
2801 kfree(ub);
2802 }
2803
2804 static int ublk_add_chdev(struct ublk_device *ub)
2805 {
2806 struct device *dev = &ub->cdev_dev;
2807 int minor = ub->ub_number;
2808 int ret;
2809
2810 dev->parent = ublk_misc.this_device;
2811 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
2812 dev->class = &ublk_chr_class;
2813 dev->release = ublk_cdev_rel;
2814 device_initialize(dev);
2815
2816 ret = dev_set_name(dev, "ublkc%d", minor);
2817 if (ret)
2818 goto fail;
2819
2820 cdev_init(&ub->cdev, &ublk_ch_fops);
2821 ret = cdev_device_add(&ub->cdev, dev);
2822 if (ret)
2823 goto fail;
2824
2825 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
2826 unprivileged_ublks_added++;
2827 return 0;
2828 fail:
2829 put_device(dev);
2830 return ret;
2831 }
2832
2833 /* align max io buffer size with PAGE_SIZE */
2834 static void ublk_align_max_io_size(struct ublk_device *ub)
2835 {
2836 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
2837
2838 ub->dev_info.max_io_buf_bytes =
2839 round_down(max_io_bytes, PAGE_SIZE);
2840 }
2841
2842 static int ublk_add_tag_set(struct ublk_device *ub)
2843 {
2844 ub->tag_set.ops = &ublk_mq_ops;
2845 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
2846 ub->tag_set.queue_depth = ub->dev_info.queue_depth;
2847 ub->tag_set.numa_node = NUMA_NO_NODE;
2848 ub->tag_set.driver_data = ub;
2849 return blk_mq_alloc_tag_set(&ub->tag_set);
2850 }
2851
2852 static void ublk_remove(struct ublk_device *ub)
2853 {
2854 bool unprivileged;
2855
2856 ublk_stop_dev(ub);
2857 cdev_device_del(&ub->cdev, &ub->cdev_dev);
2858 unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
2859 ublk_put_device(ub);
2860
2861 if (unprivileged)
2862 unprivileged_ublks_added--;
2863 }
2864
2865 static struct ublk_device *ublk_get_device_from_id(int idx)
2866 {
2867 struct ublk_device *ub = NULL;
2868
2869 if (idx < 0)
2870 return NULL;
2871
2872 spin_lock(&ublk_idr_lock);
2873 ub = idr_find(&ublk_index_idr, idx);
2874 if (ub)
2875 ub = ublk_get_device(ub);
2876 spin_unlock(&ublk_idr_lock);
2877
2878 return ub;
2879 }
2880
2881 static int ublk_ctrl_start_dev(struct ublk_device *ub,
2882 const struct ublksrv_ctrl_cmd *header)
2883 {
2884 const struct ublk_param_basic *p = &ub->params.basic;
2885 int ublksrv_pid = (int)header->data[0];
2886 struct queue_limits lim = {
2887 .logical_block_size = 1 << p->logical_bs_shift,
2888 .physical_block_size = 1 << p->physical_bs_shift,
2889 .io_min = 1 << p->io_min_shift,
2890 .io_opt = 1 << p->io_opt_shift,
2891 .max_hw_sectors = p->max_sectors,
2892 .chunk_sectors = p->chunk_sectors,
2893 .virt_boundary_mask = p->virt_boundary_mask,
2894 .max_segments = USHRT_MAX,
2895 .max_segment_size = UINT_MAX,
2896 .dma_alignment = 3,
2897 };
2898 struct gendisk *disk;
2899 int ret = -EINVAL;
2900
2901 if (ublksrv_pid <= 0)
2902 return -EINVAL;
2903 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
2904 return -EINVAL;
2905
2906 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
2907 const struct ublk_param_discard *pd = &ub->params.discard;
2908
2909 lim.discard_alignment = pd->discard_alignment;
2910 lim.discard_granularity = pd->discard_granularity;
2911 lim.max_hw_discard_sectors = pd->max_discard_sectors;
2912 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
2913 lim.max_discard_segments = pd->max_discard_segments;
2914 }
2915
2916 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
2917 const struct ublk_param_zoned *p = &ub->params.zoned;
2918
2919 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
2920 return -EOPNOTSUPP;
2921
2922 lim.features |= BLK_FEAT_ZONED;
2923 lim.max_active_zones = p->max_active_zones;
2924 lim.max_open_zones = p->max_open_zones;
2925 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
2926 }
2927
2928 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
2929 lim.features |= BLK_FEAT_WRITE_CACHE;
2930 if (ub->params.basic.attrs & UBLK_ATTR_FUA)
2931 lim.features |= BLK_FEAT_FUA;
2932 }
2933
2934 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
2935 lim.features |= BLK_FEAT_ROTATIONAL;
2936
2937 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
2938 lim.dma_alignment = ub->params.dma.alignment;
2939
2940 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
2941 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
2942 lim.max_segment_size = ub->params.seg.max_segment_size;
2943 lim.max_segments = ub->params.seg.max_segments;
2944 }
2945
2946 if (wait_for_completion_interruptible(&ub->completion) != 0)
2947 return -EINTR;
2948
2949 if (ub->ublksrv_tgid != ublksrv_pid)
2950 return -EINVAL;
2951
2952 mutex_lock(&ub->mutex);
2953 if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
2954 test_bit(UB_STATE_USED, &ub->state)) {
2955 ret = -EEXIST;
2956 goto out_unlock;
2957 }
2958
2959 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
2960 if (IS_ERR(disk)) {
2961 ret = PTR_ERR(disk);
2962 goto out_unlock;
2963 }
2964 sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
2965 disk->fops = &ub_fops;
2966 disk->private_data = ub;
2967
2968 ub->dev_info.ublksrv_pid = ublksrv_pid;
2969 ub->ub_disk = disk;
2970
2971 ublk_apply_params(ub);
2972
2973 /*
2974 * Suppress partition scan to avoid potential IO hang.
2975 *
2976 * If a ublk server error occurs during the partition scan, the IO may
2977 * wait while holding ub->mutex, which can deadlock with other
2978 * operations that need the mutex. Defer the partition scan to async
2979 * work.
2980 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
2981 * permanently.
2982 */
2983 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
2984
2985 ublk_get_device(ub);
2986 ub->dev_info.state = UBLK_S_DEV_LIVE;
2987
2988 if (ublk_dev_is_zoned(ub)) {
2989 ret = ublk_revalidate_disk_zones(ub);
2990 if (ret)
2991 goto out_put_cdev;
2992 }
2993
2994 ret = add_disk(disk);
2995 if (ret)
2996 goto out_put_cdev;
2997
2998 set_bit(UB_STATE_USED, &ub->state);
2999
3000 /* Schedule async partition scan for trusted daemons */
3001 if (!ub->unprivileged_daemons)
3002 schedule_work(&ub->partition_scan_work);
3003
3004 out_put_cdev:
3005 if (ret) {
3006 ublk_detach_disk(ub);
3007 ublk_put_device(ub);
3008 }
3009 if (ret)
3010 put_disk(disk);
3011 out_unlock:
3012 mutex_unlock(&ub->mutex);
3013 return ret;
3014 }
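
/*
 * Control-plane usage, as a hedged sketch: START_DEV is sent through
 * /dev/ublk-control as a uring_cmd once the daemon has fetched every tag,
 * with data[0] carrying the daemon pid that is checked against
 * ub->ublksrv_tgid above (sqe layout per io_uring SQE128 conventions):
 *
 *	struct ublksrv_ctrl_cmd c = {
 *		.dev_id   = dev_id,
 *		.queue_id = (__u16)-1,
 *		.data[0]  = getpid(),
 *	};
 *	sqe->cmd_op = UBLK_U_CMD_START_DEV;
 *	memcpy(sqe->cmd, &c, sizeof(c));
 */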
3015
3016 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
3017 const struct ublksrv_ctrl_cmd *header)
3018 {
3019 void __user *argp = (void __user *)(unsigned long)header->addr;
3020 cpumask_var_t cpumask;
3021 unsigned long queue;
3022 unsigned int retlen;
3023 unsigned int i;
3024 int ret;
3025
3026 if (header->len * BITS_PER_BYTE < nr_cpu_ids)
3027 return -EINVAL;
3028 if (header->len & (sizeof(unsigned long)-1))
3029 return -EINVAL;
3030 if (!header->addr)
3031 return -EINVAL;
3032
3033 queue = header->data[0];
3034 if (queue >= ub->dev_info.nr_hw_queues)
3035 return -EINVAL;
3036
3037 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
3038 return -ENOMEM;
3039
3040 for_each_possible_cpu(i) {
3041 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
3042 cpumask_set_cpu(i, cpumask);
3043 }
3044
3045 ret = -EFAULT;
3046 retlen = min_t(unsigned short, header->len, cpumask_size());
3047 if (copy_to_user(argp, cpumask, retlen))
3048 goto out_free_cpumask;
3049 if (retlen != header->len &&
3050 clear_user(argp + retlen, header->len - retlen))
3051 goto out_free_cpumask;
3052
3053 ret = 0;
3054 out_free_cpumask:
3055 free_cpumask_var(cpumask);
3056 return ret;
3057 }
3058
3059 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
3060 {
3061 pr_devel("%s: dev id %d flags %llx\n", __func__,
3062 info->dev_id, info->flags);
3063 pr_devel("\t nr_hw_queues %d queue_depth %d\n",
3064 info->nr_hw_queues, info->queue_depth);
3065 }
3066
3067 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
3068 {
3069 void __user *argp = (void __user *)(unsigned long)header->addr;
3070 struct ublksrv_ctrl_dev_info info;
3071 struct ublk_device *ub;
3072 int ret = -EINVAL;
3073
3074 if (header->len < sizeof(info) || !header->addr)
3075 return -EINVAL;
3076 if (header->queue_id != (u16)-1) {
3077 pr_warn("%s: queue_id is wrong %x\n",
3078 __func__, header->queue_id);
3079 return -EINVAL;
3080 }
3081
3082 if (copy_from_user(&info, argp, sizeof(info)))
3083 return -EFAULT;
3084
3085 if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
3086 info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
3087 return -EINVAL;
3088
3089 if (capable(CAP_SYS_ADMIN))
3090 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
3091 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
3092 return -EPERM;
3093
3094 /* forbid nonsense combinations of recovery flags */
3095 switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
3096 case 0:
3097 case UBLK_F_USER_RECOVERY:
3098 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
3099 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
3100 break;
3101 default:
3102 pr_warn("%s: invalid recovery flags %llx\n", __func__,
3103 info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
3104 return -EINVAL;
3105 }
3106
3107 if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
3108 pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
3109 return -EINVAL;
3110 }
3111
3112 /*
3113 * An unprivileged device can't be trusted, but RECOVERY and
3114 * RECOVERY_REISSUE may still hang error handling, so recovery
3115 * features can't be supported for unprivileged ublk for now
3116 *
3117 * TODO: provide forward progress for the RECOVERY handler, so that
3118 * unprivileged devices can benefit from it
3119 */
3120 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
3121 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
3122 UBLK_F_USER_RECOVERY);
3123
3124 /*
3125 * For USER_COPY, we depend on userspace to fill the request
3126 * buffer via pwrite() to the ublk char device, which can't be
3127 * used for an unprivileged device
3128 *
3129 * The same applies to zero copy and auto buffer registration.
3130 */
3131 if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3132 UBLK_F_AUTO_BUF_REG))
3133 return -EINVAL;
3134 }
3135
3136 /* the created device is always owned by current user */
3137 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
3138
3139 if (header->dev_id != info.dev_id) {
3140 pr_warn("%s: dev id not match %u %u\n",
3141 __func__, header->dev_id, info.dev_id);
3142 return -EINVAL;
3143 }
3144
3145 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
3146 pr_warn("%s: dev id is too large. Max supported is %d\n",
3147 __func__, UBLK_MAX_UBLKS - 1);
3148 return -EINVAL;
3149 }
3150
3151 ublk_dump_dev_info(&info);
3152
3153 ret = mutex_lock_killable(&ublk_ctl_mutex);
3154 if (ret)
3155 return ret;
3156
3157 ret = -EACCES;
3158 if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
3159 unprivileged_ublks_added >= unprivileged_ublks_max)
3160 goto out_unlock;
3161
3162 ret = -ENOMEM;
3163 ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL);
3164 if (!ub)
3165 goto out_unlock;
3166 mutex_init(&ub->mutex);
3167 spin_lock_init(&ub->lock);
3168 mutex_init(&ub->cancel_mutex);
3169 INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
3170
3171 ret = ublk_alloc_dev_number(ub, header->dev_id);
3172 if (ret < 0)
3173 goto out_free_ub;
3174
3175 memcpy(&ub->dev_info, &info, sizeof(info));
3176
3177 /* update device id */
3178 ub->dev_info.dev_id = ub->ub_number;
3179
3180 /*
3181 * The 64-bit flags will be copied back to userspace as the feature
3182 * negotiation result, so clear flags the driver doesn't support
3183 * yet; then userspace gets the correct flags (features) to
3184 * handle.
3185 */
3186 ub->dev_info.flags &= UBLK_F_ALL;
3187
3188 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
3189 UBLK_F_URING_CMD_COMP_IN_TASK |
3190 UBLK_F_PER_IO_DAEMON |
3191 UBLK_F_BUF_REG_OFF_DAEMON;
3192
3193 /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
3194 if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3195 UBLK_F_AUTO_BUF_REG))
3196 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
3197
3198 /*
3199 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
3200 * returning the zone append LBA, which is only allowed in case of
3201 * user copy or zero copy
3202 */
3203 if (ublk_dev_is_zoned(ub) &&
3204 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
3205 (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
3206 ret = -EINVAL;
3207 goto out_free_dev_number;
3208 }
3209
3210 ub->dev_info.nr_hw_queues = min_t(unsigned int,
3211 ub->dev_info.nr_hw_queues, nr_cpu_ids);
3212 ublk_align_max_io_size(ub);
3213
3214 ret = ublk_add_tag_set(ub);
3215 if (ret)
3216 goto out_free_dev_number;
3217
3218 ret = ublk_init_queues(ub);
3219 if (ret)
3220 goto out_free_tag_set;
3221
3222 ret = -EFAULT;
3223 if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
3224 goto out_deinit_queues;
3225
3226 /*
3227 * Add the char dev so that the ublksrv daemon can be set up.
3228 * ublk_add_chdev() will clean up everything if it fails.
3229 */
3230 ret = ublk_add_chdev(ub);
3231 goto out_unlock;
3232
3233 out_deinit_queues:
3234 ublk_deinit_queues(ub);
3235 out_free_tag_set:
3236 blk_mq_free_tag_set(&ub->tag_set);
3237 out_free_dev_number:
3238 ublk_free_dev_number(ub);
3239 out_free_ub:
3240 mutex_destroy(&ub->mutex);
3241 mutex_destroy(&ub->cancel_mutex);
3242 kfree(ub);
3243 out_unlock:
3244 mutex_unlock(&ublk_ctl_mutex);
3245 return ret;
3246 }
3247
3248 static inline bool ublk_idr_freed(int id)
3249 {
3250 void *ptr;
3251
3252 spin_lock(&ublk_idr_lock);
3253 ptr = idr_find(&ublk_index_idr, id);
3254 spin_unlock(&ublk_idr_lock);
3255
3256 return ptr == NULL;
3257 }
3258
3259 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
3260 {
3261 struct ublk_device *ub = *p_ub;
3262 int idx = ub->ub_number;
3263 int ret;
3264
3265 ret = mutex_lock_killable(&ublk_ctl_mutex);
3266 if (ret)
3267 return ret;
3268
3269 if (!test_bit(UB_STATE_DELETED, &ub->state)) {
3270 ublk_remove(ub);
3271 set_bit(UB_STATE_DELETED, &ub->state);
3272 }
3273
3274 /* Mark the reference as consumed */
3275 *p_ub = NULL;
3276 ublk_put_device(ub);
3277 mutex_unlock(&ublk_ctl_mutex);
3278
3279 /*
3280 * Wait until the idr entry is removed, so the index can be reused
3281 * after the DEL_DEV command returns.
3282 *
3283 * If we return because of a user interrupt, a future delete command
3284 * may come:
3285 *
3286 * - if the device number isn't freed, this device won't and needn't
3287 * be deleted again, since UB_STATE_DELETED is set and the device
3288 * will be released after the last reference is dropped
3289 *
3290 * - if the device number is freed already, we will not find this
3291 * device via ublk_get_device_from_id()
3292 */
3293 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
3294 return -EINTR;
3295 return 0;
3296 }
3297
3298 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
3299 {
3300 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3301
3302 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
3303 __func__, cmd->cmd_op, header->dev_id, header->queue_id,
3304 header->data[0], header->addr, header->len);
3305 }
3306
3307 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
3308 {
3309 ublk_stop_dev(ub);
3310 return 0;
3311 }
3312
3313 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
3314 const struct ublksrv_ctrl_cmd *header)
3315 {
3316 void __user *argp = (void __user *)(unsigned long)header->addr;
3317
3318 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
3319 return -EINVAL;
3320
3321 if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
3322 return -EFAULT;
3323
3324 return 0;
3325 }
3326
3327 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
3328 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
3329 {
3330 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
3331 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
3332
3333 if (ub->ub_disk) {
3334 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
3335 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
3336 } else {
3337 ub->params.devt.disk_major = 0;
3338 ub->params.devt.disk_minor = 0;
3339 }
3340 ub->params.types |= UBLK_PARAM_TYPE_DEVT;
3341 }
3342
3343 static int ublk_ctrl_get_params(struct ublk_device *ub,
3344 const struct ublksrv_ctrl_cmd *header)
3345 {
3346 void __user *argp = (void __user *)(unsigned long)header->addr;
3347 struct ublk_params_header ph;
3348 int ret;
3349
3350 if (header->len <= sizeof(ph) || !header->addr)
3351 return -EINVAL;
3352
3353 if (copy_from_user(&ph, argp, sizeof(ph)))
3354 return -EFAULT;
3355
3356 if (ph.len > header->len || !ph.len)
3357 return -EINVAL;
3358
3359 if (ph.len > sizeof(struct ublk_params))
3360 ph.len = sizeof(struct ublk_params);
3361
3362 mutex_lock(&ub->mutex);
3363 ublk_ctrl_fill_params_devt(ub);
3364 if (copy_to_user(argp, &ub->params, ph.len))
3365 ret = -EFAULT;
3366 else
3367 ret = 0;
3368 mutex_unlock(&ub->mutex);
3369
3370 return ret;
3371 }
3372
3373 static int ublk_ctrl_set_params(struct ublk_device *ub,
3374 const struct ublksrv_ctrl_cmd *header)
3375 {
3376 void __user *argp = (void __user *)(unsigned long)header->addr;
3377 struct ublk_params_header ph;
3378 int ret = -EFAULT;
3379
3380 if (header->len <= sizeof(ph) || !header->addr)
3381 return -EINVAL;
3382
3383 if (copy_from_user(&ph, argp, sizeof(ph)))
3384 return -EFAULT;
3385
3386 if (ph.len > header->len || !ph.len || !ph.types)
3387 return -EINVAL;
3388
3389 if (ph.len > sizeof(struct ublk_params))
3390 ph.len = sizeof(struct ublk_params);
3391
3392 mutex_lock(&ub->mutex);
3393 if (test_bit(UB_STATE_USED, &ub->state)) {
3394 /*
3395 * Parameters can only be changed while the device hasn't
3396 * been started yet
3397 */
3398 ret = -EACCES;
3399 } else if (copy_from_user(&ub->params, argp, ph.len)) {
3400 ret = -EFAULT;
3401 } else {
3402 /* clear all we don't support yet */
3403 ub->params.types &= UBLK_PARAM_TYPE_ALL;
3404 ret = ublk_validate_params(ub);
3405 if (ret)
3406 ub->params.types = 0;
3407 }
3408 mutex_unlock(&ub->mutex);
3409
3410 return ret;
3411 }
3412
3413 static int ublk_ctrl_start_recovery(struct ublk_device *ub,
3414 const struct ublksrv_ctrl_cmd *header)
3415 {
3416 int ret = -EINVAL;
3417
3418 mutex_lock(&ub->mutex);
3419 if (ublk_nosrv_should_stop_dev(ub))
3420 goto out_unlock;
3421 /*
3422 * START_RECOVERY is only allowed after:
3423 *
3424 * (1) UB_STATE_OPEN is not set, which means the dying process has exited
3425 * and the related io_uring ctx is freed, so the file struct of
3426 * /dev/ublkcX is released.
3427 *
3428 * and one of the following holds:
3429 *
3430 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce work:
3431 * (a) has quiesced the request queue
3432 * (b) has requeued every inflight rq whose io_flags is ACTIVE
3433 * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
3434 * (d) has completed/canceled all ioucmds owned by the dying process
3435 *
3436 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
3437 * quiesced, but all I/O is being immediately errored
3438 */
3439 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
3440 ret = -EBUSY;
3441 goto out_unlock;
3442 }
3443 pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
3444 init_completion(&ub->completion);
3445 ret = 0;
3446 out_unlock:
3447 mutex_unlock(&ub->mutex);
3448 return ret;
3449 }
3450
3451 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
3452 const struct ublksrv_ctrl_cmd *header)
3453 {
3454 int ublksrv_pid = (int)header->data[0];
3455 int ret = -EINVAL;
3456
3457 pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
3458 header->dev_id);
3459
3460 if (wait_for_completion_interruptible(&ub->completion))
3461 return -EINTR;
3462
3463 pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
3464 header->dev_id);
3465
3466 if (ub->ublksrv_tgid != ublksrv_pid)
3467 return -EINVAL;
3468
3469 mutex_lock(&ub->mutex);
3470 if (ublk_nosrv_should_stop_dev(ub))
3471 goto out_unlock;
3472
3473 if (!ublk_dev_in_recoverable_state(ub)) {
3474 ret = -EBUSY;
3475 goto out_unlock;
3476 }
3477 ub->dev_info.ublksrv_pid = ublksrv_pid;
3478 ub->dev_info.state = UBLK_S_DEV_LIVE;
3479 pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
3480 __func__, ublksrv_pid, header->dev_id);
3481 blk_mq_kick_requeue_list(ub->ub_disk->queue);
3482 ret = 0;
3483 out_unlock:
3484 mutex_unlock(&ub->mutex);
3485 return ret;
3486 }
3487
3488 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
3489 {
3490 void __user *argp = (void __user *)(unsigned long)header->addr;
3491 u64 features = UBLK_F_ALL;
3492
3493 if (header->len != UBLK_FEATURES_LEN || !header->addr)
3494 return -EINVAL;
3495
3496 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
3497 return -EFAULT;
3498
3499 return 0;
3500 }
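
/*
 * A hedged userspace sketch of feature probing: GET_FEATURES fills an
 * 8-byte (UBLK_FEATURES_LEN) mask at header->addr, which the server tests
 * before deciding which flags to request in ADD_DEV:
 *
 *	__u64 features = 0;
 *	struct ublksrv_ctrl_cmd c = {
 *		.addr = (__u64)(uintptr_t)&features,
 *		.len  = UBLK_FEATURES_LEN,
 *	};
 *	// submit UBLK_U_CMD_GET_FEATURES via /dev/ublk-control, then:
 *	if (features & UBLK_F_USER_COPY)
 *		... request UBLK_F_USER_COPY in dev_info.flags for ADD_DEV ...
 */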
3501
3502 static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
3503 {
3504 struct ublk_param_basic *p = &ub->params.basic;
3505 u64 new_size = header->data[0];
3506
3507 mutex_lock(&ub->mutex);
3508 p->dev_sectors = new_size;
3509 set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
3510 mutex_unlock(&ub->mutex);
3511 }
3512
3513 struct count_busy {
3514 const struct ublk_queue *ubq;
3515 unsigned int nr_busy;
3516 };
3517
3518 static bool ublk_count_busy_req(struct request *rq, void *data)
3519 {
3520 struct count_busy *idle = data;
3521
3522 if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
3523 idle->nr_busy += 1;
3524 return true;
3525 }
3526
3527 /* uring_cmd is guaranteed to be active if the associated request is idle */
3528 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
3529 {
3530 struct count_busy data = {
3531 .ubq = ubq,
3532 };
3533
3534 blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
3535 return data.nr_busy < ubq->q_depth;
3536 }
3537
3538 /* Wait until each hw queue has at least one idle IO */
3539 static int ublk_wait_for_idle_io(struct ublk_device *ub,
3540 unsigned int timeout_ms)
3541 {
3542 unsigned int elapsed = 0;
3543 int ret;
3544
3545 while (elapsed < timeout_ms && !signal_pending(current)) {
3546 unsigned int queues_cancelable = 0;
3547 int i;
3548
3549 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
3550 struct ublk_queue *ubq = ublk_get_queue(ub, i);
3551
3552 queues_cancelable += !!ubq_has_idle_io(ubq);
3553 }
3554
3555 /*
3556 * Each queue needs at least one active command for
3557 * notifying the ublk server
3558 */
3559 if (queues_cancelable == ub->dev_info.nr_hw_queues)
3560 break;
3561
3562 msleep(UBLK_REQUEUE_DELAY_MS);
3563 elapsed += UBLK_REQUEUE_DELAY_MS;
3564 }
3565
3566 if (signal_pending(current))
3567 ret = -EINTR;
3568 else if (elapsed >= timeout_ms)
3569 ret = -EBUSY;
3570 else
3571 ret = 0;
3572
3573 return ret;
3574 }
3575
3576 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
3577 const struct ublksrv_ctrl_cmd *header)
3578 {
3579 /* zero means wait forever */
3580 u64 timeout_ms = header->data[0];
3581 struct gendisk *disk;
3582 int ret = -ENODEV;
3583
3584 if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
3585 return -EOPNOTSUPP;
3586
3587 mutex_lock(&ub->mutex);
3588 disk = ublk_get_disk(ub);
3589 if (!disk)
3590 goto unlock;
3591 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
3592 goto put_disk;
3593
3594 ret = 0;
3595 /* already in expected state */
3596 if (ub->dev_info.state != UBLK_S_DEV_LIVE)
3597 goto put_disk;
3598
3599 /* Mark the device as canceling */
3600 mutex_lock(&ub->cancel_mutex);
3601 blk_mq_quiesce_queue(disk->queue);
3602 ublk_set_canceling(ub, true);
3603 blk_mq_unquiesce_queue(disk->queue);
3604 mutex_unlock(&ub->cancel_mutex);
3605
3606 if (!timeout_ms)
3607 timeout_ms = UINT_MAX;
3608 ret = ublk_wait_for_idle_io(ub, timeout_ms);
3609
3610 put_disk:
3611 ublk_put_disk(disk);
3612 unlock:
3613 mutex_unlock(&ub->mutex);
3614
3615 /* Cancel pending uring_cmd */
3616 if (!ret)
3617 ublk_cancel_dev(ub);
3618 return ret;
3619 }
3620
3621 /*
3622 * All control commands are sent via /dev/ublk-control, so we have to check
3623 * the destination device's permission
3624 */
3625 static int ublk_char_dev_permission(struct ublk_device *ub,
3626 const char *dev_path, int mask)
3627 {
3628 int err;
3629 struct path path;
3630 struct kstat stat;
3631
3632 err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
3633 if (err)
3634 return err;
3635
3636 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
3637 if (err)
3638 goto exit;
3639
3640 err = -EPERM;
3641 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
3642 goto exit;
3643
3644 err = inode_permission(&nop_mnt_idmap,
3645 d_backing_inode(path.dentry), mask);
3646 exit:
3647 path_put(&path);
3648 return err;
3649 }
3650
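/*
 * Permission check for a control command: privileged devices just require
 * CAP_SYS_ADMIN (GET_DEV_INFO2 additionally carries the char dev path),
 * while unprivileged devices are validated by resolving the char device
 * path at the front of the payload and calling inode_permission() with a
 * read or read/write mask derived from the command. On success the path is
 * stripped from header->addr/header->len before the handler runs.
 */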
3651 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
3652 struct io_uring_cmd *cmd)
3653 {
3654 struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
3655 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
3656 void __user *argp = (void __user *)(unsigned long)header->addr;
3657 char *dev_path = NULL;
3658 int ret = 0;
3659 int mask;
3660
3661 if (!unprivileged) {
3662 if (!capable(CAP_SYS_ADMIN))
3663 return -EPERM;
3664 /*
3665 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
3666 * char_dev_path in its payload too, since userspace may not
3667 * know whether the specified device was created in
3668 * unprivileged mode.
3669 */
3670 if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
3671 return 0;
3672 }
3673
3674 /*
3675 * The user has to provide the char device path for an unprivileged ublk.
3676 *
3677 * header->addr always points to the dev path buffer, and
3678 * header->dev_path_len records the length of the dev path buffer.
3679 */
3680 if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
3681 return -EINVAL;
3682
3683 if (header->len < header->dev_path_len)
3684 return -EINVAL;
3685
3686 dev_path = memdup_user_nul(argp, header->dev_path_len);
3687 if (IS_ERR(dev_path))
3688 return PTR_ERR(dev_path);
3689
3690 ret = -EINVAL;
3691 switch (_IOC_NR(cmd->cmd_op)) {
3692 case UBLK_CMD_GET_DEV_INFO:
3693 case UBLK_CMD_GET_DEV_INFO2:
3694 case UBLK_CMD_GET_QUEUE_AFFINITY:
3695 case UBLK_CMD_GET_PARAMS:
3696 case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
3697 mask = MAY_READ;
3698 break;
3699 case UBLK_CMD_START_DEV:
3700 case UBLK_CMD_STOP_DEV:
3701 case UBLK_CMD_ADD_DEV:
3702 case UBLK_CMD_DEL_DEV:
3703 case UBLK_CMD_SET_PARAMS:
3704 case UBLK_CMD_START_USER_RECOVERY:
3705 case UBLK_CMD_END_USER_RECOVERY:
3706 case UBLK_CMD_UPDATE_SIZE:
3707 case UBLK_CMD_QUIESCE_DEV:
3708 mask = MAY_READ | MAY_WRITE;
3709 break;
3710 default:
3711 goto exit;
3712 }
3713
3714 ret = ublk_char_dev_permission(ub, dev_path, mask);
3715 if (!ret) {
3716 header->len -= header->dev_path_len;
3717 header->addr += header->dev_path_len;
3718 }
3719 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
3720 __func__, ub->ub_number, cmd->cmd_op,
3721 ub->dev_info.owner_uid, ub->dev_info.owner_gid,
3722 dev_path, ret);
3723 exit:
3724 kfree(dev_path);
3725 return ret;
3726 }
3727
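/*
 * Read-only info queries can be completed without sleeping; every other
 * control command may block and is bounced out of the IO_URING_F_NONBLOCK
 * issue path with -EAGAIN.
 */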
3728 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
3729 {
3730 switch (_IOC_NR(cmd_op)) {
3731 case UBLK_CMD_GET_QUEUE_AFFINITY:
3732 case UBLK_CMD_GET_DEV_INFO:
3733 case UBLK_CMD_GET_DEV_INFO2:
3734 case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
3735 return false;
3736 default:
3737 return true;
3738 }
3739 }
3740
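/*
 * Entry point for all uring_cmds issued on /dev/ublk-control: require an
 * SQE128 ring, validate the command opcode, look up the target device
 * (except for ADD_DEV), run the permission check and dispatch to the
 * per-command handler.
 */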
3741 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
3742 unsigned int issue_flags)
3743 {
3744 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3745 struct ublk_device *ub = NULL;
3746 u32 cmd_op = cmd->cmd_op;
3747 int ret = -EINVAL;
3748
3749 if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
3750 issue_flags & IO_URING_F_NONBLOCK)
3751 return -EAGAIN;
3752
3753 ublk_ctrl_cmd_dump(cmd);
3754
3755 if (!(issue_flags & IO_URING_F_SQE128))
3756 goto out;
3757
3758 ret = ublk_check_cmd_op(cmd_op);
3759 if (ret)
3760 goto out;
3761
3762 if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
3763 ret = ublk_ctrl_get_features(header);
3764 goto out;
3765 }
3766
3767 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
3768 ret = -ENODEV;
3769 ub = ublk_get_device_from_id(header->dev_id);
3770 if (!ub)
3771 goto out;
3772
3773 ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
3774 if (ret)
3775 goto put_dev;
3776 }
3777
3778 switch (_IOC_NR(cmd_op)) {
3779 case UBLK_CMD_START_DEV:
3780 ret = ublk_ctrl_start_dev(ub, header);
3781 break;
3782 case UBLK_CMD_STOP_DEV:
3783 ret = ublk_ctrl_stop_dev(ub);
3784 break;
3785 case UBLK_CMD_GET_DEV_INFO:
3786 case UBLK_CMD_GET_DEV_INFO2:
3787 ret = ublk_ctrl_get_dev_info(ub, header);
3788 break;
3789 case UBLK_CMD_ADD_DEV:
3790 ret = ublk_ctrl_add_dev(header);
3791 break;
3792 case UBLK_CMD_DEL_DEV:
3793 ret = ublk_ctrl_del_dev(&ub, true);
3794 break;
3795 case UBLK_CMD_DEL_DEV_ASYNC:
3796 ret = ublk_ctrl_del_dev(&ub, false);
3797 break;
3798 case UBLK_CMD_GET_QUEUE_AFFINITY:
3799 ret = ublk_ctrl_get_queue_affinity(ub, header);
3800 break;
3801 case UBLK_CMD_GET_PARAMS:
3802 ret = ublk_ctrl_get_params(ub, header);
3803 break;
3804 case UBLK_CMD_SET_PARAMS:
3805 ret = ublk_ctrl_set_params(ub, header);
3806 break;
3807 case UBLK_CMD_START_USER_RECOVERY:
3808 ret = ublk_ctrl_start_recovery(ub, header);
3809 break;
3810 case UBLK_CMD_END_USER_RECOVERY:
3811 ret = ublk_ctrl_end_recovery(ub, header);
3812 break;
3813 case UBLK_CMD_UPDATE_SIZE:
3814 ublk_ctrl_set_size(ub, header);
3815 ret = 0;
3816 break;
3817 case UBLK_CMD_QUIESCE_DEV:
3818 ret = ublk_ctrl_quiesce_dev(ub, header);
3819 break;
3820 default:
3821 ret = -EOPNOTSUPP;
3822 break;
3823 }
3824
3825 put_dev:
3826 if (ub)
3827 ublk_put_device(ub);
3828 out:
3829 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
3830 __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
3831 return ret;
3832 }
3833
3834 static const struct file_operations ublk_ctl_fops = {
3835 .open = nonseekable_open,
3836 .uring_cmd = ublk_ctrl_uring_cmd,
3837 .owner = THIS_MODULE,
3838 .llseek = noop_llseek,
3839 };
3840
3841 static struct miscdevice ublk_misc = {
3842 .minor = MISC_DYNAMIC_MINOR,
3843 .name = "ublk-control",
3844 .fops = &ublk_ctl_fops,
3845 };
3846
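/*
 * Module init: a couple of compile-time sanity checks, then register the
 * "ublk-control" misc device, reserve the "ublk-char" char dev region and
 * register ublk_chr_class, unwinding in reverse order on failure.
 */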
3847 static int __init ublk_init(void)
3848 {
3849 int ret;
3850
3851 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
3852 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
3853 BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
3854
3855 init_waitqueue_head(&ublk_idr_wq);
3856
3857 ret = misc_register(&ublk_misc);
3858 if (ret)
3859 return ret;
3860
3861 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
3862 if (ret)
3863 goto unregister_mis;
3864
3865 ret = class_register(&ublk_chr_class);
3866 if (ret)
3867 goto free_chrdev_region;
3868
3869 return 0;
3870
3871 free_chrdev_region:
3872 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3873 unregister_mis:
3874 misc_deregister(&ublk_misc);
3875 return ret;
3876 }
3877
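/*
 * Module exit: remove every device still present in ublk_index_idr, then
 * tear down the char class, the control misc device, the idr and the
 * reserved char dev region.
 */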
3878 static void __exit ublk_exit(void)
3879 {
3880 struct ublk_device *ub;
3881 int id;
3882
3883 idr_for_each_entry(&ublk_index_idr, ub, id)
3884 ublk_remove(ub);
3885
3886 class_unregister(&ublk_chr_class);
3887 misc_deregister(&ublk_misc);
3888
3889 idr_destroy(&ublk_index_idr);
3890 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3891 }
3892
3893 module_init(ublk_init);
3894 module_exit(ublk_exit);
3895
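/*
 * set/get handlers for the "ublks_max" module parameter, which limits how
 * many unprivileged ublk devices may be added; values outside the
 * [0, UBLK_MAX_UBLKS] range are rejected.
 */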
3896 static int ublk_set_max_unprivileged_ublks(const char *buf,
3897 const struct kernel_param *kp)
3898 {
3899 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
3900 }
3901
3902 static int ublk_get_max_unprivileged_ublks(char *buf,
3903 const struct kernel_param *kp)
3904 {
3905 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
3906 }
3907
3908 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
3909 .set = ublk_set_max_unprivileged_ublks,
3910 .get = ublk_get_max_unprivileged_ublks,
3911 };
3912
3913 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
3914 &unprivileged_ublks_max, 0644);
3915 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add (default: 64)");
3916
3917 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
3918 MODULE_DESCRIPTION("Userspace block device");
3919 MODULE_LICENSE("GPL");
3920