1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Userspace block device - block device which IO is handled from userspace
4 *
5 * Take full use of io_uring passthrough command for communicating with
6 * ublk userspace daemon(ublksrvd) for handling basic IO request.
7 *
8 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9 *
10 * (part of code stolen from loop.c)
11 */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <uapi/linux/ublk_cmd.h>
48
49 #define UBLK_MINORS (1U << MINORBITS)
50
51 #define UBLK_INVALID_BUF_IDX ((u16)-1)
52
53 /* private ioctl command mirror */
54 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
55 #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
56 #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
57
58 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
59 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
60
61 /* All UBLK_F_* have to be included into UBLK_F_ALL */
62 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
63 | UBLK_F_URING_CMD_COMP_IN_TASK \
64 | UBLK_F_NEED_GET_DATA \
65 | UBLK_F_USER_RECOVERY \
66 | UBLK_F_USER_RECOVERY_REISSUE \
67 | UBLK_F_UNPRIVILEGED_DEV \
68 | UBLK_F_CMD_IOCTL_ENCODE \
69 | UBLK_F_USER_COPY \
70 | UBLK_F_ZONED \
71 | UBLK_F_USER_RECOVERY_FAIL_IO \
72 | UBLK_F_UPDATE_SIZE \
73 | UBLK_F_AUTO_BUF_REG \
74 | UBLK_F_QUIESCE \
75 | UBLK_F_PER_IO_DAEMON \
76 | UBLK_F_BUF_REG_OFF_DAEMON)
77
78 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
79 | UBLK_F_USER_RECOVERY_REISSUE \
80 | UBLK_F_USER_RECOVERY_FAIL_IO)
81
82 /* All UBLK_PARAM_TYPE_* should be included here */
83 #define UBLK_PARAM_TYPE_ALL \
84 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
85 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
86 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
87
88 struct ublk_uring_cmd_pdu {
89 /*
90 * Store requests in same batch temporarily for queuing them to
91 * daemon context.
92 *
93 * It should have been stored to request payload, but we do want
94 * to avoid extra pre-allocation, and uring_cmd payload is always
95 * free for us
96 */
97 union {
98 struct request *req;
99 struct request *req_list;
100 };
101
102 /*
103 * The following two are valid in this cmd whole lifetime, and
104 * setup in ublk uring_cmd handler
105 */
106 struct ublk_queue *ubq;
107
108 u16 tag;
109 };
110
111 /*
112 * io command is active: sqe cmd is received, and its cqe isn't done
113 *
114 * If the flag is set, the io command is owned by ublk driver, and waited
115 * for incoming blk-mq request from the ublk block device.
116 *
117 * If the flag is cleared, the io command will be completed, and owned by
118 * ublk server.
119 */
120 #define UBLK_IO_FLAG_ACTIVE 0x01
121
122 /*
123 * IO command is completed via cqe, and it is being handled by ublksrv, and
124 * not committed yet
125 *
126 * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
127 * cross verification
128 */
129 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
130
131 /*
132 * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
133 * get data buffer address from ublksrv.
134 *
135 * Then, bio data could be copied into this data buffer for a WRITE request
136 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
137 */
138 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
139
140 /*
141 * request buffer is registered automatically, so we have to unregister it
142 * before completing this request.
143 *
144 * io_uring will unregister buffer automatically for us during exiting.
145 */
146 #define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
147
148 /* atomic RW with ubq->cancel_lock */
149 #define UBLK_IO_FLAG_CANCELED 0x80000000
150
151 /*
152 * Initialize refcount to a large number to include any registered buffers.
153 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
154 * any buffers registered on the io daemon task.
155 */
156 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
157
158 struct ublk_io {
159 /* userspace buffer address from io cmd */
160 union {
161 __u64 addr;
162 struct ublk_auto_buf_reg buf;
163 };
164 unsigned int flags;
165 int res;
166
167 union {
168 /* valid if UBLK_IO_FLAG_ACTIVE is set */
169 struct io_uring_cmd *cmd;
170 /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
171 struct request *req;
172 };
173
174 struct task_struct *task;
175
176 /*
177 * The number of uses of this I/O by the ublk server
178 * if user copy or zero copy are enabled:
179 * - UBLK_REFCOUNT_INIT from dispatch to the server
180 * until UBLK_IO_COMMIT_AND_FETCH_REQ
181 * - 1 for each inflight ublk_ch_{read,write}_iter() call
182 * - 1 for each io_uring registered buffer not registered on task
183 * The I/O can only be completed once all references are dropped.
184 * User copy and buffer registration operations are only permitted
185 * if the reference count is nonzero.
186 */
187 refcount_t ref;
188 /* Count of buffers registered on task and not yet unregistered */
189 unsigned task_registered_buffers;
190
191 void *buf_ctx_handle;
192 } ____cacheline_aligned_in_smp;
193
194 struct ublk_queue {
195 int q_id;
196 int q_depth;
197
198 unsigned long flags;
199 struct ublksrv_io_desc *io_cmd_buf;
200
201 bool force_abort;
202 bool canceling;
203 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
204 unsigned short nr_io_ready; /* how many ios setup */
205 spinlock_t cancel_lock;
206 struct ublk_device *dev;
207 struct ublk_io ios[];
208 };
209
210 struct ublk_device {
211 struct gendisk *ub_disk;
212
213 char *__queues;
214
215 unsigned int queue_size;
216 struct ublksrv_ctrl_dev_info dev_info;
217
218 struct blk_mq_tag_set tag_set;
219
220 struct cdev cdev;
221 struct device cdev_dev;
222
223 #define UB_STATE_OPEN 0
224 #define UB_STATE_USED 1
225 #define UB_STATE_DELETED 2
226 unsigned long state;
227 int ub_number;
228
229 struct mutex mutex;
230
231 spinlock_t lock;
232 struct mm_struct *mm;
233
234 struct ublk_params params;
235
236 struct completion completion;
237 unsigned int nr_queues_ready;
238 bool unprivileged_daemons;
239 struct mutex cancel_mutex;
240 bool canceling;
241 pid_t ublksrv_tgid;
242 struct delayed_work exit_work;
243 };
244
245 /* header of ublk_params */
246 struct ublk_params_header {
247 __u32 len;
248 __u32 types;
249 };
250
251 static void ublk_io_release(void *priv);
252 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
253 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
254 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
255 const struct ublk_queue *ubq, struct ublk_io *io,
256 size_t offset);
257 static inline unsigned int ublk_req_build_flags(struct request *req);
258
259 static inline struct ublksrv_io_desc *
ublk_get_iod(const struct ublk_queue * ubq,unsigned tag)260 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
261 {
262 return &ubq->io_cmd_buf[tag];
263 }
264
ublk_dev_is_zoned(const struct ublk_device * ub)265 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
266 {
267 return ub->dev_info.flags & UBLK_F_ZONED;
268 }
269
ublk_queue_is_zoned(struct ublk_queue * ubq)270 static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
271 {
272 return ubq->flags & UBLK_F_ZONED;
273 }
274
275 #ifdef CONFIG_BLK_DEV_ZONED
276
277 struct ublk_zoned_report_desc {
278 __u64 sector;
279 __u32 operation;
280 __u32 nr_zones;
281 };
282
283 static DEFINE_XARRAY(ublk_zoned_report_descs);
284
ublk_zoned_insert_report_desc(const struct request * req,struct ublk_zoned_report_desc * desc)285 static int ublk_zoned_insert_report_desc(const struct request *req,
286 struct ublk_zoned_report_desc *desc)
287 {
288 return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
289 desc, GFP_KERNEL);
290 }
291
ublk_zoned_erase_report_desc(const struct request * req)292 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
293 const struct request *req)
294 {
295 return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
296 }
297
ublk_zoned_get_report_desc(const struct request * req)298 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
299 const struct request *req)
300 {
301 return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
302 }
303
ublk_get_nr_zones(const struct ublk_device * ub)304 static int ublk_get_nr_zones(const struct ublk_device *ub)
305 {
306 const struct ublk_param_basic *p = &ub->params.basic;
307
308 /* Zone size is a power of 2 */
309 return p->dev_sectors >> ilog2(p->chunk_sectors);
310 }
311
ublk_revalidate_disk_zones(struct ublk_device * ub)312 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
313 {
314 return blk_revalidate_disk_zones(ub->ub_disk);
315 }
316
ublk_dev_param_zoned_validate(const struct ublk_device * ub)317 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
318 {
319 const struct ublk_param_zoned *p = &ub->params.zoned;
320 int nr_zones;
321
322 if (!ublk_dev_is_zoned(ub))
323 return -EINVAL;
324
325 if (!p->max_zone_append_sectors)
326 return -EINVAL;
327
328 nr_zones = ublk_get_nr_zones(ub);
329
330 if (p->max_active_zones > nr_zones)
331 return -EINVAL;
332
333 if (p->max_open_zones > nr_zones)
334 return -EINVAL;
335
336 return 0;
337 }
338
ublk_dev_param_zoned_apply(struct ublk_device * ub)339 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
340 {
341 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
342 }
343
344 /* Based on virtblk_alloc_report_buffer */
ublk_alloc_report_buffer(struct ublk_device * ublk,unsigned int nr_zones,size_t * buflen)345 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
346 unsigned int nr_zones, size_t *buflen)
347 {
348 struct request_queue *q = ublk->ub_disk->queue;
349 size_t bufsize;
350 void *buf;
351
352 nr_zones = min_t(unsigned int, nr_zones,
353 ublk->ub_disk->nr_zones);
354
355 bufsize = nr_zones * sizeof(struct blk_zone);
356 bufsize =
357 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
358
359 while (bufsize >= sizeof(struct blk_zone)) {
360 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
361 if (buf) {
362 *buflen = bufsize;
363 return buf;
364 }
365 bufsize >>= 1;
366 }
367
368 *buflen = 0;
369 return NULL;
370 }
371
ublk_report_zones(struct gendisk * disk,sector_t sector,unsigned int nr_zones,report_zones_cb cb,void * data)372 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
373 unsigned int nr_zones, report_zones_cb cb, void *data)
374 {
375 struct ublk_device *ub = disk->private_data;
376 unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
377 unsigned int first_zone = sector >> ilog2(zone_size_sectors);
378 unsigned int done_zones = 0;
379 unsigned int max_zones_per_request;
380 int ret;
381 struct blk_zone *buffer;
382 size_t buffer_length;
383
384 nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
385 nr_zones);
386
387 buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
388 if (!buffer)
389 return -ENOMEM;
390
391 max_zones_per_request = buffer_length / sizeof(struct blk_zone);
392
393 while (done_zones < nr_zones) {
394 unsigned int remaining_zones = nr_zones - done_zones;
395 unsigned int zones_in_request =
396 min_t(unsigned int, remaining_zones, max_zones_per_request);
397 struct request *req;
398 struct ublk_zoned_report_desc desc;
399 blk_status_t status;
400
401 memset(buffer, 0, buffer_length);
402
403 req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
404 if (IS_ERR(req)) {
405 ret = PTR_ERR(req);
406 goto out;
407 }
408
409 desc.operation = UBLK_IO_OP_REPORT_ZONES;
410 desc.sector = sector;
411 desc.nr_zones = zones_in_request;
412 ret = ublk_zoned_insert_report_desc(req, &desc);
413 if (ret)
414 goto free_req;
415
416 ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
417 if (ret)
418 goto erase_desc;
419
420 status = blk_execute_rq(req, 0);
421 ret = blk_status_to_errno(status);
422 erase_desc:
423 ublk_zoned_erase_report_desc(req);
424 free_req:
425 blk_mq_free_request(req);
426 if (ret)
427 goto out;
428
429 for (unsigned int i = 0; i < zones_in_request; i++) {
430 struct blk_zone *zone = buffer + i;
431
432 /* A zero length zone means no more zones in this response */
433 if (!zone->len)
434 break;
435
436 ret = cb(zone, i, data);
437 if (ret)
438 goto out;
439
440 done_zones++;
441 sector += zone_size_sectors;
442
443 }
444 }
445
446 ret = done_zones;
447
448 out:
449 kvfree(buffer);
450 return ret;
451 }
452
ublk_setup_iod_zoned(struct ublk_queue * ubq,struct request * req)453 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
454 struct request *req)
455 {
456 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
457 struct ublk_io *io = &ubq->ios[req->tag];
458 struct ublk_zoned_report_desc *desc;
459 u32 ublk_op;
460
461 switch (req_op(req)) {
462 case REQ_OP_ZONE_OPEN:
463 ublk_op = UBLK_IO_OP_ZONE_OPEN;
464 break;
465 case REQ_OP_ZONE_CLOSE:
466 ublk_op = UBLK_IO_OP_ZONE_CLOSE;
467 break;
468 case REQ_OP_ZONE_FINISH:
469 ublk_op = UBLK_IO_OP_ZONE_FINISH;
470 break;
471 case REQ_OP_ZONE_RESET:
472 ublk_op = UBLK_IO_OP_ZONE_RESET;
473 break;
474 case REQ_OP_ZONE_APPEND:
475 ublk_op = UBLK_IO_OP_ZONE_APPEND;
476 break;
477 case REQ_OP_ZONE_RESET_ALL:
478 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
479 break;
480 case REQ_OP_DRV_IN:
481 desc = ublk_zoned_get_report_desc(req);
482 if (!desc)
483 return BLK_STS_IOERR;
484 ublk_op = desc->operation;
485 switch (ublk_op) {
486 case UBLK_IO_OP_REPORT_ZONES:
487 iod->op_flags = ublk_op | ublk_req_build_flags(req);
488 iod->nr_zones = desc->nr_zones;
489 iod->start_sector = desc->sector;
490 return BLK_STS_OK;
491 default:
492 return BLK_STS_IOERR;
493 }
494 case REQ_OP_DRV_OUT:
495 /* We do not support drv_out */
496 return BLK_STS_NOTSUPP;
497 default:
498 return BLK_STS_IOERR;
499 }
500
501 iod->op_flags = ublk_op | ublk_req_build_flags(req);
502 iod->nr_sectors = blk_rq_sectors(req);
503 iod->start_sector = blk_rq_pos(req);
504 iod->addr = io->addr;
505
506 return BLK_STS_OK;
507 }
508
509 #else
510
511 #define ublk_report_zones (NULL)
512
ublk_dev_param_zoned_validate(const struct ublk_device * ub)513 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
514 {
515 return -EOPNOTSUPP;
516 }
517
ublk_dev_param_zoned_apply(struct ublk_device * ub)518 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
519 {
520 }
521
ublk_revalidate_disk_zones(struct ublk_device * ub)522 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
523 {
524 return 0;
525 }
526
ublk_setup_iod_zoned(struct ublk_queue * ubq,struct request * req)527 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
528 struct request *req)
529 {
530 return BLK_STS_NOTSUPP;
531 }
532
533 #endif
534
535 static inline void __ublk_complete_rq(struct request *req);
536
537 static dev_t ublk_chr_devt;
538 static const struct class ublk_chr_class = {
539 .name = "ublk-char",
540 };
541
542 static DEFINE_IDR(ublk_index_idr);
543 static DEFINE_SPINLOCK(ublk_idr_lock);
544 static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
545
546 static DEFINE_MUTEX(ublk_ctl_mutex);
547
548
549 #define UBLK_MAX_UBLKS UBLK_MINORS
550
551 /*
552 * Max unprivileged ublk devices allowed to add
553 *
554 * It can be extended to one per-user limit in future or even controlled
555 * by cgroup.
556 */
557 static unsigned int unprivileged_ublks_max = 64;
558 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
559
560 static struct miscdevice ublk_misc;
561
ublk_pos_to_hwq(loff_t pos)562 static inline unsigned ublk_pos_to_hwq(loff_t pos)
563 {
564 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
565 UBLK_QID_BITS_MASK;
566 }
567
ublk_pos_to_buf_off(loff_t pos)568 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
569 {
570 return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
571 }
572
ublk_pos_to_tag(loff_t pos)573 static inline unsigned ublk_pos_to_tag(loff_t pos)
574 {
575 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
576 UBLK_TAG_BITS_MASK;
577 }
578
ublk_dev_param_basic_apply(struct ublk_device * ub)579 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
580 {
581 const struct ublk_param_basic *p = &ub->params.basic;
582
583 if (p->attrs & UBLK_ATTR_READ_ONLY)
584 set_disk_ro(ub->ub_disk, true);
585
586 set_capacity(ub->ub_disk, p->dev_sectors);
587 }
588
ublk_validate_params(const struct ublk_device * ub)589 static int ublk_validate_params(const struct ublk_device *ub)
590 {
591 /* basic param is the only one which must be set */
592 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
593 const struct ublk_param_basic *p = &ub->params.basic;
594
595 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
596 return -EINVAL;
597
598 if (p->logical_bs_shift > p->physical_bs_shift)
599 return -EINVAL;
600
601 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
602 return -EINVAL;
603
604 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
605 return -EINVAL;
606 } else
607 return -EINVAL;
608
609 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
610 const struct ublk_param_discard *p = &ub->params.discard;
611
612 /* So far, only support single segment discard */
613 if (p->max_discard_sectors && p->max_discard_segments != 1)
614 return -EINVAL;
615
616 if (!p->discard_granularity)
617 return -EINVAL;
618 }
619
620 /* dev_t is read-only */
621 if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
622 return -EINVAL;
623
624 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
625 return ublk_dev_param_zoned_validate(ub);
626 else if (ublk_dev_is_zoned(ub))
627 return -EINVAL;
628
629 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
630 const struct ublk_param_dma_align *p = &ub->params.dma;
631
632 if (p->alignment >= PAGE_SIZE)
633 return -EINVAL;
634
635 if (!is_power_of_2(p->alignment + 1))
636 return -EINVAL;
637 }
638
639 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
640 const struct ublk_param_segment *p = &ub->params.seg;
641
642 if (!is_power_of_2(p->seg_boundary_mask + 1))
643 return -EINVAL;
644
645 if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
646 return -EINVAL;
647 if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
648 return -EINVAL;
649 }
650
651 return 0;
652 }
653
ublk_apply_params(struct ublk_device * ub)654 static void ublk_apply_params(struct ublk_device *ub)
655 {
656 ublk_dev_param_basic_apply(ub);
657
658 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
659 ublk_dev_param_zoned_apply(ub);
660 }
661
ublk_support_zero_copy(const struct ublk_queue * ubq)662 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
663 {
664 return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
665 }
666
ublk_support_auto_buf_reg(const struct ublk_queue * ubq)667 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
668 {
669 return ubq->flags & UBLK_F_AUTO_BUF_REG;
670 }
671
ublk_support_user_copy(const struct ublk_queue * ubq)672 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
673 {
674 return ubq->flags & UBLK_F_USER_COPY;
675 }
676
ublk_need_map_io(const struct ublk_queue * ubq)677 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
678 {
679 return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
680 !ublk_support_auto_buf_reg(ubq);
681 }
682
ublk_need_req_ref(const struct ublk_queue * ubq)683 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
684 {
685 /*
686 * read()/write() is involved in user copy, so request reference
687 * has to be grabbed
688 *
689 * for zero copy, request buffer need to be registered to io_uring
690 * buffer table, so reference is needed
691 *
692 * For auto buffer register, ublk server still may issue
693 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
694 * so reference is required too.
695 */
696 return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
697 ublk_support_auto_buf_reg(ubq);
698 }
699
ublk_init_req_ref(const struct ublk_queue * ubq,struct ublk_io * io)700 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
701 struct ublk_io *io)
702 {
703 if (ublk_need_req_ref(ubq))
704 refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
705 }
706
ublk_get_req_ref(struct ublk_io * io)707 static inline bool ublk_get_req_ref(struct ublk_io *io)
708 {
709 return refcount_inc_not_zero(&io->ref);
710 }
711
ublk_put_req_ref(struct ublk_io * io,struct request * req)712 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
713 {
714 if (refcount_dec_and_test(&io->ref))
715 __ublk_complete_rq(req);
716 }
717
ublk_sub_req_ref(struct ublk_io * io)718 static inline bool ublk_sub_req_ref(struct ublk_io *io)
719 {
720 unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
721
722 io->task_registered_buffers = 0;
723 return refcount_sub_and_test(sub_refs, &io->ref);
724 }
725
ublk_need_get_data(const struct ublk_queue * ubq)726 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
727 {
728 return ubq->flags & UBLK_F_NEED_GET_DATA;
729 }
730
731 /* Called in slow path only, keep it noinline for trace purpose */
ublk_get_device(struct ublk_device * ub)732 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
733 {
734 if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
735 return ub;
736 return NULL;
737 }
738
739 /* Called in slow path only, keep it noinline for trace purpose */
ublk_put_device(struct ublk_device * ub)740 static noinline void ublk_put_device(struct ublk_device *ub)
741 {
742 put_device(&ub->cdev_dev);
743 }
744
ublk_get_queue(struct ublk_device * dev,int qid)745 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
746 int qid)
747 {
748 return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
749 }
750
ublk_rq_has_data(const struct request * rq)751 static inline bool ublk_rq_has_data(const struct request *rq)
752 {
753 return bio_has_data(rq->bio);
754 }
755
756 static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device * ub,int q_id)757 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
758 {
759 return ublk_get_queue(ub, q_id)->io_cmd_buf;
760 }
761
__ublk_queue_cmd_buf_size(int depth)762 static inline int __ublk_queue_cmd_buf_size(int depth)
763 {
764 return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
765 }
766
ublk_queue_cmd_buf_size(struct ublk_device * ub,int q_id)767 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
768 {
769 struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
770
771 return __ublk_queue_cmd_buf_size(ubq->q_depth);
772 }
773
ublk_max_cmd_buf_size(void)774 static int ublk_max_cmd_buf_size(void)
775 {
776 return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
777 }
778
779 /*
780 * Should I/O outstanding to the ublk server when it exits be reissued?
781 * If not, outstanding I/O will get errors.
782 */
ublk_nosrv_should_reissue_outstanding(struct ublk_device * ub)783 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
784 {
785 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
786 (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
787 }
788
789 /*
790 * Should I/O issued while there is no ublk server queue? If not, I/O
791 * issued while there is no ublk server will get errors.
792 */
ublk_nosrv_dev_should_queue_io(struct ublk_device * ub)793 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
794 {
795 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
796 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
797 }
798
799 /*
800 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
801 * of the device flags for smaller cache footprint - better for fast
802 * paths.
803 */
ublk_nosrv_should_queue_io(struct ublk_queue * ubq)804 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
805 {
806 return (ubq->flags & UBLK_F_USER_RECOVERY) &&
807 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
808 }
809
810 /*
811 * Should ublk devices be stopped (i.e. no recovery possible) when the
812 * ublk server exits? If not, devices can be used again by a future
813 * incarnation of a ublk server via the start_recovery/end_recovery
814 * commands.
815 */
ublk_nosrv_should_stop_dev(struct ublk_device * ub)816 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
817 {
818 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
819 }
820
ublk_dev_in_recoverable_state(struct ublk_device * ub)821 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
822 {
823 return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
824 ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
825 }
826
ublk_free_disk(struct gendisk * disk)827 static void ublk_free_disk(struct gendisk *disk)
828 {
829 struct ublk_device *ub = disk->private_data;
830
831 clear_bit(UB_STATE_USED, &ub->state);
832 ublk_put_device(ub);
833 }
834
ublk_store_owner_uid_gid(unsigned int * owner_uid,unsigned int * owner_gid)835 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
836 unsigned int *owner_gid)
837 {
838 kuid_t uid;
839 kgid_t gid;
840
841 current_uid_gid(&uid, &gid);
842
843 *owner_uid = from_kuid(&init_user_ns, uid);
844 *owner_gid = from_kgid(&init_user_ns, gid);
845 }
846
ublk_open(struct gendisk * disk,blk_mode_t mode)847 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
848 {
849 struct ublk_device *ub = disk->private_data;
850
851 if (capable(CAP_SYS_ADMIN))
852 return 0;
853
854 /*
855 * If it is one unprivileged device, only owner can open
856 * the disk. Otherwise it could be one trap made by one
857 * evil user who grants this disk's privileges to other
858 * users deliberately.
859 *
860 * This way is reasonable too given anyone can create
861 * unprivileged device, and no need other's grant.
862 */
863 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
864 unsigned int curr_uid, curr_gid;
865
866 ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
867
868 if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
869 ub->dev_info.owner_gid)
870 return -EPERM;
871 }
872
873 return 0;
874 }
875
876 static const struct block_device_operations ub_fops = {
877 .owner = THIS_MODULE,
878 .open = ublk_open,
879 .free_disk = ublk_free_disk,
880 .report_zones = ublk_report_zones,
881 };
882
883 #define UBLK_MAX_PIN_PAGES 32
884
885 struct ublk_io_iter {
886 struct page *pages[UBLK_MAX_PIN_PAGES];
887 struct bio *bio;
888 struct bvec_iter iter;
889 };
890
891 /* return how many pages are copied */
ublk_copy_io_pages(struct ublk_io_iter * data,size_t total,size_t pg_off,int dir)892 static void ublk_copy_io_pages(struct ublk_io_iter *data,
893 size_t total, size_t pg_off, int dir)
894 {
895 unsigned done = 0;
896 unsigned pg_idx = 0;
897
898 while (done < total) {
899 struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
900 unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
901 (unsigned)(PAGE_SIZE - pg_off));
902 void *bv_buf = bvec_kmap_local(&bv);
903 void *pg_buf = kmap_local_page(data->pages[pg_idx]);
904
905 if (dir == ITER_DEST)
906 memcpy(pg_buf + pg_off, bv_buf, bytes);
907 else
908 memcpy(bv_buf, pg_buf + pg_off, bytes);
909
910 kunmap_local(pg_buf);
911 kunmap_local(bv_buf);
912
913 /* advance page array */
914 pg_off += bytes;
915 if (pg_off == PAGE_SIZE) {
916 pg_idx += 1;
917 pg_off = 0;
918 }
919
920 done += bytes;
921
922 /* advance bio */
923 bio_advance_iter_single(data->bio, &data->iter, bytes);
924 if (!data->iter.bi_size) {
925 data->bio = data->bio->bi_next;
926 if (data->bio == NULL)
927 break;
928 data->iter = data->bio->bi_iter;
929 }
930 }
931 }
932
ublk_advance_io_iter(const struct request * req,struct ublk_io_iter * iter,unsigned int offset)933 static bool ublk_advance_io_iter(const struct request *req,
934 struct ublk_io_iter *iter, unsigned int offset)
935 {
936 struct bio *bio = req->bio;
937
938 for_each_bio(bio) {
939 if (bio->bi_iter.bi_size > offset) {
940 iter->bio = bio;
941 iter->iter = bio->bi_iter;
942 bio_advance_iter(iter->bio, &iter->iter, offset);
943 return true;
944 }
945 offset -= bio->bi_iter.bi_size;
946 }
947 return false;
948 }
949
950 /*
951 * Copy data between request pages and io_iter, and 'offset'
952 * is the start point of linear offset of request.
953 */
ublk_copy_user_pages(const struct request * req,unsigned offset,struct iov_iter * uiter,int dir)954 static size_t ublk_copy_user_pages(const struct request *req,
955 unsigned offset, struct iov_iter *uiter, int dir)
956 {
957 struct ublk_io_iter iter;
958 size_t done = 0;
959
960 if (!ublk_advance_io_iter(req, &iter, offset))
961 return 0;
962
963 while (iov_iter_count(uiter) && iter.bio) {
964 unsigned nr_pages;
965 ssize_t len;
966 size_t off;
967 int i;
968
969 len = iov_iter_get_pages2(uiter, iter.pages,
970 iov_iter_count(uiter),
971 UBLK_MAX_PIN_PAGES, &off);
972 if (len <= 0)
973 return done;
974
975 ublk_copy_io_pages(&iter, len, off, dir);
976 nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
977 for (i = 0; i < nr_pages; i++) {
978 if (dir == ITER_DEST)
979 set_page_dirty(iter.pages[i]);
980 put_page(iter.pages[i]);
981 }
982 done += len;
983 }
984
985 return done;
986 }
987
ublk_need_map_req(const struct request * req)988 static inline bool ublk_need_map_req(const struct request *req)
989 {
990 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
991 }
992
ublk_need_unmap_req(const struct request * req)993 static inline bool ublk_need_unmap_req(const struct request *req)
994 {
995 return ublk_rq_has_data(req) &&
996 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
997 }
998
ublk_map_io(const struct ublk_queue * ubq,const struct request * req,const struct ublk_io * io)999 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
1000 const struct ublk_io *io)
1001 {
1002 const unsigned int rq_bytes = blk_rq_bytes(req);
1003
1004 if (!ublk_need_map_io(ubq))
1005 return rq_bytes;
1006
1007 /*
1008 * no zero copy, we delay copy WRITE request data into ublksrv
1009 * context and the big benefit is that pinning pages in current
1010 * context is pretty fast, see ublk_pin_user_pages
1011 */
1012 if (ublk_need_map_req(req)) {
1013 struct iov_iter iter;
1014 const int dir = ITER_DEST;
1015
1016 import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
1017 return ublk_copy_user_pages(req, 0, &iter, dir);
1018 }
1019 return rq_bytes;
1020 }
1021
ublk_unmap_io(const struct ublk_queue * ubq,const struct request * req,const struct ublk_io * io)1022 static int ublk_unmap_io(const struct ublk_queue *ubq,
1023 const struct request *req,
1024 const struct ublk_io *io)
1025 {
1026 const unsigned int rq_bytes = blk_rq_bytes(req);
1027
1028 if (!ublk_need_map_io(ubq))
1029 return rq_bytes;
1030
1031 if (ublk_need_unmap_req(req)) {
1032 struct iov_iter iter;
1033 const int dir = ITER_SOURCE;
1034
1035 WARN_ON_ONCE(io->res > rq_bytes);
1036
1037 import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
1038 return ublk_copy_user_pages(req, 0, &iter, dir);
1039 }
1040 return rq_bytes;
1041 }
1042
ublk_req_build_flags(struct request * req)1043 static inline unsigned int ublk_req_build_flags(struct request *req)
1044 {
1045 unsigned flags = 0;
1046
1047 if (req->cmd_flags & REQ_FAILFAST_DEV)
1048 flags |= UBLK_IO_F_FAILFAST_DEV;
1049
1050 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1051 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1052
1053 if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1054 flags |= UBLK_IO_F_FAILFAST_DRIVER;
1055
1056 if (req->cmd_flags & REQ_META)
1057 flags |= UBLK_IO_F_META;
1058
1059 if (req->cmd_flags & REQ_FUA)
1060 flags |= UBLK_IO_F_FUA;
1061
1062 if (req->cmd_flags & REQ_NOUNMAP)
1063 flags |= UBLK_IO_F_NOUNMAP;
1064
1065 if (req->cmd_flags & REQ_SWAP)
1066 flags |= UBLK_IO_F_SWAP;
1067
1068 return flags;
1069 }
1070
ublk_setup_iod(struct ublk_queue * ubq,struct request * req)1071 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1072 {
1073 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1074 struct ublk_io *io = &ubq->ios[req->tag];
1075 enum req_op op = req_op(req);
1076 u32 ublk_op;
1077
1078 if (!ublk_queue_is_zoned(ubq) &&
1079 (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
1080 return BLK_STS_IOERR;
1081
1082 switch (req_op(req)) {
1083 case REQ_OP_READ:
1084 ublk_op = UBLK_IO_OP_READ;
1085 break;
1086 case REQ_OP_WRITE:
1087 ublk_op = UBLK_IO_OP_WRITE;
1088 break;
1089 case REQ_OP_FLUSH:
1090 ublk_op = UBLK_IO_OP_FLUSH;
1091 break;
1092 case REQ_OP_DISCARD:
1093 ublk_op = UBLK_IO_OP_DISCARD;
1094 break;
1095 case REQ_OP_WRITE_ZEROES:
1096 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1097 break;
1098 default:
1099 if (ublk_queue_is_zoned(ubq))
1100 return ublk_setup_iod_zoned(ubq, req);
1101 return BLK_STS_IOERR;
1102 }
1103
1104 /* need to translate since kernel may change */
1105 iod->op_flags = ublk_op | ublk_req_build_flags(req);
1106 iod->nr_sectors = blk_rq_sectors(req);
1107 iod->start_sector = blk_rq_pos(req);
1108 iod->addr = io->addr;
1109
1110 return BLK_STS_OK;
1111 }
1112
ublk_get_uring_cmd_pdu(struct io_uring_cmd * ioucmd)1113 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1114 struct io_uring_cmd *ioucmd)
1115 {
1116 return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1117 }
1118
1119 /* todo: handle partial completion */
__ublk_complete_rq(struct request * req)1120 static inline void __ublk_complete_rq(struct request *req)
1121 {
1122 struct ublk_queue *ubq = req->mq_hctx->driver_data;
1123 struct ublk_io *io = &ubq->ios[req->tag];
1124 unsigned int unmapped_bytes;
1125 blk_status_t res = BLK_STS_OK;
1126
1127 /* failed read IO if nothing is read */
1128 if (!io->res && req_op(req) == REQ_OP_READ)
1129 io->res = -EIO;
1130
1131 if (io->res < 0) {
1132 res = errno_to_blk_status(io->res);
1133 goto exit;
1134 }
1135
1136 /*
1137 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them
1138 * directly.
1139 *
1140 * Both the two needn't unmap.
1141 */
1142 if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1143 req_op(req) != REQ_OP_DRV_IN)
1144 goto exit;
1145
1146 /* for READ request, writing data in iod->addr to rq buffers */
1147 unmapped_bytes = ublk_unmap_io(ubq, req, io);
1148
1149 /*
1150 * Extremely impossible since we got data filled in just before
1151 *
1152 * Re-read simply for this unlikely case.
1153 */
1154 if (unlikely(unmapped_bytes < io->res))
1155 io->res = unmapped_bytes;
1156
1157 if (blk_update_request(req, BLK_STS_OK, io->res))
1158 blk_mq_requeue_request(req, true);
1159 else if (likely(!blk_should_fake_timeout(req->q)))
1160 __blk_mq_end_request(req, BLK_STS_OK);
1161
1162 return;
1163 exit:
1164 blk_mq_end_request(req, res);
1165 }
1166
__ublk_prep_compl_io_cmd(struct ublk_io * io,struct request * req)1167 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1168 struct request *req)
1169 {
1170 /* read cmd first because req will overwrite it */
1171 struct io_uring_cmd *cmd = io->cmd;
1172
1173 /* mark this cmd owned by ublksrv */
1174 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1175
1176 /*
1177 * clear ACTIVE since we are done with this sqe/cmd slot
1178 * We can only accept io cmd in case of being not active.
1179 */
1180 io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1181
1182 io->req = req;
1183 return cmd;
1184 }
1185
ublk_complete_io_cmd(struct ublk_io * io,struct request * req,int res,unsigned issue_flags)1186 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1187 int res, unsigned issue_flags)
1188 {
1189 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1190
1191 /* tell ublksrv one io request is coming */
1192 io_uring_cmd_done(cmd, res, 0, issue_flags);
1193 }
1194
1195 #define UBLK_REQUEUE_DELAY_MS 3
1196
__ublk_abort_rq(struct ublk_queue * ubq,struct request * rq)1197 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1198 struct request *rq)
1199 {
1200 /* We cannot process this rq so just requeue it. */
1201 if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1202 blk_mq_requeue_request(rq, false);
1203 else
1204 blk_mq_end_request(rq, BLK_STS_IOERR);
1205 }
1206
1207 static void
ublk_auto_buf_reg_fallback(const struct ublk_queue * ubq,struct ublk_io * io)1208 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
1209 {
1210 unsigned tag = io - ubq->ios;
1211 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1212
1213 iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1214 }
1215
ublk_auto_buf_reg(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io,unsigned int issue_flags)1216 static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
1217 struct ublk_io *io, unsigned int issue_flags)
1218 {
1219 int ret;
1220
1221 ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
1222 io->buf.index, issue_flags);
1223 if (ret) {
1224 if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1225 ublk_auto_buf_reg_fallback(ubq, io);
1226 return true;
1227 }
1228 blk_mq_end_request(req, BLK_STS_IOERR);
1229 return false;
1230 }
1231
1232 io->task_registered_buffers = 1;
1233 io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
1234 io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1235 return true;
1236 }
1237
ublk_prep_auto_buf_reg(struct ublk_queue * ubq,struct request * req,struct ublk_io * io,unsigned int issue_flags)1238 static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
1239 struct request *req, struct ublk_io *io,
1240 unsigned int issue_flags)
1241 {
1242 ublk_init_req_ref(ubq, io);
1243 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
1244 return ublk_auto_buf_reg(ubq, req, io, issue_flags);
1245
1246 return true;
1247 }
1248
ublk_start_io(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io)1249 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1250 struct ublk_io *io)
1251 {
1252 unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1253
1254 /* partially mapped, update io descriptor */
1255 if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1256 /*
1257 * Nothing mapped, retry until we succeed.
1258 *
1259 * We may never succeed in mapping any bytes here because
1260 * of OOM. TODO: reserve one buffer with single page pinned
1261 * for providing forward progress guarantee.
1262 */
1263 if (unlikely(!mapped_bytes)) {
1264 blk_mq_requeue_request(req, false);
1265 blk_mq_delay_kick_requeue_list(req->q,
1266 UBLK_REQUEUE_DELAY_MS);
1267 return false;
1268 }
1269
1270 ublk_get_iod(ubq, req->tag)->nr_sectors =
1271 mapped_bytes >> 9;
1272 }
1273
1274 return true;
1275 }
1276
ublk_dispatch_req(struct ublk_queue * ubq,struct request * req,unsigned int issue_flags)1277 static void ublk_dispatch_req(struct ublk_queue *ubq,
1278 struct request *req,
1279 unsigned int issue_flags)
1280 {
1281 int tag = req->tag;
1282 struct ublk_io *io = &ubq->ios[tag];
1283
1284 pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1285 __func__, ubq->q_id, req->tag, io->flags,
1286 ublk_get_iod(ubq, req->tag)->addr);
1287
1288 /*
1289 * Task is exiting if either:
1290 *
1291 * (1) current != io->task.
1292 * io_uring_cmd_complete_in_task() tries to run task_work
1293 * in a workqueue if cmd's task is PF_EXITING.
1294 *
1295 * (2) current->flags & PF_EXITING.
1296 */
1297 if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1298 __ublk_abort_rq(ubq, req);
1299 return;
1300 }
1301
1302 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1303 /*
1304 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1305 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1306 * and notify it.
1307 */
1308 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1309 pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1310 __func__, ubq->q_id, req->tag, io->flags);
1311 ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1312 issue_flags);
1313 return;
1314 }
1315
1316 if (!ublk_start_io(ubq, req, io))
1317 return;
1318
1319 if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
1320 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1321 }
1322
ublk_cmd_tw_cb(struct io_uring_cmd * cmd,unsigned int issue_flags)1323 static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
1324 unsigned int issue_flags)
1325 {
1326 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1327 struct ublk_queue *ubq = pdu->ubq;
1328
1329 ublk_dispatch_req(ubq, pdu->req, issue_flags);
1330 }
1331
ublk_queue_cmd(struct ublk_queue * ubq,struct request * rq)1332 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1333 {
1334 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
1335 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1336
1337 pdu->req = rq;
1338 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
1339 }
1340
ublk_cmd_list_tw_cb(struct io_uring_cmd * cmd,unsigned int issue_flags)1341 static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
1342 unsigned int issue_flags)
1343 {
1344 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1345 struct request *rq = pdu->req_list;
1346 struct request *next;
1347
1348 do {
1349 next = rq->rq_next;
1350 rq->rq_next = NULL;
1351 ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
1352 rq = next;
1353 } while (rq);
1354 }
1355
ublk_queue_cmd_list(struct ublk_io * io,struct rq_list * l)1356 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
1357 {
1358 struct io_uring_cmd *cmd = io->cmd;
1359 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1360
1361 pdu->req_list = rq_list_peek(l);
1362 rq_list_init(l);
1363 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
1364 }
1365
ublk_timeout(struct request * rq)1366 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1367 {
1368 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1369 pid_t tgid = ubq->dev->ublksrv_tgid;
1370 struct task_struct *p;
1371 struct pid *pid;
1372
1373 if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
1374 return BLK_EH_RESET_TIMER;
1375
1376 if (unlikely(!tgid))
1377 return BLK_EH_RESET_TIMER;
1378
1379 rcu_read_lock();
1380 pid = find_vpid(tgid);
1381 p = pid_task(pid, PIDTYPE_PID);
1382 if (p)
1383 send_sig(SIGKILL, p, 0);
1384 rcu_read_unlock();
1385 return BLK_EH_DONE;
1386 }
1387
ublk_prep_req(struct ublk_queue * ubq,struct request * rq,bool check_cancel)1388 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
1389 bool check_cancel)
1390 {
1391 blk_status_t res;
1392
1393 if (unlikely(READ_ONCE(ubq->fail_io)))
1394 return BLK_STS_TARGET;
1395
1396 /* With recovery feature enabled, force_abort is set in
1397 * ublk_stop_dev() before calling del_gendisk(). We have to
1398 * abort all requeued and new rqs here to let del_gendisk()
1399 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
1400 * to avoid UAF on io_uring ctx.
1401 *
1402 * Note: force_abort is guaranteed to be seen because it is set
1403 * before request queue is unqiuesced.
1404 */
1405 if (ublk_nosrv_should_queue_io(ubq) &&
1406 unlikely(READ_ONCE(ubq->force_abort)))
1407 return BLK_STS_IOERR;
1408
1409 if (check_cancel && unlikely(ubq->canceling))
1410 return BLK_STS_IOERR;
1411
1412 /* fill iod to slot in io cmd buffer */
1413 res = ublk_setup_iod(ubq, rq);
1414 if (unlikely(res != BLK_STS_OK))
1415 return BLK_STS_IOERR;
1416
1417 blk_mq_start_request(rq);
1418 return BLK_STS_OK;
1419 }
1420
ublk_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)1421 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
1422 const struct blk_mq_queue_data *bd)
1423 {
1424 struct ublk_queue *ubq = hctx->driver_data;
1425 struct request *rq = bd->rq;
1426 blk_status_t res;
1427
1428 res = ublk_prep_req(ubq, rq, false);
1429 if (res != BLK_STS_OK)
1430 return res;
1431
1432 /*
1433 * ->canceling has to be handled after ->force_abort and ->fail_io
1434 * is dealt with, otherwise this request may not be failed in case
1435 * of recovery, and cause hang when deleting disk
1436 */
1437 if (unlikely(ubq->canceling)) {
1438 __ublk_abort_rq(ubq, rq);
1439 return BLK_STS_OK;
1440 }
1441
1442 ublk_queue_cmd(ubq, rq);
1443 return BLK_STS_OK;
1444 }
1445
ublk_belong_to_same_batch(const struct ublk_io * io,const struct ublk_io * io2)1446 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
1447 const struct ublk_io *io2)
1448 {
1449 return (io_uring_cmd_ctx_handle(io->cmd) ==
1450 io_uring_cmd_ctx_handle(io2->cmd)) &&
1451 (io->task == io2->task);
1452 }
1453
ublk_queue_rqs(struct rq_list * rqlist)1454 static void ublk_queue_rqs(struct rq_list *rqlist)
1455 {
1456 struct rq_list requeue_list = { };
1457 struct rq_list submit_list = { };
1458 struct ublk_io *io = NULL;
1459 struct request *req;
1460
1461 while ((req = rq_list_pop(rqlist))) {
1462 struct ublk_queue *this_q = req->mq_hctx->driver_data;
1463 struct ublk_io *this_io = &this_q->ios[req->tag];
1464
1465 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
1466 rq_list_add_tail(&requeue_list, req);
1467 continue;
1468 }
1469
1470 if (io && !ublk_belong_to_same_batch(io, this_io) &&
1471 !rq_list_empty(&submit_list))
1472 ublk_queue_cmd_list(io, &submit_list);
1473 io = this_io;
1474 rq_list_add_tail(&submit_list, req);
1475 }
1476
1477 if (!rq_list_empty(&submit_list))
1478 ublk_queue_cmd_list(io, &submit_list);
1479 *rqlist = requeue_list;
1480 }
1481
ublk_init_hctx(struct blk_mq_hw_ctx * hctx,void * driver_data,unsigned int hctx_idx)1482 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1483 unsigned int hctx_idx)
1484 {
1485 struct ublk_device *ub = driver_data;
1486 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
1487
1488 hctx->driver_data = ubq;
1489 return 0;
1490 }
1491
1492 static const struct blk_mq_ops ublk_mq_ops = {
1493 .queue_rq = ublk_queue_rq,
1494 .queue_rqs = ublk_queue_rqs,
1495 .init_hctx = ublk_init_hctx,
1496 .timeout = ublk_timeout,
1497 };
1498
ublk_queue_reinit(struct ublk_device * ub,struct ublk_queue * ubq)1499 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
1500 {
1501 int i;
1502
1503 /* All old ioucmds have to be completed */
1504 ubq->nr_io_ready = 0;
1505
1506 for (i = 0; i < ubq->q_depth; i++) {
1507 struct ublk_io *io = &ubq->ios[i];
1508
1509 /*
1510 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
1511 * io->cmd
1512 */
1513 io->flags &= UBLK_IO_FLAG_CANCELED;
1514 io->cmd = NULL;
1515 io->addr = 0;
1516
1517 /*
1518 * old task is PF_EXITING, put it now
1519 *
1520 * It could be NULL in case of closing one quiesced
1521 * device.
1522 */
1523 if (io->task) {
1524 put_task_struct(io->task);
1525 io->task = NULL;
1526 }
1527
1528 WARN_ON_ONCE(refcount_read(&io->ref));
1529 WARN_ON_ONCE(io->task_registered_buffers);
1530 }
1531 }
1532
ublk_ch_open(struct inode * inode,struct file * filp)1533 static int ublk_ch_open(struct inode *inode, struct file *filp)
1534 {
1535 struct ublk_device *ub = container_of(inode->i_cdev,
1536 struct ublk_device, cdev);
1537
1538 if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
1539 return -EBUSY;
1540 filp->private_data = ub;
1541 ub->ublksrv_tgid = current->tgid;
1542 return 0;
1543 }
1544
ublk_reset_ch_dev(struct ublk_device * ub)1545 static void ublk_reset_ch_dev(struct ublk_device *ub)
1546 {
1547 int i;
1548
1549 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1550 ublk_queue_reinit(ub, ublk_get_queue(ub, i));
1551
1552 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
1553 ub->mm = NULL;
1554 ub->nr_queues_ready = 0;
1555 ub->unprivileged_daemons = false;
1556 ub->ublksrv_tgid = -1;
1557 }
1558
ublk_get_disk(struct ublk_device * ub)1559 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
1560 {
1561 struct gendisk *disk;
1562
1563 spin_lock(&ub->lock);
1564 disk = ub->ub_disk;
1565 if (disk)
1566 get_device(disk_to_dev(disk));
1567 spin_unlock(&ub->lock);
1568
1569 return disk;
1570 }
1571
ublk_put_disk(struct gendisk * disk)1572 static void ublk_put_disk(struct gendisk *disk)
1573 {
1574 if (disk)
1575 put_device(disk_to_dev(disk));
1576 }
1577
1578 /*
1579 * Use this function to ensure that ->canceling is consistently set for
1580 * the device and all queues. Do not set these flags directly.
1581 *
1582 * Caller must ensure that:
1583 * - cancel_mutex is held. This ensures that there is no concurrent
1584 * access to ub->canceling and no concurrent writes to ubq->canceling.
1585 * - there are no concurrent reads of ubq->canceling from the queue_rq
1586 * path. This can be done by quiescing the queue, or through other
1587 * means.
1588 */
ublk_set_canceling(struct ublk_device * ub,bool canceling)1589 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
1590 __must_hold(&ub->cancel_mutex)
1591 {
1592 int i;
1593
1594 ub->canceling = canceling;
1595 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1596 ublk_get_queue(ub, i)->canceling = canceling;
1597 }
1598
ublk_check_and_reset_active_ref(struct ublk_device * ub)1599 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
1600 {
1601 int i, j;
1602
1603 if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY |
1604 UBLK_F_AUTO_BUF_REG)))
1605 return false;
1606
1607 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
1608 struct ublk_queue *ubq = ublk_get_queue(ub, i);
1609
1610 for (j = 0; j < ubq->q_depth; j++) {
1611 struct ublk_io *io = &ubq->ios[j];
1612 unsigned int refs = refcount_read(&io->ref) +
1613 io->task_registered_buffers;
1614
1615 /*
1616 * UBLK_REFCOUNT_INIT or zero means no active
1617 * reference
1618 */
1619 if (refs != UBLK_REFCOUNT_INIT && refs != 0)
1620 return true;
1621
1622 /* reset to zero if the io hasn't active references */
1623 refcount_set(&io->ref, 0);
1624 io->task_registered_buffers = 0;
1625 }
1626 }
1627 return false;
1628 }
1629
ublk_ch_release_work_fn(struct work_struct * work)1630 static void ublk_ch_release_work_fn(struct work_struct *work)
1631 {
1632 struct ublk_device *ub =
1633 container_of(work, struct ublk_device, exit_work.work);
1634 struct gendisk *disk;
1635 int i;
1636
1637 /*
1638 * For zero-copy and auto buffer register modes, I/O references
1639 * might not be dropped naturally when the daemon is killed, but
1640 * io_uring guarantees that registered bvec kernel buffers are
1641 * unregistered finally when freeing io_uring context, then the
1642 * active references are dropped.
1643 *
1644 * Wait until active references are dropped for avoiding use-after-free
1645 *
1646 * registered buffer may be unregistered in io_ring's release hander,
1647 * so have to wait by scheduling work function for avoiding the two
1648 * file release dependency.
1649 */
1650 if (ublk_check_and_reset_active_ref(ub)) {
1651 schedule_delayed_work(&ub->exit_work, 1);
1652 return;
1653 }
1654
1655 /*
1656 * disk isn't attached yet, either device isn't live, or it has
1657 * been removed already, so we needn't to do anything
1658 */
1659 disk = ublk_get_disk(ub);
1660 if (!disk)
1661 goto out;
1662
1663 /*
1664 * All uring_cmd are done now, so abort any request outstanding to
1665 * the ublk server
1666 *
1667 * This can be done in lockless way because ublk server has been
1668 * gone
1669 *
1670 * More importantly, we have to provide forward progress guarantee
1671 * without holding ub->mutex, otherwise control task grabbing
1672 * ub->mutex triggers deadlock
1673 *
1674 * All requests may be inflight, so ->canceling may not be set, set
1675 * it now.
1676 */
1677 mutex_lock(&ub->cancel_mutex);
1678 ublk_set_canceling(ub, true);
1679 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1680 ublk_abort_queue(ub, ublk_get_queue(ub, i));
1681 mutex_unlock(&ub->cancel_mutex);
1682 blk_mq_kick_requeue_list(disk->queue);
1683
1684 /*
1685 * All infligh requests have been completed or requeued and any new
1686 * request will be failed or requeued via `->canceling` now, so it is
1687 * fine to grab ub->mutex now.
1688 */
1689 mutex_lock(&ub->mutex);
1690
1691 /* double check after grabbing lock */
1692 if (!ub->ub_disk)
1693 goto unlock;
1694
1695 /*
1696 * Transition the device to the nosrv state. What exactly this
1697 * means depends on the recovery flags
1698 */
1699 if (ublk_nosrv_should_stop_dev(ub)) {
1700 /*
1701 * Allow any pending/future I/O to pass through quickly
1702 * with an error. This is needed because del_gendisk
1703 * waits for all pending I/O to complete
1704 */
1705 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1706 WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
1707
1708 ublk_stop_dev_unlocked(ub);
1709 } else {
1710 if (ublk_nosrv_dev_should_queue_io(ub)) {
1711 /* ->canceling is set and all requests are aborted */
1712 ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1713 } else {
1714 ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
1715 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1716 WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
1717 }
1718 }
1719 unlock:
1720 mutex_unlock(&ub->mutex);
1721 ublk_put_disk(disk);
1722
1723 /* all uring_cmd has been done now, reset device & ubq */
1724 ublk_reset_ch_dev(ub);
1725 out:
1726 clear_bit(UB_STATE_OPEN, &ub->state);
1727
1728 /* put the reference grabbed in ublk_ch_release() */
1729 ublk_put_device(ub);
1730 }
1731
ublk_ch_release(struct inode * inode,struct file * filp)1732 static int ublk_ch_release(struct inode *inode, struct file *filp)
1733 {
1734 struct ublk_device *ub = filp->private_data;
1735
1736 /*
1737 * Grab ublk device reference, so it won't be gone until we are
1738 * really released from work function.
1739 */
1740 ublk_get_device(ub);
1741
1742 INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
1743 schedule_delayed_work(&ub->exit_work, 0);
1744 return 0;
1745 }
1746
1747 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
ublk_ch_mmap(struct file * filp,struct vm_area_struct * vma)1748 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
1749 {
1750 struct ublk_device *ub = filp->private_data;
1751 size_t sz = vma->vm_end - vma->vm_start;
1752 unsigned max_sz = ublk_max_cmd_buf_size();
1753 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
1754 int q_id, ret = 0;
1755
1756 spin_lock(&ub->lock);
1757 if (!ub->mm)
1758 ub->mm = current->mm;
1759 if (current->mm != ub->mm)
1760 ret = -EINVAL;
1761 spin_unlock(&ub->lock);
1762
1763 if (ret)
1764 return ret;
1765
1766 if (vma->vm_flags & VM_WRITE)
1767 return -EPERM;
1768
1769 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
1770 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
1771 return -EINVAL;
1772
1773 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
1774 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
1775 __func__, q_id, current->pid, vma->vm_start,
1776 phys_off, (unsigned long)sz);
1777
1778 if (sz != ublk_queue_cmd_buf_size(ub, q_id))
1779 return -EINVAL;
1780
1781 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
1782 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1783 }
1784
__ublk_fail_req(struct ublk_queue * ubq,struct ublk_io * io,struct request * req)1785 static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
1786 struct request *req)
1787 {
1788 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
1789
1790 if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
1791 blk_mq_requeue_request(req, false);
1792 else {
1793 io->res = -EIO;
1794 __ublk_complete_rq(req);
1795 }
1796 }
1797
1798 /*
1799 * Called from ublk char device release handler, when any uring_cmd is
1800 * done, meantime request queue is "quiesced" since all inflight requests
1801 * can't be completed because ublk server is dead.
1802 *
1803 * So no one can hold our request IO reference any more, simply ignore the
1804 * reference, and complete the request immediately
1805 */
ublk_abort_queue(struct ublk_device * ub,struct ublk_queue * ubq)1806 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
1807 {
1808 int i;
1809
1810 for (i = 0; i < ubq->q_depth; i++) {
1811 struct ublk_io *io = &ubq->ios[i];
1812
1813 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1814 __ublk_fail_req(ubq, io, io->req);
1815 }
1816 }
1817
ublk_start_cancel(struct ublk_device * ub)1818 static void ublk_start_cancel(struct ublk_device *ub)
1819 {
1820 struct gendisk *disk = ublk_get_disk(ub);
1821
1822 /* Our disk has been dead */
1823 if (!disk)
1824 return;
1825
1826 mutex_lock(&ub->cancel_mutex);
1827 if (ub->canceling)
1828 goto out;
1829 /*
1830 * Now we are serialized with ublk_queue_rq()
1831 *
1832 * Make sure that ubq->canceling is set when queue is frozen,
1833 * because ublk_queue_rq() has to rely on this flag for avoiding to
1834 * touch completed uring_cmd
1835 */
1836 blk_mq_quiesce_queue(disk->queue);
1837 ublk_set_canceling(ub, true);
1838 blk_mq_unquiesce_queue(disk->queue);
1839 out:
1840 mutex_unlock(&ub->cancel_mutex);
1841 ublk_put_disk(disk);
1842 }
1843
ublk_cancel_cmd(struct ublk_queue * ubq,unsigned tag,unsigned int issue_flags)1844 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
1845 unsigned int issue_flags)
1846 {
1847 struct ublk_io *io = &ubq->ios[tag];
1848 struct ublk_device *ub = ubq->dev;
1849 struct request *req;
1850 bool done;
1851
1852 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1853 return;
1854
1855 /*
1856 * Don't try to cancel this command if the request is started for
1857 * avoiding race between io_uring_cmd_done() and
1858 * io_uring_cmd_complete_in_task().
1859 *
1860 * Either the started request will be aborted via __ublk_abort_rq(),
1861 * then this uring_cmd is canceled next time, or it will be done in
1862 * task work function ublk_dispatch_req() because io_uring guarantees
1863 * that ublk_dispatch_req() is always called
1864 */
1865 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1866 if (req && blk_mq_request_started(req) && req->tag == tag)
1867 return;
1868
1869 spin_lock(&ubq->cancel_lock);
1870 done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
1871 if (!done)
1872 io->flags |= UBLK_IO_FLAG_CANCELED;
1873 spin_unlock(&ubq->cancel_lock);
1874
1875 if (!done)
1876 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
1877 }
1878
1879 /*
1880 * The ublk char device won't be closed when calling cancel fn, so both
1881 * ublk device and queue are guaranteed to be live
1882 *
1883 * Two-stage cancel:
1884 *
1885 * - make every active uring_cmd done in ->cancel_fn()
1886 *
1887 * - aborting inflight ublk IO requests in ublk char device release handler,
1888 * which depends on 1st stage because device can only be closed iff all
1889 * uring_cmd are done
1890 *
1891 * Do _not_ try to acquire ub->mutex before all inflight requests are
1892 * aborted, otherwise deadlock may be caused.
1893 */
ublk_uring_cmd_cancel_fn(struct io_uring_cmd * cmd,unsigned int issue_flags)1894 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
1895 unsigned int issue_flags)
1896 {
1897 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1898 struct ublk_queue *ubq = pdu->ubq;
1899 struct task_struct *task;
1900 struct ublk_io *io;
1901
1902 if (WARN_ON_ONCE(!ubq))
1903 return;
1904
1905 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
1906 return;
1907
1908 task = io_uring_cmd_get_task(cmd);
1909 io = &ubq->ios[pdu->tag];
1910 if (WARN_ON_ONCE(task && task != io->task))
1911 return;
1912
1913 ublk_start_cancel(ubq->dev);
1914
1915 WARN_ON_ONCE(io->cmd != cmd);
1916 ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
1917 }
1918
ublk_queue_ready(struct ublk_queue * ubq)1919 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1920 {
1921 return ubq->nr_io_ready == ubq->q_depth;
1922 }
1923
ublk_cancel_queue(struct ublk_queue * ubq)1924 static void ublk_cancel_queue(struct ublk_queue *ubq)
1925 {
1926 int i;
1927
1928 for (i = 0; i < ubq->q_depth; i++)
1929 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
1930 }
1931
1932 /* Cancel all pending commands, must be called after del_gendisk() returns */
ublk_cancel_dev(struct ublk_device * ub)1933 static void ublk_cancel_dev(struct ublk_device *ub)
1934 {
1935 int i;
1936
1937 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1938 ublk_cancel_queue(ublk_get_queue(ub, i));
1939 }
1940
ublk_check_inflight_rq(struct request * rq,void * data)1941 static bool ublk_check_inflight_rq(struct request *rq, void *data)
1942 {
1943 bool *idle = data;
1944
1945 if (blk_mq_request_started(rq)) {
1946 *idle = false;
1947 return false;
1948 }
1949 return true;
1950 }
1951
ublk_wait_tagset_rqs_idle(struct ublk_device * ub)1952 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1953 {
1954 bool idle;
1955
1956 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1957 while (true) {
1958 idle = true;
1959 blk_mq_tagset_busy_iter(&ub->tag_set,
1960 ublk_check_inflight_rq, &idle);
1961 if (idle)
1962 break;
1963 msleep(UBLK_REQUEUE_DELAY_MS);
1964 }
1965 }
1966
ublk_force_abort_dev(struct ublk_device * ub)1967 static void ublk_force_abort_dev(struct ublk_device *ub)
1968 {
1969 int i;
1970
1971 pr_devel("%s: force abort ub: dev_id %d state %s\n",
1972 __func__, ub->dev_info.dev_id,
1973 ub->dev_info.state == UBLK_S_DEV_LIVE ?
1974 "LIVE" : "QUIESCED");
1975 blk_mq_quiesce_queue(ub->ub_disk->queue);
1976 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1977 ublk_wait_tagset_rqs_idle(ub);
1978
1979 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1980 ublk_get_queue(ub, i)->force_abort = true;
1981 blk_mq_unquiesce_queue(ub->ub_disk->queue);
1982 /* We may have requeued some rqs in ublk_quiesce_queue() */
1983 blk_mq_kick_requeue_list(ub->ub_disk->queue);
1984 }
1985
ublk_detach_disk(struct ublk_device * ub)1986 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
1987 {
1988 struct gendisk *disk;
1989
1990 /* Sync with ublk_abort_queue() by holding the lock */
1991 spin_lock(&ub->lock);
1992 disk = ub->ub_disk;
1993 ub->dev_info.state = UBLK_S_DEV_DEAD;
1994 ub->dev_info.ublksrv_pid = -1;
1995 ub->ub_disk = NULL;
1996 spin_unlock(&ub->lock);
1997
1998 return disk;
1999 }
2000
ublk_stop_dev_unlocked(struct ublk_device * ub)2001 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2002 __must_hold(&ub->mutex)
2003 {
2004 struct gendisk *disk;
2005
2006 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2007 return;
2008
2009 if (ublk_nosrv_dev_should_queue_io(ub))
2010 ublk_force_abort_dev(ub);
2011 del_gendisk(ub->ub_disk);
2012 disk = ublk_detach_disk(ub);
2013 put_disk(disk);
2014 }
2015
ublk_stop_dev(struct ublk_device * ub)2016 static void ublk_stop_dev(struct ublk_device *ub)
2017 {
2018 mutex_lock(&ub->mutex);
2019 ublk_stop_dev_unlocked(ub);
2020 mutex_unlock(&ub->mutex);
2021 ublk_cancel_dev(ub);
2022 }
2023
2024 /* reset ublk io_uring queue & io flags */
ublk_reset_io_flags(struct ublk_device * ub)2025 static void ublk_reset_io_flags(struct ublk_device *ub)
2026 {
2027 int i, j;
2028
2029 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2030 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2031
2032 /* UBLK_IO_FLAG_CANCELED can be cleared now */
2033 spin_lock(&ubq->cancel_lock);
2034 for (j = 0; j < ubq->q_depth; j++)
2035 ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2036 spin_unlock(&ubq->cancel_lock);
2037 ubq->fail_io = false;
2038 }
2039 mutex_lock(&ub->cancel_mutex);
2040 ublk_set_canceling(ub, false);
2041 mutex_unlock(&ub->cancel_mutex);
2042 }
2043
2044 /* device can only be started after all IOs are ready */
ublk_mark_io_ready(struct ublk_device * ub,struct ublk_queue * ubq)2045 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
2046 __must_hold(&ub->mutex)
2047 {
2048 ubq->nr_io_ready++;
2049 if (ublk_queue_ready(ubq))
2050 ub->nr_queues_ready++;
2051 if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2052 ub->unprivileged_daemons = true;
2053
2054 if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
2055 /* now we are ready for handling ublk io request */
2056 ublk_reset_io_flags(ub);
2057 complete_all(&ub->completion);
2058 }
2059 }
2060
ublk_check_cmd_op(u32 cmd_op)2061 static inline int ublk_check_cmd_op(u32 cmd_op)
2062 {
2063 u32 ioc_type = _IOC_TYPE(cmd_op);
2064
2065 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2066 return -EOPNOTSUPP;
2067
2068 if (ioc_type != 'u' && ioc_type != 0)
2069 return -EOPNOTSUPP;
2070
2071 return 0;
2072 }
2073
ublk_set_auto_buf_reg(struct ublk_io * io,struct io_uring_cmd * cmd)2074 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2075 {
2076 io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2077
2078 if (io->buf.reserved0 || io->buf.reserved1)
2079 return -EINVAL;
2080
2081 if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2082 return -EINVAL;
2083 return 0;
2084 }
2085
ublk_handle_auto_buf_reg(struct ublk_io * io,struct io_uring_cmd * cmd,u16 * buf_idx)2086 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
2087 struct io_uring_cmd *cmd,
2088 u16 *buf_idx)
2089 {
2090 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
2091 io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
2092
2093 /*
2094 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
2095 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
2096 * `io_ring_ctx`.
2097 *
2098 * If this uring_cmd's io_ring_ctx isn't same with the
2099 * one for registering the buffer, it is ublk server's
2100 * responsibility for unregistering the buffer, otherwise
2101 * this ublk request gets stuck.
2102 */
2103 if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
2104 *buf_idx = io->buf.index;
2105 }
2106
2107 return ublk_set_auto_buf_reg(io, cmd);
2108 }
2109
2110 /* Once we return, `io->req` can't be used any more */
2111 static inline struct request *
ublk_fill_io_cmd(struct ublk_io * io,struct io_uring_cmd * cmd)2112 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
2113 {
2114 struct request *req = io->req;
2115
2116 io->cmd = cmd;
2117 io->flags |= UBLK_IO_FLAG_ACTIVE;
2118 /* now this cmd slot is owned by ublk driver */
2119 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
2120
2121 return req;
2122 }
2123
2124 static inline int
ublk_config_io_buf(const struct ublk_queue * ubq,struct ublk_io * io,struct io_uring_cmd * cmd,unsigned long buf_addr,u16 * buf_idx)2125 ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io,
2126 struct io_uring_cmd *cmd, unsigned long buf_addr,
2127 u16 *buf_idx)
2128 {
2129 if (ublk_support_auto_buf_reg(ubq))
2130 return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
2131
2132 io->addr = buf_addr;
2133 return 0;
2134 }
2135
ublk_prep_cancel(struct io_uring_cmd * cmd,unsigned int issue_flags,struct ublk_queue * ubq,unsigned int tag)2136 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
2137 unsigned int issue_flags,
2138 struct ublk_queue *ubq, unsigned int tag)
2139 {
2140 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2141
2142 /*
2143 * Safe to refer to @ubq since ublk_queue won't be died until its
2144 * commands are completed
2145 */
2146 pdu->ubq = ubq;
2147 pdu->tag = tag;
2148 io_uring_cmd_mark_cancelable(cmd, issue_flags);
2149 }
2150
ublk_io_release(void * priv)2151 static void ublk_io_release(void *priv)
2152 {
2153 struct request *rq = priv;
2154 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2155 struct ublk_io *io = &ubq->ios[rq->tag];
2156
2157 /*
2158 * task_registered_buffers may be 0 if buffers were registered off task
2159 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
2160 */
2161 if (current == io->task && io->task_registered_buffers)
2162 io->task_registered_buffers--;
2163 else
2164 ublk_put_req_ref(io, rq);
2165 }
2166
ublk_register_io_buf(struct io_uring_cmd * cmd,const struct ublk_queue * ubq,struct ublk_io * io,unsigned int index,unsigned int issue_flags)2167 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
2168 const struct ublk_queue *ubq,
2169 struct ublk_io *io,
2170 unsigned int index, unsigned int issue_flags)
2171 {
2172 struct ublk_device *ub = cmd->file->private_data;
2173 struct request *req;
2174 int ret;
2175
2176 if (!ublk_support_zero_copy(ubq))
2177 return -EINVAL;
2178
2179 req = __ublk_check_and_get_req(ub, ubq, io, 0);
2180 if (!req)
2181 return -EINVAL;
2182
2183 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2184 issue_flags);
2185 if (ret) {
2186 ublk_put_req_ref(io, req);
2187 return ret;
2188 }
2189
2190 return 0;
2191 }
2192
2193 static int
ublk_daemon_register_io_buf(struct io_uring_cmd * cmd,const struct ublk_queue * ubq,struct ublk_io * io,unsigned index,unsigned issue_flags)2194 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
2195 const struct ublk_queue *ubq, struct ublk_io *io,
2196 unsigned index, unsigned issue_flags)
2197 {
2198 unsigned new_registered_buffers;
2199 struct request *req = io->req;
2200 int ret;
2201
2202 /*
2203 * Ensure there are still references for ublk_sub_req_ref() to release.
2204 * If not, fall back on the thread-safe buffer registration.
2205 */
2206 new_registered_buffers = io->task_registered_buffers + 1;
2207 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
2208 return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
2209
2210 if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
2211 return -EINVAL;
2212
2213 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2214 issue_flags);
2215 if (ret)
2216 return ret;
2217
2218 io->task_registered_buffers = new_registered_buffers;
2219 return 0;
2220 }
2221
ublk_unregister_io_buf(struct io_uring_cmd * cmd,const struct ublk_device * ub,unsigned int index,unsigned int issue_flags)2222 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
2223 const struct ublk_device *ub,
2224 unsigned int index, unsigned int issue_flags)
2225 {
2226 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
2227 return -EINVAL;
2228
2229 return io_buffer_unregister_bvec(cmd, index, issue_flags);
2230 }
2231
ublk_check_fetch_buf(const struct ublk_queue * ubq,__u64 buf_addr)2232 static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
2233 {
2234 if (ublk_need_map_io(ubq)) {
2235 /*
2236 * FETCH_RQ has to provide IO buffer if NEED GET
2237 * DATA is not enabled
2238 */
2239 if (!buf_addr && !ublk_need_get_data(ubq))
2240 return -EINVAL;
2241 } else if (buf_addr) {
2242 /* User copy requires addr to be unset */
2243 return -EINVAL;
2244 }
2245 return 0;
2246 }
2247
ublk_fetch(struct io_uring_cmd * cmd,struct ublk_queue * ubq,struct ublk_io * io,__u64 buf_addr)2248 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
2249 struct ublk_io *io, __u64 buf_addr)
2250 {
2251 struct ublk_device *ub = ubq->dev;
2252 int ret = 0;
2253
2254 /*
2255 * When handling FETCH command for setting up ublk uring queue,
2256 * ub->mutex is the innermost lock, and we won't block for handling
2257 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
2258 */
2259 mutex_lock(&ub->mutex);
2260 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
2261 if (ublk_queue_ready(ubq)) {
2262 ret = -EBUSY;
2263 goto out;
2264 }
2265
2266 /* allow each command to be FETCHed at most once */
2267 if (io->flags & UBLK_IO_FLAG_ACTIVE) {
2268 ret = -EINVAL;
2269 goto out;
2270 }
2271
2272 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
2273
2274 ublk_fill_io_cmd(io, cmd);
2275 ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL);
2276 if (ret)
2277 goto out;
2278
2279 WRITE_ONCE(io->task, get_task_struct(current));
2280 ublk_mark_io_ready(ub, ubq);
2281 out:
2282 mutex_unlock(&ub->mutex);
2283 return ret;
2284 }
2285
ublk_check_commit_and_fetch(const struct ublk_queue * ubq,struct ublk_io * io,__u64 buf_addr)2286 static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
2287 struct ublk_io *io, __u64 buf_addr)
2288 {
2289 struct request *req = io->req;
2290
2291 if (ublk_need_map_io(ubq)) {
2292 /*
2293 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
2294 * NEED GET DATA is not enabled or it is Read IO.
2295 */
2296 if (!buf_addr && (!ublk_need_get_data(ubq) ||
2297 req_op(req) == REQ_OP_READ))
2298 return -EINVAL;
2299 } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
2300 /*
2301 * User copy requires addr to be unset when command is
2302 * not zone append
2303 */
2304 return -EINVAL;
2305 }
2306
2307 return 0;
2308 }
2309
ublk_need_complete_req(const struct ublk_queue * ubq,struct ublk_io * io)2310 static bool ublk_need_complete_req(const struct ublk_queue *ubq,
2311 struct ublk_io *io)
2312 {
2313 if (ublk_need_req_ref(ubq))
2314 return ublk_sub_req_ref(io);
2315 return true;
2316 }
2317
ublk_get_data(const struct ublk_queue * ubq,struct ublk_io * io,struct request * req)2318 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
2319 struct request *req)
2320 {
2321 /*
2322 * We have handled UBLK_IO_NEED_GET_DATA command,
2323 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
2324 * do the copy work.
2325 */
2326 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
2327 /* update iod->addr because ublksrv may have passed a new io buffer */
2328 ublk_get_iod(ubq, req->tag)->addr = io->addr;
2329 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
2330 __func__, ubq->q_id, req->tag, io->flags,
2331 ublk_get_iod(ubq, req->tag)->addr);
2332
2333 return ublk_start_io(ubq, req, io);
2334 }
2335
__ublk_ch_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags,const struct ublksrv_io_cmd * ub_cmd)2336 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
2337 unsigned int issue_flags,
2338 const struct ublksrv_io_cmd *ub_cmd)
2339 {
2340 u16 buf_idx = UBLK_INVALID_BUF_IDX;
2341 struct ublk_device *ub = cmd->file->private_data;
2342 struct ublk_queue *ubq;
2343 struct ublk_io *io;
2344 u32 cmd_op = cmd->cmd_op;
2345 unsigned tag = ub_cmd->tag;
2346 struct request *req;
2347 int ret;
2348 bool compl;
2349
2350 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
2351 __func__, cmd->cmd_op, ub_cmd->q_id, tag,
2352 ub_cmd->result);
2353
2354 ret = ublk_check_cmd_op(cmd_op);
2355 if (ret)
2356 goto out;
2357
2358 /*
2359 * io_buffer_unregister_bvec() doesn't access the ubq or io,
2360 * so no need to validate the q_id, tag, or task
2361 */
2362 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
2363 return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
2364 issue_flags);
2365
2366 ret = -EINVAL;
2367 if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
2368 goto out;
2369
2370 ubq = ublk_get_queue(ub, ub_cmd->q_id);
2371
2372 if (tag >= ubq->q_depth)
2373 goto out;
2374
2375 io = &ubq->ios[tag];
2376 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
2377 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
2378 ret = ublk_check_fetch_buf(ubq, ub_cmd->addr);
2379 if (ret)
2380 goto out;
2381 ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
2382 if (ret)
2383 goto out;
2384
2385 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2386 return -EIOCBQUEUED;
2387 }
2388
2389 if (READ_ONCE(io->task) != current) {
2390 /*
2391 * ublk_register_io_buf() accesses only the io's refcount,
2392 * so can be handled on any task
2393 */
2394 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
2395 return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
2396 issue_flags);
2397
2398 goto out;
2399 }
2400
2401 /* there is pending io cmd, something must be wrong */
2402 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
2403 ret = -EBUSY;
2404 goto out;
2405 }
2406
2407 /*
2408 * ensure that the user issues UBLK_IO_NEED_GET_DATA
2409 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
2410 */
2411 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
2412 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
2413 goto out;
2414
2415 switch (_IOC_NR(cmd_op)) {
2416 case UBLK_IO_REGISTER_IO_BUF:
2417 return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
2418 issue_flags);
2419 case UBLK_IO_COMMIT_AND_FETCH_REQ:
2420 ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr);
2421 if (ret)
2422 goto out;
2423 io->res = ub_cmd->result;
2424 req = ublk_fill_io_cmd(io, cmd);
2425 ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx);
2426 compl = ublk_need_complete_req(ubq, io);
2427
2428 /* can't touch 'ublk_io' any more */
2429 if (buf_idx != UBLK_INVALID_BUF_IDX)
2430 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
2431 if (req_op(req) == REQ_OP_ZONE_APPEND)
2432 req->__sector = ub_cmd->zone_append_lba;
2433 if (compl)
2434 __ublk_complete_rq(req);
2435
2436 if (ret)
2437 goto out;
2438 break;
2439 case UBLK_IO_NEED_GET_DATA:
2440 /*
2441 * ublk_get_data() may fail and fallback to requeue, so keep
2442 * uring_cmd active first and prepare for handling new requeued
2443 * request
2444 */
2445 req = ublk_fill_io_cmd(io, cmd);
2446 ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
2447 WARN_ON_ONCE(ret);
2448 if (likely(ublk_get_data(ubq, io, req))) {
2449 __ublk_prep_compl_io_cmd(io, req);
2450 return UBLK_IO_RES_OK;
2451 }
2452 break;
2453 default:
2454 goto out;
2455 }
2456 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2457 return -EIOCBQUEUED;
2458
2459 out:
2460 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
2461 __func__, cmd_op, tag, ret, io->flags);
2462 return ret;
2463 }
2464
__ublk_check_and_get_req(struct ublk_device * ub,const struct ublk_queue * ubq,struct ublk_io * io,size_t offset)2465 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
2466 const struct ublk_queue *ubq, struct ublk_io *io, size_t offset)
2467 {
2468 unsigned tag = io - ubq->ios;
2469 struct request *req;
2470
2471 /*
2472 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
2473 * which would overwrite it with io->cmd
2474 */
2475 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2476 if (!req)
2477 return NULL;
2478
2479 if (!ublk_get_req_ref(io))
2480 return NULL;
2481
2482 if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
2483 goto fail_put;
2484
2485 if (!ublk_rq_has_data(req))
2486 goto fail_put;
2487
2488 if (offset > blk_rq_bytes(req))
2489 goto fail_put;
2490
2491 return req;
2492 fail_put:
2493 ublk_put_req_ref(io, req);
2494 return NULL;
2495 }
2496
ublk_ch_uring_cmd_local(struct io_uring_cmd * cmd,unsigned int issue_flags)2497 static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
2498 unsigned int issue_flags)
2499 {
2500 /*
2501 * Not necessary for async retry, but let's keep it simple and always
2502 * copy the values to avoid any potential reuse.
2503 */
2504 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
2505 const struct ublksrv_io_cmd ub_cmd = {
2506 .q_id = READ_ONCE(ub_src->q_id),
2507 .tag = READ_ONCE(ub_src->tag),
2508 .result = READ_ONCE(ub_src->result),
2509 .addr = READ_ONCE(ub_src->addr)
2510 };
2511
2512 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
2513
2514 return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
2515 }
2516
ublk_ch_uring_cmd_cb(struct io_uring_cmd * cmd,unsigned int issue_flags)2517 static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
2518 unsigned int issue_flags)
2519 {
2520 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
2521
2522 if (ret != -EIOCBQUEUED)
2523 io_uring_cmd_done(cmd, ret, 0, issue_flags);
2524 }
2525
ublk_ch_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)2526 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
2527 {
2528 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
2529 ublk_uring_cmd_cancel_fn(cmd, issue_flags);
2530 return 0;
2531 }
2532
2533 /* well-implemented server won't run into unlocked */
2534 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
2535 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
2536 return -EIOCBQUEUED;
2537 }
2538
2539 return ublk_ch_uring_cmd_local(cmd, issue_flags);
2540 }
2541
ublk_check_ubuf_dir(const struct request * req,int ubuf_dir)2542 static inline bool ublk_check_ubuf_dir(const struct request *req,
2543 int ubuf_dir)
2544 {
2545 /* copy ubuf to request pages */
2546 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
2547 ubuf_dir == ITER_SOURCE)
2548 return true;
2549
2550 /* copy request pages to ubuf */
2551 if ((req_op(req) == REQ_OP_WRITE ||
2552 req_op(req) == REQ_OP_ZONE_APPEND) &&
2553 ubuf_dir == ITER_DEST)
2554 return true;
2555
2556 return false;
2557 }
2558
ublk_check_and_get_req(struct kiocb * iocb,struct iov_iter * iter,size_t * off,int dir,struct ublk_io ** io)2559 static struct request *ublk_check_and_get_req(struct kiocb *iocb,
2560 struct iov_iter *iter, size_t *off, int dir,
2561 struct ublk_io **io)
2562 {
2563 struct ublk_device *ub = iocb->ki_filp->private_data;
2564 struct ublk_queue *ubq;
2565 struct request *req;
2566 size_t buf_off;
2567 u16 tag, q_id;
2568
2569 if (!ub)
2570 return ERR_PTR(-EACCES);
2571
2572 if (!user_backed_iter(iter))
2573 return ERR_PTR(-EACCES);
2574
2575 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2576 return ERR_PTR(-EACCES);
2577
2578 tag = ublk_pos_to_tag(iocb->ki_pos);
2579 q_id = ublk_pos_to_hwq(iocb->ki_pos);
2580 buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
2581
2582 if (q_id >= ub->dev_info.nr_hw_queues)
2583 return ERR_PTR(-EINVAL);
2584
2585 ubq = ublk_get_queue(ub, q_id);
2586 if (!ubq)
2587 return ERR_PTR(-EINVAL);
2588
2589 if (!ublk_support_user_copy(ubq))
2590 return ERR_PTR(-EACCES);
2591
2592 if (tag >= ubq->q_depth)
2593 return ERR_PTR(-EINVAL);
2594
2595 *io = &ubq->ios[tag];
2596 req = __ublk_check_and_get_req(ub, ubq, *io, buf_off);
2597 if (!req)
2598 return ERR_PTR(-EINVAL);
2599
2600 if (!req->mq_hctx || !req->mq_hctx->driver_data)
2601 goto fail;
2602
2603 if (!ublk_check_ubuf_dir(req, dir))
2604 goto fail;
2605
2606 *off = buf_off;
2607 return req;
2608 fail:
2609 ublk_put_req_ref(*io, req);
2610 return ERR_PTR(-EACCES);
2611 }
2612
ublk_ch_read_iter(struct kiocb * iocb,struct iov_iter * to)2613 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
2614 {
2615 struct request *req;
2616 struct ublk_io *io;
2617 size_t buf_off;
2618 size_t ret;
2619
2620 req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io);
2621 if (IS_ERR(req))
2622 return PTR_ERR(req);
2623
2624 ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
2625 ublk_put_req_ref(io, req);
2626
2627 return ret;
2628 }
2629
ublk_ch_write_iter(struct kiocb * iocb,struct iov_iter * from)2630 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
2631 {
2632 struct request *req;
2633 struct ublk_io *io;
2634 size_t buf_off;
2635 size_t ret;
2636
2637 req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io);
2638 if (IS_ERR(req))
2639 return PTR_ERR(req);
2640
2641 ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
2642 ublk_put_req_ref(io, req);
2643
2644 return ret;
2645 }
2646
2647 static const struct file_operations ublk_ch_fops = {
2648 .owner = THIS_MODULE,
2649 .open = ublk_ch_open,
2650 .release = ublk_ch_release,
2651 .read_iter = ublk_ch_read_iter,
2652 .write_iter = ublk_ch_write_iter,
2653 .uring_cmd = ublk_ch_uring_cmd,
2654 .mmap = ublk_ch_mmap,
2655 };
2656
ublk_deinit_queue(struct ublk_device * ub,int q_id)2657 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
2658 {
2659 int size = ublk_queue_cmd_buf_size(ub, q_id);
2660 struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2661 int i;
2662
2663 for (i = 0; i < ubq->q_depth; i++) {
2664 struct ublk_io *io = &ubq->ios[i];
2665 if (io->task)
2666 put_task_struct(io->task);
2667 WARN_ON_ONCE(refcount_read(&io->ref));
2668 WARN_ON_ONCE(io->task_registered_buffers);
2669 }
2670
2671 if (ubq->io_cmd_buf)
2672 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
2673 }
2674
ublk_init_queue(struct ublk_device * ub,int q_id)2675 static int ublk_init_queue(struct ublk_device *ub, int q_id)
2676 {
2677 struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2678 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
2679 void *ptr;
2680 int size;
2681
2682 spin_lock_init(&ubq->cancel_lock);
2683 ubq->flags = ub->dev_info.flags;
2684 ubq->q_id = q_id;
2685 ubq->q_depth = ub->dev_info.queue_depth;
2686 size = ublk_queue_cmd_buf_size(ub, q_id);
2687
2688 ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
2689 if (!ptr)
2690 return -ENOMEM;
2691
2692 ubq->io_cmd_buf = ptr;
2693 ubq->dev = ub;
2694 return 0;
2695 }
2696
ublk_deinit_queues(struct ublk_device * ub)2697 static void ublk_deinit_queues(struct ublk_device *ub)
2698 {
2699 int nr_queues = ub->dev_info.nr_hw_queues;
2700 int i;
2701
2702 if (!ub->__queues)
2703 return;
2704
2705 for (i = 0; i < nr_queues; i++)
2706 ublk_deinit_queue(ub, i);
2707 kvfree(ub->__queues);
2708 }
2709
ublk_init_queues(struct ublk_device * ub)2710 static int ublk_init_queues(struct ublk_device *ub)
2711 {
2712 int nr_queues = ub->dev_info.nr_hw_queues;
2713 int depth = ub->dev_info.queue_depth;
2714 int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
2715 int i, ret = -ENOMEM;
2716
2717 ub->queue_size = ubq_size;
2718 ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
2719 if (!ub->__queues)
2720 return ret;
2721
2722 for (i = 0; i < nr_queues; i++) {
2723 if (ublk_init_queue(ub, i))
2724 goto fail;
2725 }
2726
2727 init_completion(&ub->completion);
2728 return 0;
2729
2730 fail:
2731 ublk_deinit_queues(ub);
2732 return ret;
2733 }
2734
ublk_alloc_dev_number(struct ublk_device * ub,int idx)2735 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
2736 {
2737 int i = idx;
2738 int err;
2739
2740 spin_lock(&ublk_idr_lock);
2741 /* allocate id, if @id >= 0, we're requesting that specific id */
2742 if (i >= 0) {
2743 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
2744 if (err == -ENOSPC)
2745 err = -EEXIST;
2746 } else {
2747 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
2748 GFP_NOWAIT);
2749 }
2750 spin_unlock(&ublk_idr_lock);
2751
2752 if (err >= 0)
2753 ub->ub_number = err;
2754
2755 return err;
2756 }
2757
ublk_free_dev_number(struct ublk_device * ub)2758 static void ublk_free_dev_number(struct ublk_device *ub)
2759 {
2760 spin_lock(&ublk_idr_lock);
2761 idr_remove(&ublk_index_idr, ub->ub_number);
2762 wake_up_all(&ublk_idr_wq);
2763 spin_unlock(&ublk_idr_lock);
2764 }
2765
ublk_cdev_rel(struct device * dev)2766 static void ublk_cdev_rel(struct device *dev)
2767 {
2768 struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
2769
2770 blk_mq_free_tag_set(&ub->tag_set);
2771 ublk_deinit_queues(ub);
2772 ublk_free_dev_number(ub);
2773 mutex_destroy(&ub->mutex);
2774 mutex_destroy(&ub->cancel_mutex);
2775 kfree(ub);
2776 }
2777
ublk_add_chdev(struct ublk_device * ub)2778 static int ublk_add_chdev(struct ublk_device *ub)
2779 {
2780 struct device *dev = &ub->cdev_dev;
2781 int minor = ub->ub_number;
2782 int ret;
2783
2784 dev->parent = ublk_misc.this_device;
2785 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
2786 dev->class = &ublk_chr_class;
2787 dev->release = ublk_cdev_rel;
2788 device_initialize(dev);
2789
2790 ret = dev_set_name(dev, "ublkc%d", minor);
2791 if (ret)
2792 goto fail;
2793
2794 cdev_init(&ub->cdev, &ublk_ch_fops);
2795 ret = cdev_device_add(&ub->cdev, dev);
2796 if (ret)
2797 goto fail;
2798
2799 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
2800 unprivileged_ublks_added++;
2801 return 0;
2802 fail:
2803 put_device(dev);
2804 return ret;
2805 }
2806
2807 /* align max io buffer size with PAGE_SIZE */
ublk_align_max_io_size(struct ublk_device * ub)2808 static void ublk_align_max_io_size(struct ublk_device *ub)
2809 {
2810 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
2811
2812 ub->dev_info.max_io_buf_bytes =
2813 round_down(max_io_bytes, PAGE_SIZE);
2814 }
2815
ublk_add_tag_set(struct ublk_device * ub)2816 static int ublk_add_tag_set(struct ublk_device *ub)
2817 {
2818 ub->tag_set.ops = &ublk_mq_ops;
2819 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
2820 ub->tag_set.queue_depth = ub->dev_info.queue_depth;
2821 ub->tag_set.numa_node = NUMA_NO_NODE;
2822 ub->tag_set.driver_data = ub;
2823 return blk_mq_alloc_tag_set(&ub->tag_set);
2824 }
2825
ublk_remove(struct ublk_device * ub)2826 static void ublk_remove(struct ublk_device *ub)
2827 {
2828 bool unprivileged;
2829
2830 ublk_stop_dev(ub);
2831 cdev_device_del(&ub->cdev, &ub->cdev_dev);
2832 unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
2833 ublk_put_device(ub);
2834
2835 if (unprivileged)
2836 unprivileged_ublks_added--;
2837 }
2838
ublk_get_device_from_id(int idx)2839 static struct ublk_device *ublk_get_device_from_id(int idx)
2840 {
2841 struct ublk_device *ub = NULL;
2842
2843 if (idx < 0)
2844 return NULL;
2845
2846 spin_lock(&ublk_idr_lock);
2847 ub = idr_find(&ublk_index_idr, idx);
2848 if (ub)
2849 ub = ublk_get_device(ub);
2850 spin_unlock(&ublk_idr_lock);
2851
2852 return ub;
2853 }
2854
ublk_ctrl_start_dev(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)2855 static int ublk_ctrl_start_dev(struct ublk_device *ub,
2856 const struct ublksrv_ctrl_cmd *header)
2857 {
2858 const struct ublk_param_basic *p = &ub->params.basic;
2859 int ublksrv_pid = (int)header->data[0];
2860 struct queue_limits lim = {
2861 .logical_block_size = 1 << p->logical_bs_shift,
2862 .physical_block_size = 1 << p->physical_bs_shift,
2863 .io_min = 1 << p->io_min_shift,
2864 .io_opt = 1 << p->io_opt_shift,
2865 .max_hw_sectors = p->max_sectors,
2866 .chunk_sectors = p->chunk_sectors,
2867 .virt_boundary_mask = p->virt_boundary_mask,
2868 .max_segments = USHRT_MAX,
2869 .max_segment_size = UINT_MAX,
2870 .dma_alignment = 3,
2871 };
2872 struct gendisk *disk;
2873 int ret = -EINVAL;
2874
2875 if (ublksrv_pid <= 0)
2876 return -EINVAL;
2877 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
2878 return -EINVAL;
2879
2880 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
2881 const struct ublk_param_discard *pd = &ub->params.discard;
2882
2883 lim.discard_alignment = pd->discard_alignment;
2884 lim.discard_granularity = pd->discard_granularity;
2885 lim.max_hw_discard_sectors = pd->max_discard_sectors;
2886 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
2887 lim.max_discard_segments = pd->max_discard_segments;
2888 }
2889
2890 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
2891 const struct ublk_param_zoned *p = &ub->params.zoned;
2892
2893 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
2894 return -EOPNOTSUPP;
2895
2896 lim.features |= BLK_FEAT_ZONED;
2897 lim.max_active_zones = p->max_active_zones;
2898 lim.max_open_zones = p->max_open_zones;
2899 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
2900 }
2901
2902 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
2903 lim.features |= BLK_FEAT_WRITE_CACHE;
2904 if (ub->params.basic.attrs & UBLK_ATTR_FUA)
2905 lim.features |= BLK_FEAT_FUA;
2906 }
2907
2908 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
2909 lim.features |= BLK_FEAT_ROTATIONAL;
2910
2911 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
2912 lim.dma_alignment = ub->params.dma.alignment;
2913
2914 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
2915 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
2916 lim.max_segment_size = ub->params.seg.max_segment_size;
2917 lim.max_segments = ub->params.seg.max_segments;
2918 }
2919
2920 if (wait_for_completion_interruptible(&ub->completion) != 0)
2921 return -EINTR;
2922
2923 if (ub->ublksrv_tgid != ublksrv_pid)
2924 return -EINVAL;
2925
2926 mutex_lock(&ub->mutex);
2927 if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
2928 test_bit(UB_STATE_USED, &ub->state)) {
2929 ret = -EEXIST;
2930 goto out_unlock;
2931 }
2932
2933 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
2934 if (IS_ERR(disk)) {
2935 ret = PTR_ERR(disk);
2936 goto out_unlock;
2937 }
2938 sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
2939 disk->fops = &ub_fops;
2940 disk->private_data = ub;
2941
2942 ub->dev_info.ublksrv_pid = ublksrv_pid;
2943 ub->ub_disk = disk;
2944
2945 ublk_apply_params(ub);
2946
2947 /* don't probe partitions if any daemon task is un-trusted */
2948 if (ub->unprivileged_daemons)
2949 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
2950
2951 ublk_get_device(ub);
2952 ub->dev_info.state = UBLK_S_DEV_LIVE;
2953
2954 if (ublk_dev_is_zoned(ub)) {
2955 ret = ublk_revalidate_disk_zones(ub);
2956 if (ret)
2957 goto out_put_cdev;
2958 }
2959
2960 ret = add_disk(disk);
2961 if (ret)
2962 goto out_put_cdev;
2963
2964 set_bit(UB_STATE_USED, &ub->state);
2965
2966 out_put_cdev:
2967 if (ret) {
2968 ublk_detach_disk(ub);
2969 ublk_put_device(ub);
2970 }
2971 if (ret)
2972 put_disk(disk);
2973 out_unlock:
2974 mutex_unlock(&ub->mutex);
2975 return ret;
2976 }
2977
ublk_ctrl_get_queue_affinity(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)2978 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
2979 const struct ublksrv_ctrl_cmd *header)
2980 {
2981 void __user *argp = (void __user *)(unsigned long)header->addr;
2982 cpumask_var_t cpumask;
2983 unsigned long queue;
2984 unsigned int retlen;
2985 unsigned int i;
2986 int ret;
2987
2988 if (header->len * BITS_PER_BYTE < nr_cpu_ids)
2989 return -EINVAL;
2990 if (header->len & (sizeof(unsigned long)-1))
2991 return -EINVAL;
2992 if (!header->addr)
2993 return -EINVAL;
2994
2995 queue = header->data[0];
2996 if (queue >= ub->dev_info.nr_hw_queues)
2997 return -EINVAL;
2998
2999 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
3000 return -ENOMEM;
3001
3002 for_each_possible_cpu(i) {
3003 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
3004 cpumask_set_cpu(i, cpumask);
3005 }
3006
3007 ret = -EFAULT;
3008 retlen = min_t(unsigned short, header->len, cpumask_size());
3009 if (copy_to_user(argp, cpumask, retlen))
3010 goto out_free_cpumask;
3011 if (retlen != header->len &&
3012 clear_user(argp + retlen, header->len - retlen))
3013 goto out_free_cpumask;
3014
3015 ret = 0;
3016 out_free_cpumask:
3017 free_cpumask_var(cpumask);
3018 return ret;
3019 }
3020
ublk_dump_dev_info(struct ublksrv_ctrl_dev_info * info)3021 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
3022 {
3023 pr_devel("%s: dev id %d flags %llx\n", __func__,
3024 info->dev_id, info->flags);
3025 pr_devel("\t nr_hw_queues %d queue_depth %d\n",
3026 info->nr_hw_queues, info->queue_depth);
3027 }
3028
ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd * header)3029 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
3030 {
3031 void __user *argp = (void __user *)(unsigned long)header->addr;
3032 struct ublksrv_ctrl_dev_info info;
3033 struct ublk_device *ub;
3034 int ret = -EINVAL;
3035
3036 if (header->len < sizeof(info) || !header->addr)
3037 return -EINVAL;
3038 if (header->queue_id != (u16)-1) {
3039 pr_warn("%s: queue_id is wrong %x\n",
3040 __func__, header->queue_id);
3041 return -EINVAL;
3042 }
3043
3044 if (copy_from_user(&info, argp, sizeof(info)))
3045 return -EFAULT;
3046
3047 if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
3048 info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
3049 return -EINVAL;
3050
3051 if (capable(CAP_SYS_ADMIN))
3052 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
3053 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
3054 return -EPERM;
3055
3056 /* forbid nonsense combinations of recovery flags */
3057 switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
3058 case 0:
3059 case UBLK_F_USER_RECOVERY:
3060 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
3061 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
3062 break;
3063 default:
3064 pr_warn("%s: invalid recovery flags %llx\n", __func__,
3065 info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
3066 return -EINVAL;
3067 }
3068
3069 if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
3070 pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
3071 return -EINVAL;
3072 }
3073
3074 /*
3075 * unprivileged device can't be trusted, but RECOVERY and
3076 * RECOVERY_REISSUE still may hang error handling, so can't
3077 * support recovery features for unprivileged ublk now
3078 *
3079 * TODO: provide forward progress for RECOVERY handler, so that
3080 * unprivileged device can benefit from it
3081 */
3082 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
3083 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
3084 UBLK_F_USER_RECOVERY);
3085
3086 /*
3087 * For USER_COPY, we depends on userspace to fill request
3088 * buffer by pwrite() to ublk char device, which can't be
3089 * used for unprivileged device
3090 *
3091 * Same with zero copy or auto buffer register.
3092 */
3093 if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3094 UBLK_F_AUTO_BUF_REG))
3095 return -EINVAL;
3096 }
3097
3098 /* the created device is always owned by current user */
3099 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
3100
3101 if (header->dev_id != info.dev_id) {
3102 pr_warn("%s: dev id not match %u %u\n",
3103 __func__, header->dev_id, info.dev_id);
3104 return -EINVAL;
3105 }
3106
3107 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
3108 pr_warn("%s: dev id is too large. Max supported is %d\n",
3109 __func__, UBLK_MAX_UBLKS - 1);
3110 return -EINVAL;
3111 }
3112
3113 ublk_dump_dev_info(&info);
3114
3115 ret = mutex_lock_killable(&ublk_ctl_mutex);
3116 if (ret)
3117 return ret;
3118
3119 ret = -EACCES;
3120 if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
3121 unprivileged_ublks_added >= unprivileged_ublks_max)
3122 goto out_unlock;
3123
3124 ret = -ENOMEM;
3125 ub = kzalloc(sizeof(*ub), GFP_KERNEL);
3126 if (!ub)
3127 goto out_unlock;
3128 mutex_init(&ub->mutex);
3129 spin_lock_init(&ub->lock);
3130 mutex_init(&ub->cancel_mutex);
3131
3132 ret = ublk_alloc_dev_number(ub, header->dev_id);
3133 if (ret < 0)
3134 goto out_free_ub;
3135
3136 memcpy(&ub->dev_info, &info, sizeof(info));
3137
3138 /* update device id */
3139 ub->dev_info.dev_id = ub->ub_number;
3140
3141 /*
3142 * 64bit flags will be copied back to userspace as feature
3143 * negotiation result, so have to clear flags which driver
3144 * doesn't support yet, then userspace can get correct flags
3145 * (features) to handle.
3146 */
3147 ub->dev_info.flags &= UBLK_F_ALL;
3148
3149 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
3150 UBLK_F_URING_CMD_COMP_IN_TASK |
3151 UBLK_F_PER_IO_DAEMON |
3152 UBLK_F_BUF_REG_OFF_DAEMON;
3153
3154 /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
3155 if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3156 UBLK_F_AUTO_BUF_REG))
3157 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
3158
3159 /*
3160 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
3161 * returning write_append_lba, which is only allowed in case of
3162 * user copy or zero copy
3163 */
3164 if (ublk_dev_is_zoned(ub) &&
3165 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
3166 (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
3167 ret = -EINVAL;
3168 goto out_free_dev_number;
3169 }
3170
3171 ub->dev_info.nr_hw_queues = min_t(unsigned int,
3172 ub->dev_info.nr_hw_queues, nr_cpu_ids);
3173 ublk_align_max_io_size(ub);
3174
3175 ret = ublk_init_queues(ub);
3176 if (ret)
3177 goto out_free_dev_number;
3178
3179 ret = ublk_add_tag_set(ub);
3180 if (ret)
3181 goto out_deinit_queues;
3182
3183 ret = -EFAULT;
3184 if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
3185 goto out_free_tag_set;
3186
3187 /*
3188 * Add the char dev so that ublksrv daemon can be setup.
3189 * ublk_add_chdev() will cleanup everything if it fails.
3190 */
3191 ret = ublk_add_chdev(ub);
3192 goto out_unlock;
3193
3194 out_free_tag_set:
3195 blk_mq_free_tag_set(&ub->tag_set);
3196 out_deinit_queues:
3197 ublk_deinit_queues(ub);
3198 out_free_dev_number:
3199 ublk_free_dev_number(ub);
3200 out_free_ub:
3201 mutex_destroy(&ub->mutex);
3202 mutex_destroy(&ub->cancel_mutex);
3203 kfree(ub);
3204 out_unlock:
3205 mutex_unlock(&ublk_ctl_mutex);
3206 return ret;
3207 }
3208
ublk_idr_freed(int id)3209 static inline bool ublk_idr_freed(int id)
3210 {
3211 void *ptr;
3212
3213 spin_lock(&ublk_idr_lock);
3214 ptr = idr_find(&ublk_index_idr, id);
3215 spin_unlock(&ublk_idr_lock);
3216
3217 return ptr == NULL;
3218 }
3219
ublk_ctrl_del_dev(struct ublk_device ** p_ub,bool wait)3220 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
3221 {
3222 struct ublk_device *ub = *p_ub;
3223 int idx = ub->ub_number;
3224 int ret;
3225
3226 ret = mutex_lock_killable(&ublk_ctl_mutex);
3227 if (ret)
3228 return ret;
3229
3230 if (!test_bit(UB_STATE_DELETED, &ub->state)) {
3231 ublk_remove(ub);
3232 set_bit(UB_STATE_DELETED, &ub->state);
3233 }
3234
3235 /* Mark the reference as consumed */
3236 *p_ub = NULL;
3237 ublk_put_device(ub);
3238 mutex_unlock(&ublk_ctl_mutex);
3239
3240 /*
3241 * Wait until the idr is removed, then it can be reused after
3242 * DEL_DEV command is returned.
3243 *
3244 * If we returns because of user interrupt, future delete command
3245 * may come:
3246 *
3247 * - the device number isn't freed, this device won't or needn't
3248 * be deleted again, since UB_STATE_DELETED is set, and device
3249 * will be released after the last reference is dropped
3250 *
3251 * - the device number is freed already, we will not find this
3252 * device via ublk_get_device_from_id()
3253 */
3254 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
3255 return -EINTR;
3256 return 0;
3257 }
3258
ublk_ctrl_cmd_dump(struct io_uring_cmd * cmd)3259 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
3260 {
3261 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3262
3263 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
3264 __func__, cmd->cmd_op, header->dev_id, header->queue_id,
3265 header->data[0], header->addr, header->len);
3266 }
3267
ublk_ctrl_stop_dev(struct ublk_device * ub)3268 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
3269 {
3270 ublk_stop_dev(ub);
3271 return 0;
3272 }
3273
ublk_ctrl_get_dev_info(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)3274 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
3275 const struct ublksrv_ctrl_cmd *header)
3276 {
3277 void __user *argp = (void __user *)(unsigned long)header->addr;
3278
3279 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
3280 return -EINVAL;
3281
3282 if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
3283 return -EFAULT;
3284
3285 return 0;
3286 }
3287
3288 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
ublk_ctrl_fill_params_devt(struct ublk_device * ub)3289 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
3290 {
3291 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
3292 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
3293
3294 if (ub->ub_disk) {
3295 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
3296 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
3297 } else {
3298 ub->params.devt.disk_major = 0;
3299 ub->params.devt.disk_minor = 0;
3300 }
3301 ub->params.types |= UBLK_PARAM_TYPE_DEVT;
3302 }
3303
ublk_ctrl_get_params(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)3304 static int ublk_ctrl_get_params(struct ublk_device *ub,
3305 const struct ublksrv_ctrl_cmd *header)
3306 {
3307 void __user *argp = (void __user *)(unsigned long)header->addr;
3308 struct ublk_params_header ph;
3309 int ret;
3310
3311 if (header->len <= sizeof(ph) || !header->addr)
3312 return -EINVAL;
3313
3314 if (copy_from_user(&ph, argp, sizeof(ph)))
3315 return -EFAULT;
3316
3317 if (ph.len > header->len || !ph.len)
3318 return -EINVAL;
3319
3320 if (ph.len > sizeof(struct ublk_params))
3321 ph.len = sizeof(struct ublk_params);
3322
3323 mutex_lock(&ub->mutex);
3324 ublk_ctrl_fill_params_devt(ub);
3325 if (copy_to_user(argp, &ub->params, ph.len))
3326 ret = -EFAULT;
3327 else
3328 ret = 0;
3329 mutex_unlock(&ub->mutex);
3330
3331 return ret;
3332 }
3333
ublk_ctrl_set_params(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)3334 static int ublk_ctrl_set_params(struct ublk_device *ub,
3335 const struct ublksrv_ctrl_cmd *header)
3336 {
3337 void __user *argp = (void __user *)(unsigned long)header->addr;
3338 struct ublk_params_header ph;
3339 int ret = -EFAULT;
3340
3341 if (header->len <= sizeof(ph) || !header->addr)
3342 return -EINVAL;
3343
3344 if (copy_from_user(&ph, argp, sizeof(ph)))
3345 return -EFAULT;
3346
3347 if (ph.len > header->len || !ph.len || !ph.types)
3348 return -EINVAL;
3349
3350 if (ph.len > sizeof(struct ublk_params))
3351 ph.len = sizeof(struct ublk_params);
3352
3353 mutex_lock(&ub->mutex);
3354 if (test_bit(UB_STATE_USED, &ub->state)) {
3355 /*
3356 * Parameters can only be changed when device hasn't
3357 * been started yet
3358 */
3359 ret = -EACCES;
3360 } else if (copy_from_user(&ub->params, argp, ph.len)) {
3361 ret = -EFAULT;
3362 } else {
3363 /* clear all we don't support yet */
3364 ub->params.types &= UBLK_PARAM_TYPE_ALL;
3365 ret = ublk_validate_params(ub);
3366 if (ret)
3367 ub->params.types = 0;
3368 }
3369 mutex_unlock(&ub->mutex);
3370
3371 return ret;
3372 }
3373
ublk_ctrl_start_recovery(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)3374 static int ublk_ctrl_start_recovery(struct ublk_device *ub,
3375 const struct ublksrv_ctrl_cmd *header)
3376 {
3377 int ret = -EINVAL;
3378
3379 mutex_lock(&ub->mutex);
3380 if (ublk_nosrv_should_stop_dev(ub))
3381 goto out_unlock;
3382 /*
3383 * START_RECOVERY is only allowd after:
3384 *
3385 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
3386 * and related io_uring ctx is freed so file struct of /dev/ublkcX is
3387 * released.
3388 *
3389 * and one of the following holds
3390 *
3391 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
3392 * (a)has quiesced request queue
3393 * (b)has requeued every inflight rqs whose io_flags is ACTIVE
3394 * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
3395 * (d)has completed/camceled all ioucmds owned by ther dying process
3396 *
3397 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
3398 * quiesced, but all I/O is being immediately errored
3399 */
3400 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
3401 ret = -EBUSY;
3402 goto out_unlock;
3403 }
3404 pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
3405 init_completion(&ub->completion);
3406 ret = 0;
3407 out_unlock:
3408 mutex_unlock(&ub->mutex);
3409 return ret;
3410 }
3411
ublk_ctrl_end_recovery(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)3412 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
3413 const struct ublksrv_ctrl_cmd *header)
3414 {
3415 int ublksrv_pid = (int)header->data[0];
3416 int ret = -EINVAL;
3417
3418 pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
3419 header->dev_id);
3420
3421 if (wait_for_completion_interruptible(&ub->completion))
3422 return -EINTR;
3423
3424 pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
3425 header->dev_id);
3426
3427 if (ub->ublksrv_tgid != ublksrv_pid)
3428 return -EINVAL;
3429
3430 mutex_lock(&ub->mutex);
3431 if (ublk_nosrv_should_stop_dev(ub))
3432 goto out_unlock;
3433
3434 if (!ublk_dev_in_recoverable_state(ub)) {
3435 ret = -EBUSY;
3436 goto out_unlock;
3437 }
3438 ub->dev_info.ublksrv_pid = ublksrv_pid;
3439 ub->dev_info.state = UBLK_S_DEV_LIVE;
3440 pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
3441 __func__, ublksrv_pid, header->dev_id);
3442 blk_mq_kick_requeue_list(ub->ub_disk->queue);
3443 ret = 0;
3444 out_unlock:
3445 mutex_unlock(&ub->mutex);
3446 return ret;
3447 }
3448
ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd * header)3449 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
3450 {
3451 void __user *argp = (void __user *)(unsigned long)header->addr;
3452 u64 features = UBLK_F_ALL;
3453
3454 if (header->len != UBLK_FEATURES_LEN || !header->addr)
3455 return -EINVAL;
3456
3457 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
3458 return -EFAULT;
3459
3460 return 0;
3461 }
3462
ublk_ctrl_set_size(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)3463 static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
3464 {
3465 struct ublk_param_basic *p = &ub->params.basic;
3466 u64 new_size = header->data[0];
3467
3468 mutex_lock(&ub->mutex);
3469 p->dev_sectors = new_size;
3470 set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
3471 mutex_unlock(&ub->mutex);
3472 }
3473
3474 struct count_busy {
3475 const struct ublk_queue *ubq;
3476 unsigned int nr_busy;
3477 };
3478
ublk_count_busy_req(struct request * rq,void * data)3479 static bool ublk_count_busy_req(struct request *rq, void *data)
3480 {
3481 struct count_busy *idle = data;
3482
3483 if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
3484 idle->nr_busy += 1;
3485 return true;
3486 }
3487
3488 /* uring_cmd is guaranteed to be active if the associated request is idle */
ubq_has_idle_io(const struct ublk_queue * ubq)3489 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
3490 {
3491 struct count_busy data = {
3492 .ubq = ubq,
3493 };
3494
3495 blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
3496 return data.nr_busy < ubq->q_depth;
3497 }
3498
3499 /* Wait until each hw queue has at least one idle IO */
ublk_wait_for_idle_io(struct ublk_device * ub,unsigned int timeout_ms)3500 static int ublk_wait_for_idle_io(struct ublk_device *ub,
3501 unsigned int timeout_ms)
3502 {
3503 unsigned int elapsed = 0;
3504 int ret;
3505
3506 while (elapsed < timeout_ms && !signal_pending(current)) {
3507 unsigned int queues_cancelable = 0;
3508 int i;
3509
3510 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
3511 struct ublk_queue *ubq = ublk_get_queue(ub, i);
3512
3513 queues_cancelable += !!ubq_has_idle_io(ubq);
3514 }
3515
3516 /*
3517 * Each queue needs at least one active command for
3518 * notifying ublk server
3519 */
3520 if (queues_cancelable == ub->dev_info.nr_hw_queues)
3521 break;
3522
3523 msleep(UBLK_REQUEUE_DELAY_MS);
3524 elapsed += UBLK_REQUEUE_DELAY_MS;
3525 }
3526
3527 if (signal_pending(current))
3528 ret = -EINTR;
3529 else if (elapsed >= timeout_ms)
3530 ret = -EBUSY;
3531 else
3532 ret = 0;
3533
3534 return ret;
3535 }
3536
ublk_ctrl_quiesce_dev(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)3537 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
3538 const struct ublksrv_ctrl_cmd *header)
3539 {
3540 /* zero means wait forever */
3541 u64 timeout_ms = header->data[0];
3542 struct gendisk *disk;
3543 int ret = -ENODEV;
3544
3545 if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
3546 return -EOPNOTSUPP;
3547
3548 mutex_lock(&ub->mutex);
3549 disk = ublk_get_disk(ub);
3550 if (!disk)
3551 goto unlock;
3552 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
3553 goto put_disk;
3554
3555 ret = 0;
3556 /* already in expected state */
3557 if (ub->dev_info.state != UBLK_S_DEV_LIVE)
3558 goto put_disk;
3559
3560 /* Mark the device as canceling */
3561 mutex_lock(&ub->cancel_mutex);
3562 blk_mq_quiesce_queue(disk->queue);
3563 ublk_set_canceling(ub, true);
3564 blk_mq_unquiesce_queue(disk->queue);
3565 mutex_unlock(&ub->cancel_mutex);
3566
3567 if (!timeout_ms)
3568 timeout_ms = UINT_MAX;
3569 ret = ublk_wait_for_idle_io(ub, timeout_ms);
3570
3571 put_disk:
3572 ublk_put_disk(disk);
3573 unlock:
3574 mutex_unlock(&ub->mutex);
3575
3576 /* Cancel pending uring_cmd */
3577 if (!ret)
3578 ublk_cancel_dev(ub);
3579 return ret;
3580 }
3581
3582 /*
3583 * All control commands are sent via /dev/ublk-control, so we have to check
3584 * the destination device's permission
3585 */
ublk_char_dev_permission(struct ublk_device * ub,const char * dev_path,int mask)3586 static int ublk_char_dev_permission(struct ublk_device *ub,
3587 const char *dev_path, int mask)
3588 {
3589 int err;
3590 struct path path;
3591 struct kstat stat;
3592
3593 err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
3594 if (err)
3595 return err;
3596
3597 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
3598 if (err)
3599 goto exit;
3600
3601 err = -EPERM;
3602 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
3603 goto exit;
3604
3605 err = inode_permission(&nop_mnt_idmap,
3606 d_backing_inode(path.dentry), mask);
3607 exit:
3608 path_put(&path);
3609 return err;
3610 }
3611
ublk_ctrl_uring_cmd_permission(struct ublk_device * ub,struct io_uring_cmd * cmd)3612 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
3613 struct io_uring_cmd *cmd)
3614 {
3615 struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
3616 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
3617 void __user *argp = (void __user *)(unsigned long)header->addr;
3618 char *dev_path = NULL;
3619 int ret = 0;
3620 int mask;
3621
3622 if (!unprivileged) {
3623 if (!capable(CAP_SYS_ADMIN))
3624 return -EPERM;
3625 /*
3626 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
3627 * char_dev_path in payload too, since userspace may not
3628 * know if the specified device is created as unprivileged
3629 * mode.
3630 */
3631 if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
3632 return 0;
3633 }
3634
3635 /*
3636 * User has to provide the char device path for unprivileged ublk
3637 *
3638 * header->addr always points to the dev path buffer, and
3639 * header->dev_path_len records length of dev path buffer.
3640 */
3641 if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
3642 return -EINVAL;
3643
3644 if (header->len < header->dev_path_len)
3645 return -EINVAL;
3646
3647 dev_path = memdup_user_nul(argp, header->dev_path_len);
3648 if (IS_ERR(dev_path))
3649 return PTR_ERR(dev_path);
3650
3651 ret = -EINVAL;
3652 switch (_IOC_NR(cmd->cmd_op)) {
3653 case UBLK_CMD_GET_DEV_INFO:
3654 case UBLK_CMD_GET_DEV_INFO2:
3655 case UBLK_CMD_GET_QUEUE_AFFINITY:
3656 case UBLK_CMD_GET_PARAMS:
3657 case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
3658 mask = MAY_READ;
3659 break;
3660 case UBLK_CMD_START_DEV:
3661 case UBLK_CMD_STOP_DEV:
3662 case UBLK_CMD_ADD_DEV:
3663 case UBLK_CMD_DEL_DEV:
3664 case UBLK_CMD_SET_PARAMS:
3665 case UBLK_CMD_START_USER_RECOVERY:
3666 case UBLK_CMD_END_USER_RECOVERY:
3667 case UBLK_CMD_UPDATE_SIZE:
3668 case UBLK_CMD_QUIESCE_DEV:
3669 mask = MAY_READ | MAY_WRITE;
3670 break;
3671 default:
3672 goto exit;
3673 }
3674
3675 ret = ublk_char_dev_permission(ub, dev_path, mask);
3676 if (!ret) {
3677 header->len -= header->dev_path_len;
3678 header->addr += header->dev_path_len;
3679 }
3680 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
3681 __func__, ub->ub_number, cmd->cmd_op,
3682 ub->dev_info.owner_uid, ub->dev_info.owner_gid,
3683 dev_path, ret);
3684 exit:
3685 kfree(dev_path);
3686 return ret;
3687 }
3688
ublk_ctrl_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)3689 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
3690 unsigned int issue_flags)
3691 {
3692 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3693 struct ublk_device *ub = NULL;
3694 u32 cmd_op = cmd->cmd_op;
3695 int ret = -EINVAL;
3696
3697 if (issue_flags & IO_URING_F_NONBLOCK)
3698 return -EAGAIN;
3699
3700 ublk_ctrl_cmd_dump(cmd);
3701
3702 if (!(issue_flags & IO_URING_F_SQE128))
3703 goto out;
3704
3705 ret = ublk_check_cmd_op(cmd_op);
3706 if (ret)
3707 goto out;
3708
3709 if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
3710 ret = ublk_ctrl_get_features(header);
3711 goto out;
3712 }
3713
3714 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
3715 ret = -ENODEV;
3716 ub = ublk_get_device_from_id(header->dev_id);
3717 if (!ub)
3718 goto out;
3719
3720 ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
3721 if (ret)
3722 goto put_dev;
3723 }
3724
3725 switch (_IOC_NR(cmd_op)) {
3726 case UBLK_CMD_START_DEV:
3727 ret = ublk_ctrl_start_dev(ub, header);
3728 break;
3729 case UBLK_CMD_STOP_DEV:
3730 ret = ublk_ctrl_stop_dev(ub);
3731 break;
3732 case UBLK_CMD_GET_DEV_INFO:
3733 case UBLK_CMD_GET_DEV_INFO2:
3734 ret = ublk_ctrl_get_dev_info(ub, header);
3735 break;
3736 case UBLK_CMD_ADD_DEV:
3737 ret = ublk_ctrl_add_dev(header);
3738 break;
3739 case UBLK_CMD_DEL_DEV:
3740 ret = ublk_ctrl_del_dev(&ub, true);
3741 break;
3742 case UBLK_CMD_DEL_DEV_ASYNC:
3743 ret = ublk_ctrl_del_dev(&ub, false);
3744 break;
3745 case UBLK_CMD_GET_QUEUE_AFFINITY:
3746 ret = ublk_ctrl_get_queue_affinity(ub, header);
3747 break;
3748 case UBLK_CMD_GET_PARAMS:
3749 ret = ublk_ctrl_get_params(ub, header);
3750 break;
3751 case UBLK_CMD_SET_PARAMS:
3752 ret = ublk_ctrl_set_params(ub, header);
3753 break;
3754 case UBLK_CMD_START_USER_RECOVERY:
3755 ret = ublk_ctrl_start_recovery(ub, header);
3756 break;
3757 case UBLK_CMD_END_USER_RECOVERY:
3758 ret = ublk_ctrl_end_recovery(ub, header);
3759 break;
3760 case UBLK_CMD_UPDATE_SIZE:
3761 ublk_ctrl_set_size(ub, header);
3762 ret = 0;
3763 break;
3764 case UBLK_CMD_QUIESCE_DEV:
3765 ret = ublk_ctrl_quiesce_dev(ub, header);
3766 break;
3767 default:
3768 ret = -EOPNOTSUPP;
3769 break;
3770 }
3771
3772 put_dev:
3773 if (ub)
3774 ublk_put_device(ub);
3775 out:
3776 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
3777 __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
3778 return ret;
3779 }
3780
3781 static const struct file_operations ublk_ctl_fops = {
3782 .open = nonseekable_open,
3783 .uring_cmd = ublk_ctrl_uring_cmd,
3784 .owner = THIS_MODULE,
3785 .llseek = noop_llseek,
3786 };
3787
3788 static struct miscdevice ublk_misc = {
3789 .minor = MISC_DYNAMIC_MINOR,
3790 .name = "ublk-control",
3791 .fops = &ublk_ctl_fops,
3792 };
3793
ublk_init(void)3794 static int __init ublk_init(void)
3795 {
3796 int ret;
3797
3798 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
3799 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
3800 BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
3801
3802 init_waitqueue_head(&ublk_idr_wq);
3803
3804 ret = misc_register(&ublk_misc);
3805 if (ret)
3806 return ret;
3807
3808 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
3809 if (ret)
3810 goto unregister_mis;
3811
3812 ret = class_register(&ublk_chr_class);
3813 if (ret)
3814 goto free_chrdev_region;
3815
3816 return 0;
3817
3818 free_chrdev_region:
3819 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3820 unregister_mis:
3821 misc_deregister(&ublk_misc);
3822 return ret;
3823 }
3824
ublk_exit(void)3825 static void __exit ublk_exit(void)
3826 {
3827 struct ublk_device *ub;
3828 int id;
3829
3830 idr_for_each_entry(&ublk_index_idr, ub, id)
3831 ublk_remove(ub);
3832
3833 class_unregister(&ublk_chr_class);
3834 misc_deregister(&ublk_misc);
3835
3836 idr_destroy(&ublk_index_idr);
3837 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3838 }
3839
3840 module_init(ublk_init);
3841 module_exit(ublk_exit);
3842
ublk_set_max_unprivileged_ublks(const char * buf,const struct kernel_param * kp)3843 static int ublk_set_max_unprivileged_ublks(const char *buf,
3844 const struct kernel_param *kp)
3845 {
3846 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
3847 }
3848
ublk_get_max_unprivileged_ublks(char * buf,const struct kernel_param * kp)3849 static int ublk_get_max_unprivileged_ublks(char *buf,
3850 const struct kernel_param *kp)
3851 {
3852 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
3853 }
3854
3855 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
3856 .set = ublk_set_max_unprivileged_ublks,
3857 .get = ublk_get_max_unprivileged_ublks,
3858 };
3859
3860 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
3861 &unprivileged_ublks_max, 0644);
3862 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");
3863
3864 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
3865 MODULE_DESCRIPTION("Userspace block device");
3866 MODULE_LICENSE("GPL");
3867