1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
 * Userspace block device - block device whose IO is handled from userspace
 *
 * Make full use of io_uring passthrough commands for communicating with
 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7 *
8 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9 *
10 * (part of code stolen from loop.c)
11 */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <uapi/linux/fs.h>
50 #include <uapi/linux/ublk_cmd.h>
51
52 #define UBLK_MINORS (1U << MINORBITS)
53
54 #define UBLK_INVALID_BUF_IDX ((u16)-1)
55
56 /* private ioctl command mirror */
57 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
58 #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
59 #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
60 #define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
61
62 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
63 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
64
65 /* All UBLK_F_* have to be included into UBLK_F_ALL */
66 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
67 | UBLK_F_URING_CMD_COMP_IN_TASK \
68 | UBLK_F_NEED_GET_DATA \
69 | UBLK_F_USER_RECOVERY \
70 | UBLK_F_USER_RECOVERY_REISSUE \
71 | UBLK_F_UNPRIVILEGED_DEV \
72 | UBLK_F_CMD_IOCTL_ENCODE \
73 | UBLK_F_USER_COPY \
74 | UBLK_F_ZONED \
75 | UBLK_F_USER_RECOVERY_FAIL_IO \
76 | UBLK_F_UPDATE_SIZE \
77 | UBLK_F_AUTO_BUF_REG \
78 | UBLK_F_QUIESCE \
79 | UBLK_F_PER_IO_DAEMON \
80 | UBLK_F_BUF_REG_OFF_DAEMON \
81 | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
82 | UBLK_F_SAFE_STOP_DEV \
83 | UBLK_F_BATCH_IO \
84 | UBLK_F_NO_AUTO_PART_SCAN)
85
86 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
87 | UBLK_F_USER_RECOVERY_REISSUE \
88 | UBLK_F_USER_RECOVERY_FAIL_IO)
89
90 /* All UBLK_PARAM_TYPE_* should be included here */
91 #define UBLK_PARAM_TYPE_ALL \
92 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
93 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
94 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
95 UBLK_PARAM_TYPE_INTEGRITY)
96
97 #define UBLK_BATCH_F_ALL \
98 (UBLK_BATCH_F_HAS_ZONE_LBA | \
99 UBLK_BATCH_F_HAS_BUF_ADDR | \
100 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
101
/*
 * ublk batch fetch uring_cmd
 *
 * Bookkeeping for one batch-fetch io_uring command; allocated by
 * ublk_batch_alloc_fcmd() and released by ublk_batch_free_fcmd().
 */
struct ublk_batch_fetch_cmd {
	/* list linkage; removed with list_del_init() when the command is torn down */
	struct list_head node;
	/* the io_uring command this entry wraps */
	struct io_uring_cmd *cmd;
	/* snapshot of cmd->sqe->buf_index taken at allocation time */
	unsigned short buf_group;
};
108
/* ublk private data carried with an io_uring command */
struct ublk_uring_cmd_pdu {
	/*
	 * Store requests in same batch temporarily for queuing them to
	 * daemon context.
	 *
	 * It should have been stored to request payload, but we do want
	 * to avoid extra pre-allocation, and uring_cmd payload is always
	 * free for us
	 */
	union {
		struct request *req;
		struct request *req_list;
	};

	/*
	 * The following two are valid for the whole lifetime of this cmd,
	 * and are set up in the ublk uring_cmd handler
	 */
	struct ublk_queue *ubq;

	union {
		u16 tag;
		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
	};
};
134
/*
 * Context handed to the batch IO helpers (e.g. ublk_batch_dispatch() and
 * ublk_batch_deinit_fetch_buf()).
 */
struct ublk_batch_io_data {
	struct ublk_device *ub;
	struct io_uring_cmd *cmd;
	struct ublk_batch_io header;
	/* forwarded to io_uring_cmd_done() when a fetch command is completed */
	unsigned int issue_flags;
	struct io_comp_batch *iob;
};
142
143 /*
144 * io command is active: sqe cmd is received, and its cqe isn't done
145 *
146 * If the flag is set, the io command is owned by ublk driver, and waited
147 * for incoming blk-mq request from the ublk block device.
148 *
149 * If the flag is cleared, the io command will be completed, and owned by
150 * ublk server.
151 */
152 #define UBLK_IO_FLAG_ACTIVE 0x01
153
154 /*
155 * IO command is completed via cqe, and it is being handled by ublksrv, and
156 * not committed yet
157 *
 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so the two flags
 * can be used to cross-check each other
160 */
161 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
162
163 /*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command needs to
 * get the data buffer address from ublksrv.
166 *
167 * Then, bio data could be copied into this data buffer for a WRITE request
168 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
169 */
170 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
171
172 /*
173 * request buffer is registered automatically, so we have to unregister it
174 * before completing this request.
175 *
176 * io_uring will unregister buffer automatically for us during exiting.
177 */
178 #define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
179
180 /* atomic RW with ubq->cancel_lock */
181 #define UBLK_IO_FLAG_CANCELED 0x80000000
182
183 /*
184 * Initialize refcount to a large number to include any registered buffers.
185 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
186 * any buffers registered on the io daemon task.
187 */
188 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
189
190 /* used for UBLK_F_BATCH_IO only */
191 #define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1)
192
/*
 * Per-IO buffer description: either a plain 64-bit buffer address or an
 * auto buffer registration descriptor, sharing the same storage.
 */
union ublk_io_buf {
	__u64 addr;
	struct ublk_auto_buf_reg auto_reg;
};
197
/* Per-tag IO slot; one entry per queue depth slot in ublk_queue::ios[] */
struct ublk_io {
	union ublk_io_buf buf;
	/* UBLK_IO_FLAG_* bits */
	unsigned int flags;
	int res;

	union {
		/* valid if UBLK_IO_FLAG_ACTIVE is set */
		struct io_uring_cmd *cmd;
		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
		struct request *req;
	};

	struct task_struct *task;

	/*
	 * The number of uses of this I/O by the ublk server
	 * if user copy or zero copy are enabled:
	 * - UBLK_REFCOUNT_INIT from dispatch to the server
	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
	 * - 1 for each io_uring registered buffer not registered on task
	 * The I/O can only be completed once all references are dropped.
	 * User copy and buffer registration operations are only permitted
	 * if the reference count is nonzero.
	 */
	refcount_t ref;
	/* Count of buffers registered on task and not yet unregistered */
	unsigned task_registered_buffers;

	void *buf_ctx_handle;
	/* taken/released through ublk_io_lock()/ublk_io_unlock() */
	spinlock_t lock;
} ____cacheline_aligned_in_smp;
230
/* Per-queue state, followed by q_depth ublk_io slots */
struct ublk_queue {
	int q_id;
	int q_depth;

	/* UBLK_F_* feature bits, tested by the ublk_support_*() helpers */
	unsigned long flags;
	/* array of io descriptors, indexed by tag (see ublk_get_iod()) */
	struct ublksrv_io_desc *io_cmd_buf;

	bool force_abort;
	bool canceling;
	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
	spinlock_t cancel_lock;
	struct ublk_device *dev;
	u32 nr_io_ready;

	/*
	 * For supporting UBLK_F_BATCH_IO only.
	 *
	 * Inflight ublk request tags are saved in this fifo
	 *
	 * There are multiple writers from ublk_queue_rq() or ublk_queue_rqs(),
	 * so the lock is required for storing a request tag to the fifo
	 *
	 * Make sure there is just one reader fetching requests from the task
	 * work function for the ublk server, so there is no need to grab the
	 * lock on the reader side.
	 *
	 * Batch I/O State Management:
	 *
	 * The batch I/O system uses implicit state management based on the
	 * combination of three key variables below.
	 *
	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
	 *   No fetch commands available, events queue in evts_fifo
	 *
	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
	 *   Fetch commands available but none processing events
	 *
	 * - ACTIVE: active_fcmd
	 *   One fetch command actively processing events from evts_fifo
	 *
	 * Key Invariants:
	 * - At most one active_fcmd at any time (single reader)
	 * - active_fcmd is always from fcmd_head list when non-NULL
	 * - evts_fifo can be read locklessly by the single active reader
	 * - All state transitions require evts_lock protection
	 * - Multiple writers to evts_fifo require lock protection
	 */
	struct {
		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
		spinlock_t evts_lock;

		/* List of fetch commands available to process events */
		struct list_head fcmd_head;

		/* Currently active fetch command (NULL = none active) */
		struct ublk_batch_fetch_cmd *active_fcmd;
	}____cacheline_aligned_in_smp;

	struct ublk_io ios[] __counted_by(q_depth);
};
291
/* Per-device state; the trailing flexible array holds the queue pointers */
struct ublk_device {
	struct gendisk *ub_disk;

	/* device info; dev_info.flags carries the UBLK_F_* feature bits */
	struct ublksrv_ctrl_dev_info dev_info;

	struct blk_mq_tag_set tag_set;

	struct cdev cdev;
	struct device cdev_dev;

	/* NOTE(review): presumably bit numbers for ->state below - confirm */
#define UB_STATE_OPEN 0
#define UB_STATE_USED 1
#define UB_STATE_DELETED 2
	unsigned long state;
	int ub_number;

	struct mutex mutex;

	spinlock_t lock;
	struct mm_struct *mm;

	/* device parameters, checked by ublk_validate_params() */
	struct ublk_params params;

	struct completion completion;
	u32 nr_queue_ready;
	bool unprivileged_daemons;
	struct mutex cancel_mutex;
	bool canceling;
	pid_t ublksrv_tgid;
	struct delayed_work exit_work;
	struct work_struct partition_scan_work;

	bool block_open; /* protected by open_mutex */

	struct ublk_queue *queues[];
};
328
/* header of ublk_params: total length followed by the UBLK_PARAM_TYPE_* mask */
struct ublk_params_header {
	__u32 len;
	__u32 types;
};
334
335 static void ublk_io_release(void *priv);
336 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
337 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
338 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
339 u16 q_id, u16 tag, struct ublk_io *io);
340 static inline unsigned int ublk_req_build_flags(struct request *req);
341 static void ublk_batch_dispatch(struct ublk_queue *ubq,
342 const struct ublk_batch_io_data *data,
343 struct ublk_batch_fetch_cmd *fcmd);
344
ublk_dev_support_batch_io(const struct ublk_device * ub)345 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
346 {
347 return ub->dev_info.flags & UBLK_F_BATCH_IO;
348 }
349
ublk_support_batch_io(const struct ublk_queue * ubq)350 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
351 {
352 return ubq->flags & UBLK_F_BATCH_IO;
353 }
354
ublk_io_lock(struct ublk_io * io)355 static inline void ublk_io_lock(struct ublk_io *io)
356 {
357 spin_lock(&io->lock);
358 }
359
ublk_io_unlock(struct ublk_io * io)360 static inline void ublk_io_unlock(struct ublk_io *io)
361 {
362 spin_unlock(&io->lock);
363 }
364
365 /* Initialize the event queue */
ublk_io_evts_init(struct ublk_queue * q,unsigned int size,int numa_node)366 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
367 int numa_node)
368 {
369 spin_lock_init(&q->evts_lock);
370 return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
371 }
372
373 /* Check if event queue is empty */
ublk_io_evts_empty(const struct ublk_queue * q)374 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
375 {
376 return kfifo_is_empty(&q->evts_fifo);
377 }
378
ublk_io_evts_deinit(struct ublk_queue * q)379 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
380 {
381 WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
382 kfifo_free(&q->evts_fifo);
383 }
384
385 static inline struct ublksrv_io_desc *
ublk_get_iod(const struct ublk_queue * ubq,unsigned tag)386 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
387 {
388 return &ubq->io_cmd_buf[tag];
389 }
390
ublk_support_zero_copy(const struct ublk_queue * ubq)391 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
392 {
393 return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
394 }
395
ublk_dev_support_zero_copy(const struct ublk_device * ub)396 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
397 {
398 return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
399 }
400
ublk_support_auto_buf_reg(const struct ublk_queue * ubq)401 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
402 {
403 return ubq->flags & UBLK_F_AUTO_BUF_REG;
404 }
405
ublk_dev_support_auto_buf_reg(const struct ublk_device * ub)406 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
407 {
408 return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
409 }
410
ublk_support_user_copy(const struct ublk_queue * ubq)411 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
412 {
413 return ubq->flags & UBLK_F_USER_COPY;
414 }
415
ublk_dev_support_user_copy(const struct ublk_device * ub)416 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
417 {
418 return ub->dev_info.flags & UBLK_F_USER_COPY;
419 }
420
ublk_dev_is_zoned(const struct ublk_device * ub)421 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
422 {
423 return ub->dev_info.flags & UBLK_F_ZONED;
424 }
425
ublk_queue_is_zoned(const struct ublk_queue * ubq)426 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
427 {
428 return ubq->flags & UBLK_F_ZONED;
429 }
430
ublk_dev_support_integrity(const struct ublk_device * ub)431 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
432 {
433 return ub->dev_info.flags & UBLK_F_INTEGRITY;
434 }
435
436 #ifdef CONFIG_BLK_DEV_ZONED
437
/*
 * Side-band description of a zone report request.  It is attached to a
 * request through the ublk_zoned_report_descs xarray (keyed by the request
 * pointer) and consumed by ublk_setup_iod_zoned().
 */
struct ublk_zoned_report_desc {
	/* start sector of the report */
	__u64 sector;
	/* UBLK_IO_OP_* operation code */
	__u32 operation;
	/* number of zones asked for in this request */
	__u32 nr_zones;
};
443
444 static DEFINE_XARRAY(ublk_zoned_report_descs);
445
ublk_zoned_insert_report_desc(const struct request * req,struct ublk_zoned_report_desc * desc)446 static int ublk_zoned_insert_report_desc(const struct request *req,
447 struct ublk_zoned_report_desc *desc)
448 {
449 return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
450 desc, GFP_KERNEL);
451 }
452
ublk_zoned_erase_report_desc(const struct request * req)453 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
454 const struct request *req)
455 {
456 return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
457 }
458
ublk_zoned_get_report_desc(const struct request * req)459 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
460 const struct request *req)
461 {
462 return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
463 }
464
ublk_get_nr_zones(const struct ublk_device * ub)465 static int ublk_get_nr_zones(const struct ublk_device *ub)
466 {
467 const struct ublk_param_basic *p = &ub->params.basic;
468
469 /* Zone size is a power of 2 */
470 return p->dev_sectors >> ilog2(p->chunk_sectors);
471 }
472
ublk_revalidate_disk_zones(struct ublk_device * ub)473 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
474 {
475 return blk_revalidate_disk_zones(ub->ub_disk);
476 }
477
ublk_dev_param_zoned_validate(const struct ublk_device * ub)478 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
479 {
480 const struct ublk_param_zoned *p = &ub->params.zoned;
481 int nr_zones;
482
483 if (!ublk_dev_is_zoned(ub))
484 return -EINVAL;
485
486 if (!p->max_zone_append_sectors)
487 return -EINVAL;
488
489 nr_zones = ublk_get_nr_zones(ub);
490
491 if (p->max_active_zones > nr_zones)
492 return -EINVAL;
493
494 if (p->max_open_zones > nr_zones)
495 return -EINVAL;
496
497 return 0;
498 }
499
ublk_dev_param_zoned_apply(struct ublk_device * ub)500 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
501 {
502 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
503 }
504
505 /* Based on virtblk_alloc_report_buffer */
ublk_alloc_report_buffer(struct ublk_device * ublk,unsigned int nr_zones,size_t * buflen)506 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
507 unsigned int nr_zones, size_t *buflen)
508 {
509 struct request_queue *q = ublk->ub_disk->queue;
510 size_t bufsize;
511 void *buf;
512
513 nr_zones = min_t(unsigned int, nr_zones,
514 ublk->ub_disk->nr_zones);
515
516 bufsize = nr_zones * sizeof(struct blk_zone);
517 bufsize =
518 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
519
520 while (bufsize >= sizeof(struct blk_zone)) {
521 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
522 if (buf) {
523 *buflen = bufsize;
524 return buf;
525 }
526 bufsize >>= 1;
527 }
528
529 *buflen = 0;
530 return NULL;
531 }
532
/*
 * Report up to @nr_zones zones starting at @sector.
 *
 * The report is obtained from the ublk server with REQ_OP_DRV_IN requests
 * carrying a UBLK_IO_OP_REPORT_ZONES descriptor.  When the bounce buffer
 * cannot hold all requested zones, several requests are issued back to
 * back, each one continuing at the sector following the last reported zone.
 *
 * Returns the number of zones reported, or a negative errno.
 */
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct ublk_device *ub = disk->private_data;
	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
	unsigned int done_zones = 0;
	unsigned int max_zones_per_request;
	int ret;
	struct blk_zone *buffer;
	size_t buffer_length;

	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
			 nr_zones);

	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
	if (!buffer)
		return -ENOMEM;

	max_zones_per_request = buffer_length / sizeof(struct blk_zone);

	while (done_zones < nr_zones) {
		unsigned int remaining_zones = nr_zones - done_zones;
		unsigned int zones_in_request =
			min_t(unsigned int, remaining_zones, max_zones_per_request);
		unsigned int reported = 0;
		struct request *req;
		struct ublk_zoned_report_desc desc;
		blk_status_t status;

		/* zero-length entries mark the end of a (short) response */
		memset(buffer, 0, buffer_length);

		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			goto out;
		}

		desc.operation = UBLK_IO_OP_REPORT_ZONES;
		desc.sector = sector;
		desc.nr_zones = zones_in_request;
		ret = ublk_zoned_insert_report_desc(req, &desc);
		if (ret)
			goto free_req;

		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
		if (ret)
			goto erase_desc;

		status = blk_execute_rq(req, 0);
		ret = blk_status_to_errno(status);
erase_desc:
		/* desc lives on this stack frame; unpublish it before reuse */
		ublk_zoned_erase_report_desc(req);
free_req:
		blk_mq_free_request(req);
		if (ret)
			goto out;

		for (unsigned int i = 0; i < zones_in_request; i++) {
			struct blk_zone *zone = buffer + i;

			/* A zero length zone means no more zones in this response */
			if (!zone->len)
				break;

			/*
			 * The zone index is relative to the whole report, not
			 * to the current request, so it must keep counting
			 * across requests.
			 */
			ret = disk_report_zone(disk, zone, done_zones, args);
			if (ret)
				goto out;

			done_zones++;
			reported++;
			sector += zone_size_sectors;
		}

		/*
		 * An empty response makes no progress; re-issuing the same
		 * request would loop forever, so stop with what we have.
		 */
		if (!reported)
			break;
	}

	ret = done_zones;

out:
	kvfree(buffer);
	return ret;
}
613
ublk_setup_iod_zoned(struct ublk_queue * ubq,struct request * req)614 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
615 struct request *req)
616 {
617 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
618 struct ublk_io *io = &ubq->ios[req->tag];
619 struct ublk_zoned_report_desc *desc;
620 u32 ublk_op;
621
622 switch (req_op(req)) {
623 case REQ_OP_ZONE_OPEN:
624 ublk_op = UBLK_IO_OP_ZONE_OPEN;
625 break;
626 case REQ_OP_ZONE_CLOSE:
627 ublk_op = UBLK_IO_OP_ZONE_CLOSE;
628 break;
629 case REQ_OP_ZONE_FINISH:
630 ublk_op = UBLK_IO_OP_ZONE_FINISH;
631 break;
632 case REQ_OP_ZONE_RESET:
633 ublk_op = UBLK_IO_OP_ZONE_RESET;
634 break;
635 case REQ_OP_ZONE_APPEND:
636 ublk_op = UBLK_IO_OP_ZONE_APPEND;
637 break;
638 case REQ_OP_ZONE_RESET_ALL:
639 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
640 break;
641 case REQ_OP_DRV_IN:
642 desc = ublk_zoned_get_report_desc(req);
643 if (!desc)
644 return BLK_STS_IOERR;
645 ublk_op = desc->operation;
646 switch (ublk_op) {
647 case UBLK_IO_OP_REPORT_ZONES:
648 iod->op_flags = ublk_op | ublk_req_build_flags(req);
649 iod->nr_zones = desc->nr_zones;
650 iod->start_sector = desc->sector;
651 return BLK_STS_OK;
652 default:
653 return BLK_STS_IOERR;
654 }
655 case REQ_OP_DRV_OUT:
656 /* We do not support drv_out */
657 return BLK_STS_NOTSUPP;
658 default:
659 return BLK_STS_IOERR;
660 }
661
662 iod->op_flags = ublk_op | ublk_req_build_flags(req);
663 iod->nr_sectors = blk_rq_sectors(req);
664 iod->start_sector = blk_rq_pos(req);
665 iod->addr = io->buf.addr;
666
667 return BLK_STS_OK;
668 }
669
670 #else
671
672 #define ublk_report_zones (NULL)
673
ublk_dev_param_zoned_validate(const struct ublk_device * ub)674 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
675 {
676 return -EOPNOTSUPP;
677 }
678
/* !CONFIG_BLK_DEV_ZONED stub: there is nothing to apply. */
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	return;
}
682
/* !CONFIG_BLK_DEV_ZONED stub: no zones to revalidate, report success. */
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	const int ok = 0;

	return ok;
}
687
ublk_setup_iod_zoned(struct ublk_queue * ubq,struct request * req)688 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
689 struct request *req)
690 {
691 return BLK_STS_NOTSUPP;
692 }
693
694 #endif
695
696 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
697 bool need_map, struct io_comp_batch *iob);
698
699 static dev_t ublk_chr_devt;
700 static const struct class ublk_chr_class = {
701 .name = "ublk-char",
702 };
703
704 static DEFINE_IDR(ublk_index_idr);
705 static DEFINE_SPINLOCK(ublk_idr_lock);
706 static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
707
708 static DEFINE_MUTEX(ublk_ctl_mutex);
709
710 static struct ublk_batch_fetch_cmd *
ublk_batch_alloc_fcmd(struct io_uring_cmd * cmd)711 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
712 {
713 struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
714
715 if (fcmd) {
716 fcmd->cmd = cmd;
717 fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
718 }
719 return fcmd;
720 }
721
/* Release a fetch command obtained from ublk_batch_alloc_fcmd(). */
static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
{
	void *mem = fcmd;

	kfree(mem);
}
726
__ublk_release_fcmd(struct ublk_queue * ubq)727 static void __ublk_release_fcmd(struct ublk_queue *ubq)
728 {
729 WRITE_ONCE(ubq->active_fcmd, NULL);
730 }
731
732 /*
733 * Nothing can move on, so clear ->active_fcmd, and the caller should stop
734 * dispatching
735 */
ublk_batch_deinit_fetch_buf(struct ublk_queue * ubq,const struct ublk_batch_io_data * data,struct ublk_batch_fetch_cmd * fcmd,int res)736 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
737 const struct ublk_batch_io_data *data,
738 struct ublk_batch_fetch_cmd *fcmd,
739 int res)
740 {
741 spin_lock(&ubq->evts_lock);
742 list_del_init(&fcmd->node);
743 WARN_ON_ONCE(fcmd != ubq->active_fcmd);
744 __ublk_release_fcmd(ubq);
745 spin_unlock(&ubq->evts_lock);
746
747 io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
748 ublk_batch_free_fcmd(fcmd);
749 }
750
ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd * fcmd,struct io_br_sel * sel,unsigned int issue_flags)751 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
752 struct io_br_sel *sel,
753 unsigned int issue_flags)
754 {
755 if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
756 return -ENOBUFS;
757 return 0;
758 }
759
ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd * fcmd,void __user * buf,const u16 * tag_buf,unsigned int len)760 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
761 void __user *buf, const u16 *tag_buf,
762 unsigned int len)
763 {
764 if (copy_to_user(buf, tag_buf, len))
765 return -EFAULT;
766 return len;
767 }
768
769 #define UBLK_MAX_UBLKS UBLK_MINORS
770
771 /*
772 * Max unprivileged ublk devices allowed to add
773 *
774 * It can be extended to one per-user limit in future or even controlled
775 * by cgroup.
776 */
777 static unsigned int unprivileged_ublks_max = 64;
778 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
779
780 static struct miscdevice ublk_misc;
781
/* Extract the queue id field from a file position inside the IO buffer area. */
static inline unsigned ublk_pos_to_hwq(loff_t pos)
{
	loff_t rel = pos - UBLKSRV_IO_BUF_OFFSET;

	return (rel >> UBLK_QID_OFF) & UBLK_QID_BITS_MASK;
}
787
/* Extract the in-buffer offset field from a file position inside the IO buffer area. */
static inline unsigned ublk_pos_to_buf_off(loff_t pos)
{
	loff_t rel = pos - UBLKSRV_IO_BUF_OFFSET;

	return rel & UBLK_IO_BUF_BITS_MASK;
}
792
/* Extract the tag field from a file position inside the IO buffer area. */
static inline unsigned ublk_pos_to_tag(loff_t pos)
{
	loff_t rel = pos - UBLKSRV_IO_BUF_OFFSET;

	return (rel >> UBLK_TAG_OFF) & UBLK_TAG_BITS_MASK;
}
798
ublk_dev_param_basic_apply(struct ublk_device * ub)799 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
800 {
801 const struct ublk_param_basic *p = &ub->params.basic;
802
803 if (p->attrs & UBLK_ATTR_READ_ONLY)
804 set_disk_ro(ub->ub_disk, true);
805
806 set_capacity(ub->ub_disk, p->dev_sectors);
807 }
808
/*
 * Translate LBMD_PI_CAP_* bits into BLK_INTEGRITY_* flags.
 *
 * Returns the translated flags, or -EINVAL if @flags contains any bit
 * other than LBMD_PI_CAP_INTEGRITY and LBMD_PI_CAP_REFTAG.
 */
static int ublk_integrity_flags(u32 flags)
{
	const u32 known = LBMD_PI_CAP_INTEGRITY | LBMD_PI_CAP_REFTAG;
	int blk_flags = 0;

	if (flags & ~known)
		return -EINVAL;

	if (flags & LBMD_PI_CAP_INTEGRITY)
		blk_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
	if (flags & LBMD_PI_CAP_REFTAG)
		blk_flags |= BLK_INTEGRITY_REF_TAG;

	return blk_flags;
}
823
/*
 * Size in bytes of the PI tuple for @csum_type: 0 for no checksum, 8 for
 * IP and CRC16-T10DIF checksums, 16 for CRC64-NVMe.
 *
 * Returns -EINVAL for an unknown checksum type.
 */
static int ublk_integrity_pi_tuple_size(u8 csum_type)
{
	if (csum_type == LBMD_PI_CSUM_NONE)
		return 0;

	if (csum_type == LBMD_PI_CSUM_IP ||
	    csum_type == LBMD_PI_CSUM_CRC16_T10DIF)
		return 8;

	if (csum_type == LBMD_PI_CSUM_CRC64_NVME)
		return 16;

	return -EINVAL;
}
838
/*
 * Map an LBMD_PI_CSUM_* value to the block layer's checksum enum.
 *
 * An unknown value triggers a one-time warning and maps to
 * BLK_INTEGRITY_CSUM_NONE.
 */
static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
{
	enum blk_integrity_checksum csum = BLK_INTEGRITY_CSUM_NONE;

	switch (csum_type) {
	case LBMD_PI_CSUM_NONE:
		break;
	case LBMD_PI_CSUM_IP:
		csum = BLK_INTEGRITY_CSUM_IP;
		break;
	case LBMD_PI_CSUM_CRC16_T10DIF:
		csum = BLK_INTEGRITY_CSUM_CRC;
		break;
	case LBMD_PI_CSUM_CRC64_NVME:
		csum = BLK_INTEGRITY_CSUM_CRC64;
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	return csum;
}
855
ublk_validate_params(const struct ublk_device * ub)856 static int ublk_validate_params(const struct ublk_device *ub)
857 {
858 /* basic param is the only one which must be set */
859 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
860 const struct ublk_param_basic *p = &ub->params.basic;
861
862 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
863 return -EINVAL;
864
865 if (p->logical_bs_shift > p->physical_bs_shift)
866 return -EINVAL;
867
868 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
869 return -EINVAL;
870
871 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
872 return -EINVAL;
873 } else
874 return -EINVAL;
875
876 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
877 const struct ublk_param_discard *p = &ub->params.discard;
878
879 /* So far, only support single segment discard */
880 if (p->max_discard_sectors && p->max_discard_segments != 1)
881 return -EINVAL;
882
883 if (!p->discard_granularity)
884 return -EINVAL;
885 }
886
887 /* dev_t is read-only */
888 if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
889 return -EINVAL;
890
891 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
892 return ublk_dev_param_zoned_validate(ub);
893 else if (ublk_dev_is_zoned(ub))
894 return -EINVAL;
895
896 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
897 const struct ublk_param_dma_align *p = &ub->params.dma;
898
899 if (p->alignment >= PAGE_SIZE)
900 return -EINVAL;
901
902 if (!is_power_of_2(p->alignment + 1))
903 return -EINVAL;
904 }
905
906 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
907 const struct ublk_param_segment *p = &ub->params.seg;
908
909 if (!is_power_of_2(p->seg_boundary_mask + 1))
910 return -EINVAL;
911
912 if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
913 return -EINVAL;
914 if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
915 return -EINVAL;
916 }
917
918 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
919 const struct ublk_param_integrity *p = &ub->params.integrity;
920 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
921 int flags = ublk_integrity_flags(p->flags);
922
923 if (!ublk_dev_support_integrity(ub))
924 return -EINVAL;
925 if (flags < 0)
926 return flags;
927 if (pi_tuple_size < 0)
928 return pi_tuple_size;
929 if (!p->metadata_size)
930 return -EINVAL;
931 if (p->csum_type == LBMD_PI_CSUM_NONE &&
932 p->flags & LBMD_PI_CAP_REFTAG)
933 return -EINVAL;
934 if (p->pi_offset + pi_tuple_size > p->metadata_size)
935 return -EINVAL;
936 if (p->interval_exp < SECTOR_SHIFT ||
937 p->interval_exp > ub->params.basic.logical_bs_shift)
938 return -EINVAL;
939 }
940
941 return 0;
942 }
943
ublk_apply_params(struct ublk_device * ub)944 static void ublk_apply_params(struct ublk_device *ub)
945 {
946 ublk_dev_param_basic_apply(ub);
947
948 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
949 ublk_dev_param_zoned_apply(ub);
950 }
951
ublk_need_map_io(const struct ublk_queue * ubq)952 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
953 {
954 return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
955 !ublk_support_auto_buf_reg(ubq);
956 }
957
ublk_dev_need_map_io(const struct ublk_device * ub)958 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
959 {
960 return !ublk_dev_support_user_copy(ub) &&
961 !ublk_dev_support_zero_copy(ub) &&
962 !ublk_dev_support_auto_buf_reg(ub);
963 }
964
ublk_need_req_ref(const struct ublk_queue * ubq)965 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
966 {
967 /*
968 * read()/write() is involved in user copy, so request reference
969 * has to be grabbed
970 *
971 * for zero copy, request buffer need to be registered to io_uring
972 * buffer table, so reference is needed
973 *
974 * For auto buffer register, ublk server still may issue
975 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
976 * so reference is required too.
977 */
978 return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
979 ublk_support_auto_buf_reg(ubq);
980 }
981
ublk_dev_need_req_ref(const struct ublk_device * ub)982 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
983 {
984 return ublk_dev_support_user_copy(ub) ||
985 ublk_dev_support_zero_copy(ub) ||
986 ublk_dev_support_auto_buf_reg(ub);
987 }
988
989 /*
990 * ublk IO Reference Counting Design
991 * ==================================
992 *
993 * For user-copy and zero-copy modes, ublk uses a split reference model with
994 * two counters that together track IO lifetime:
995 *
996 * - io->ref: refcount for off-task buffer registrations and user-copy ops
997 * - io->task_registered_buffers: count of buffers registered on the IO task
998 *
999 * Key Invariant:
1000 * --------------
1001 * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1002 * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1003 * when no active references exist. After IO completion, both counters become
1004 * zero. For I/Os not currently dispatched to the ublk server, both ref and
1005 * task_registered_buffers are 0.
1006 *
1007 * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1008 * exit to determine if all references have been released.
1009 *
1010 * Why Split Counters:
1011 * -------------------
1012 * Buffers registered on the IO daemon task can use the lightweight
1013 * task_registered_buffers counter (simple increment/decrement) instead of
1014 * atomic refcount operations. The ublk_io_release() callback checks if
1015 * current == io->task to decide which counter to update.
1016 *
1017 * This optimization only applies before IO completion. At completion,
1018 * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1019 * After that, all subsequent buffer unregistrations must use the atomic ref
1020 * since they may be releasing the last reference.
1021 *
1022 * Reference Lifecycle:
1023 * --------------------
1024 * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1025 *
1026 * 2. During IO processing:
1027 * - On-task buffer reg: task_registered_buffers++ (no ref change)
1028 * - Off-task buffer reg: ref++ via ublk_get_req_ref()
1029 * - Buffer unregister callback (ublk_io_release):
1030 * * If on-task: task_registered_buffers--
1031 * * If off-task: ref-- via ublk_put_req_ref()
1032 *
1033 * 3. ublk_sub_req_ref() at IO completion:
1034 * - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1035 * - Subtracts sub_refs from ref and zeroes task_registered_buffers
1036 * - This effectively collapses task_registered_buffers into the atomic ref,
1037 * accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1038 * buffers that were already counted
1039 *
1040 * Example (zero-copy, register on-task, unregister off-task):
1041 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1042 * - Register buffer on-task: task_registered_buffers = 1
1043 * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1044 * - Completion via ublk_sub_req_ref():
1045 * sub_refs = UBLK_REFCOUNT_INIT - 1,
1046 * ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1047 *
1048 * Example (auto buffer registration):
1049 * Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1050 *
1051 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1052 * - Buffer unregister: task_registered_buffers-- (becomes 0)
1053 * - Completion via ublk_sub_req_ref():
1054 * sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1055 *
1056 * Example (zero-copy, ublk server killed):
1057 * When daemon is killed, io_uring cleanup unregisters buffers off-task.
1058 * ublk_check_and_reset_active_ref() waits for the invariant to hold.
1059 *
1060 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1061 * - Register buffer on-task: task_registered_buffers = 1
1062 * - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1063 * ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1064 * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
 *   - Sum equals UBLK_REFCOUNT_INIT, so both counters are zeroed by
 *     ublk_check_and_reset_active_ref(), and ublk_abort_queue() can then
 *     proceed to abort the pending requests
1068 *
1069 * Batch IO Special Case:
1070 * ----------------------
1071 * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1072 * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1073 * task_registered_buffers counter still tracks registered buffers for the
1074 * invariant check, even though the callback doesn't decrement it.
1075 *
1076 * Note: updating task_registered_buffers is protected by io->lock.
1077 */
ublk_init_req_ref(const struct ublk_queue * ubq,struct ublk_io * io)1078 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1079 struct ublk_io *io)
1080 {
1081 if (ublk_need_req_ref(ubq))
1082 refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1083 }
1084
ublk_get_req_ref(struct ublk_io * io)1085 static inline bool ublk_get_req_ref(struct ublk_io *io)
1086 {
1087 return refcount_inc_not_zero(&io->ref);
1088 }
1089
ublk_put_req_ref(struct ublk_io * io,struct request * req)1090 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1091 {
1092 if (!refcount_dec_and_test(&io->ref))
1093 return;
1094
1095 /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1096 __ublk_complete_rq(req, io, false, NULL);
1097 }
1098
ublk_sub_req_ref(struct ublk_io * io)1099 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1100 {
1101 unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1102
1103 io->task_registered_buffers = 0;
1104 return refcount_sub_and_test(sub_refs, &io->ref);
1105 }
1106
ublk_need_get_data(const struct ublk_queue * ubq)1107 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1108 {
1109 return ubq->flags & UBLK_F_NEED_GET_DATA;
1110 }
1111
ublk_dev_need_get_data(const struct ublk_device * ub)1112 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1113 {
1114 return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1115 }
1116
1117 /* Called in slow path only, keep it noinline for trace purpose */
ublk_get_device(struct ublk_device * ub)1118 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1119 {
1120 if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1121 return ub;
1122 return NULL;
1123 }
1124
1125 /* Called in slow path only, keep it noinline for trace purpose */
ublk_put_device(struct ublk_device * ub)1126 static noinline void ublk_put_device(struct ublk_device *ub)
1127 {
1128 put_device(&ub->cdev_dev);
1129 }
1130
ublk_get_queue(struct ublk_device * dev,int qid)1131 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1132 int qid)
1133 {
1134 return dev->queues[qid];
1135 }
1136
ublk_rq_has_data(const struct request * rq)1137 static inline bool ublk_rq_has_data(const struct request *rq)
1138 {
1139 return bio_has_data(rq->bio);
1140 }
1141
1142 static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device * ub,int q_id)1143 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1144 {
1145 return ublk_get_queue(ub, q_id)->io_cmd_buf;
1146 }
1147
__ublk_queue_cmd_buf_size(int depth)1148 static inline int __ublk_queue_cmd_buf_size(int depth)
1149 {
1150 return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1151 }
1152
ublk_queue_cmd_buf_size(struct ublk_device * ub)1153 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1154 {
1155 return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1156 }
1157
ublk_max_cmd_buf_size(void)1158 static int ublk_max_cmd_buf_size(void)
1159 {
1160 return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1161 }
1162
1163 /*
1164 * Should I/O outstanding to the ublk server when it exits be reissued?
1165 * If not, outstanding I/O will get errors.
1166 */
ublk_nosrv_should_reissue_outstanding(struct ublk_device * ub)1167 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1168 {
1169 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1170 (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1171 }
1172
1173 /*
1174 * Should I/O issued while there is no ublk server queue? If not, I/O
1175 * issued while there is no ublk server will get errors.
1176 */
ublk_nosrv_dev_should_queue_io(struct ublk_device * ub)1177 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1178 {
1179 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1180 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1181 }
1182
1183 /*
1184 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1185 * of the device flags for smaller cache footprint - better for fast
1186 * paths.
1187 */
ublk_nosrv_should_queue_io(struct ublk_queue * ubq)1188 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1189 {
1190 return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1191 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1192 }
1193
1194 /*
1195 * Should ublk devices be stopped (i.e. no recovery possible) when the
1196 * ublk server exits? If not, devices can be used again by a future
1197 * incarnation of a ublk server via the start_recovery/end_recovery
1198 * commands.
1199 */
ublk_nosrv_should_stop_dev(struct ublk_device * ub)1200 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1201 {
1202 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1203 }
1204
ublk_dev_in_recoverable_state(struct ublk_device * ub)1205 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1206 {
1207 return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1208 ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1209 }
1210
ublk_free_disk(struct gendisk * disk)1211 static void ublk_free_disk(struct gendisk *disk)
1212 {
1213 struct ublk_device *ub = disk->private_data;
1214
1215 clear_bit(UB_STATE_USED, &ub->state);
1216 ublk_put_device(ub);
1217 }
1218
ublk_store_owner_uid_gid(unsigned int * owner_uid,unsigned int * owner_gid)1219 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1220 unsigned int *owner_gid)
1221 {
1222 kuid_t uid;
1223 kgid_t gid;
1224
1225 current_uid_gid(&uid, &gid);
1226
1227 *owner_uid = from_kuid(&init_user_ns, uid);
1228 *owner_gid = from_kgid(&init_user_ns, gid);
1229 }
1230
/*
 * ->open() handler for the ublk disk.
 *
 * Returns 0 when the open is allowed, -EPERM when an unprivileged device
 * is opened by somebody other than its owner, and -ENXIO while
 * ub->block_open is set.
 *
 * NOTE(review): callers with CAP_SYS_ADMIN return before ub->block_open
 * is looked at, so they are never refused with -ENXIO - confirm that
 * this is intended.
 */
static int ublk_open(struct gendisk *disk, blk_mode_t mode)
{
	struct ublk_device *ub = disk->private_data;

	if (capable(CAP_SYS_ADMIN))
		return 0;

	/*
	 * An unprivileged device may only be opened by its owner.
	 * Otherwise it could be a trap set up by an evil user who
	 * deliberately grants this disk's privileges to other users.
	 *
	 * This is reasonable too, given that anyone can create an
	 * unprivileged device and needs nobody else's grant for it.
	 */
	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		unsigned int opener_uid, opener_gid;

		ublk_store_owner_uid_gid(&opener_uid, &opener_gid);

		if (opener_uid != ub->dev_info.owner_uid)
			return -EPERM;
		if (opener_gid != ub->dev_info.owner_gid)
			return -EPERM;
	}

	return ub->block_open ? -ENXIO : 0;
}
1262
/* Block device operations table for ublk */
static const struct block_device_operations ub_fops = {
	.owner = THIS_MODULE,
	/* permission and block_open checks, see ublk_open() */
	.open = ublk_open,
	/* clears UB_STATE_USED and drops a device reference */
	.free_disk = ublk_free_disk,
	.report_zones = ublk_report_zones,
};
1269
ublk_copy_user_bvec(const struct bio_vec * bv,unsigned * offset,struct iov_iter * uiter,int dir,size_t * done)1270 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1271 struct iov_iter *uiter, int dir, size_t *done)
1272 {
1273 unsigned len;
1274 void *bv_buf;
1275 size_t copied;
1276
1277 if (*offset >= bv->bv_len) {
1278 *offset -= bv->bv_len;
1279 return true;
1280 }
1281
1282 len = bv->bv_len - *offset;
1283 bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1284 if (dir == ITER_DEST)
1285 copied = copy_to_iter(bv_buf, len, uiter);
1286 else
1287 copied = copy_from_iter(bv_buf, len, uiter);
1288
1289 kunmap_local(bv_buf);
1290
1291 *done += copied;
1292 if (copied < len)
1293 return false;
1294
1295 *offset = 0;
1296 return true;
1297 }
1298
1299 /*
1300 * Copy data between request pages and io_iter, and 'offset'
1301 * is the start point of linear offset of request.
1302 */
ublk_copy_user_pages(const struct request * req,unsigned offset,struct iov_iter * uiter,int dir)1303 static size_t ublk_copy_user_pages(const struct request *req,
1304 unsigned offset, struct iov_iter *uiter, int dir)
1305 {
1306 struct req_iterator iter;
1307 struct bio_vec bv;
1308 size_t done = 0;
1309
1310 rq_for_each_segment(bv, req, iter) {
1311 if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1312 break;
1313 }
1314 return done;
1315 }
1316
1317 #ifdef CONFIG_BLK_DEV_INTEGRITY
ublk_copy_user_integrity(const struct request * req,unsigned offset,struct iov_iter * uiter,int dir)1318 static size_t ublk_copy_user_integrity(const struct request *req,
1319 unsigned offset, struct iov_iter *uiter, int dir)
1320 {
1321 size_t done = 0;
1322 struct bio *bio = req->bio;
1323 struct bvec_iter iter;
1324 struct bio_vec iv;
1325
1326 if (!blk_integrity_rq(req))
1327 return 0;
1328
1329 bio_for_each_integrity_vec(iv, bio, iter) {
1330 if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1331 break;
1332 }
1333
1334 return done;
1335 }
1336 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
ublk_copy_user_integrity(const struct request * req,unsigned offset,struct iov_iter * uiter,int dir)1337 static size_t ublk_copy_user_integrity(const struct request *req,
1338 unsigned offset, struct iov_iter *uiter, int dir)
1339 {
1340 return 0;
1341 }
1342 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1343
ublk_need_map_req(const struct request * req)1344 static inline bool ublk_need_map_req(const struct request *req)
1345 {
1346 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1347 }
1348
ublk_need_unmap_req(const struct request * req)1349 static inline bool ublk_need_unmap_req(const struct request *req)
1350 {
1351 return ublk_rq_has_data(req) &&
1352 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1353 }
1354
ublk_map_io(const struct ublk_queue * ubq,const struct request * req,const struct ublk_io * io)1355 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1356 const struct request *req,
1357 const struct ublk_io *io)
1358 {
1359 const unsigned int rq_bytes = blk_rq_bytes(req);
1360
1361 if (!ublk_need_map_io(ubq))
1362 return rq_bytes;
1363
1364 /*
1365 * no zero copy, we delay copy WRITE request data into ublksrv
1366 * context and the big benefit is that pinning pages in current
1367 * context is pretty fast, see ublk_pin_user_pages
1368 */
1369 if (ublk_need_map_req(req)) {
1370 struct iov_iter iter;
1371 const int dir = ITER_DEST;
1372
1373 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1374 return ublk_copy_user_pages(req, 0, &iter, dir);
1375 }
1376 return rq_bytes;
1377 }
1378
/*
 * Copy io->res bytes from the server buffer at io->buf.addr into the
 * pages of a READ or DRV_IN request, when @need_map is set.
 *
 * Returns the number of bytes copied, or the full request size when
 * nothing has to be copied (@need_map is false, or the request is not a
 * READ/DRV_IN with data).
 */
static unsigned int ublk_unmap_io(bool need_map,
		const struct request *req,
		const struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);
	struct iov_iter iter;

	if (!need_map || !ublk_need_unmap_req(req))
		return rq_bytes;

	/* the server must not report more bytes than the request holds */
	WARN_ON_ONCE(io->res > rq_bytes);

	import_ubuf(ITER_SOURCE, u64_to_user_ptr(io->buf.addr), io->res, &iter);
	return ublk_copy_user_pages(req, 0, &iter, ITER_SOURCE);
}
1399
ublk_req_build_flags(struct request * req)1400 static inline unsigned int ublk_req_build_flags(struct request *req)
1401 {
1402 unsigned flags = 0;
1403
1404 if (req->cmd_flags & REQ_FAILFAST_DEV)
1405 flags |= UBLK_IO_F_FAILFAST_DEV;
1406
1407 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1408 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1409
1410 if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1411 flags |= UBLK_IO_F_FAILFAST_DRIVER;
1412
1413 if (req->cmd_flags & REQ_META)
1414 flags |= UBLK_IO_F_META;
1415
1416 if (req->cmd_flags & REQ_FUA)
1417 flags |= UBLK_IO_F_FUA;
1418
1419 if (req->cmd_flags & REQ_NOUNMAP)
1420 flags |= UBLK_IO_F_NOUNMAP;
1421
1422 if (req->cmd_flags & REQ_SWAP)
1423 flags |= UBLK_IO_F_SWAP;
1424
1425 if (blk_integrity_rq(req))
1426 flags |= UBLK_IO_F_INTEGRITY;
1427
1428 return flags;
1429 }
1430
ublk_setup_iod(struct ublk_queue * ubq,struct request * req)1431 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1432 {
1433 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1434 struct ublk_io *io = &ubq->ios[req->tag];
1435 u32 ublk_op;
1436
1437 switch (req_op(req)) {
1438 case REQ_OP_READ:
1439 ublk_op = UBLK_IO_OP_READ;
1440 break;
1441 case REQ_OP_WRITE:
1442 ublk_op = UBLK_IO_OP_WRITE;
1443 break;
1444 case REQ_OP_FLUSH:
1445 ublk_op = UBLK_IO_OP_FLUSH;
1446 break;
1447 case REQ_OP_DISCARD:
1448 ublk_op = UBLK_IO_OP_DISCARD;
1449 break;
1450 case REQ_OP_WRITE_ZEROES:
1451 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1452 break;
1453 default:
1454 if (ublk_queue_is_zoned(ubq))
1455 return ublk_setup_iod_zoned(ubq, req);
1456 return BLK_STS_IOERR;
1457 }
1458
1459 /* need to translate since kernel may change */
1460 iod->op_flags = ublk_op | ublk_req_build_flags(req);
1461 iod->nr_sectors = blk_rq_sectors(req);
1462 iod->start_sector = blk_rq_pos(req);
1463 iod->addr = io->buf.addr;
1464
1465 return BLK_STS_OK;
1466 }
1467
ublk_get_uring_cmd_pdu(struct io_uring_cmd * ioucmd)1468 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1469 struct io_uring_cmd *ioucmd)
1470 {
1471 return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1472 }
1473
/*
 * End @req with status @error through blk_mq_end_request(), with
 * bottom halves disabled around the call.
 *
 * NOTE(review): presumably for the reason spelled out in
 * __ublk_complete_rq() (bio->bi_end_io() must run with softirqs
 * disabled) - confirm.
 */
static void ublk_end_request(struct request *req, blk_status_t error)
{
	local_bh_disable();
	blk_mq_end_request(req, error);
	local_bh_enable();
}
1480
/*
 * Complete @req using the result the ublk server left in io->res.
 *
 * @need_map: when true, the payload of a READ/DRV_IN request is first
 *            copied from the server buffer into the request pages
 * @iob: handed to blk_mq_add_to_batch() so that the request may be ended
 *       as part of a batch; ublk_put_req_ref() passes NULL
 *
 * A negative io->res, or a READ that returned 0 bytes, ends the request
 * with an error status.
 *
 * todo: handle partial completion
 */
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
				      bool need_map, struct io_comp_batch *iob)
{
	unsigned int unmapped_bytes;
	blk_status_t res = BLK_STS_OK;
	bool requeue;

	/* failed read IO if nothing is read */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		res = errno_to_blk_status(io->res);
		goto exit;
	}

	/*
	 * Operations other than READ, WRITE and DRV_IN (e.g. FLUSH, DISCARD
	 * or WRITE_ZEROES) usually don't report a byte count, so end them
	 * directly.
	 *
	 * None of them needs to be unmapped either.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
	    req_op(req) != REQ_OP_DRV_IN)
		goto exit;

	/* for READ request, writing data in iod->addr to rq buffers */
	unmapped_bytes = ublk_unmap_io(need_map, req, io);

	/*
	 * Fewer bytes were copied than the server reported. This is very
	 * unlikely since the data was filled in just before.
	 *
	 * Shrink the result so that only the copied part is completed and
	 * the rest is simply re-read.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	/*
	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
	 * happens off this path, then that will prevent ublk's blkdev_release()
	 * from being called on current's task work, see fput() implementation.
	 *
	 * Otherwise, ublk server may not provide forward progress in case of
	 * reading the partition table from bdev_open() with disk->open_mutex
	 * held, and causes dead lock as we could already be holding
	 * disk->open_mutex here.
	 *
	 * Preferably we would not be doing IO with a mutex held that is also
	 * used for release, but this work-around will suffice for now.
	 */
	local_bh_disable();
	requeue = blk_update_request(req, BLK_STS_OK, io->res);
	local_bh_enable();
	/* blk_update_request() asked for a requeue: send the request back */
	if (requeue)
		blk_mq_requeue_request(req, true);
	else if (likely(!blk_should_fake_timeout(req->q))) {
		/* ended later with the batch if it could be added to @iob */
		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
			return;
		__blk_mq_end_request(req, BLK_STS_OK);
	}

	return;
exit:
	ublk_end_request(req, res);
}
1547
__ublk_prep_compl_io_cmd(struct ublk_io * io,struct request * req)1548 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1549 struct request *req)
1550 {
1551 /* read cmd first because req will overwrite it */
1552 struct io_uring_cmd *cmd = io->cmd;
1553
1554 /* mark this cmd owned by ublksrv */
1555 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1556
1557 /*
1558 * clear ACTIVE since we are done with this sqe/cmd slot
1559 * We can only accept io cmd in case of being not active.
1560 */
1561 io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1562
1563 io->req = req;
1564 return cmd;
1565 }
1566
/*
 * Hand @io and @req over to the ublk server and complete the io's
 * pending uring command with result @res.
 */
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
				 int res, unsigned issue_flags)
{
	struct io_uring_cmd *fetch_cmd;

	fetch_cmd = __ublk_prep_compl_io_cmd(io, req);

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(fetch_cmd, res, issue_flags);
}
1575
/* Delay, in milliseconds, before the requeue list is kicked again */
#define UBLK_REQUEUE_DELAY_MS	3
1577
__ublk_abort_rq(struct ublk_queue * ubq,struct request * rq)1578 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1579 struct request *rq)
1580 {
1581 /* We cannot process this rq so just requeue it. */
1582 if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1583 blk_mq_requeue_request(rq, false);
1584 else
1585 ublk_end_request(rq, BLK_STS_IOERR);
1586 }
1587
1588 static void
ublk_auto_buf_reg_fallback(const struct ublk_queue * ubq,unsigned tag)1589 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1590 {
1591 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1592
1593 iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1594 }
1595
/* Outcome of auto buffer registration, see ublk_auto_buf_register() */
enum auto_buf_reg_res {
	/* registration failed and the request was ended with an error */
	AUTO_BUF_REG_FAIL,
	/* not registered; the io is still dispatched to the server */
	AUTO_BUF_REG_FALLBACK,
	/* the request buffer was registered */
	AUTO_BUF_REG_OK,
};
1601
1602 /*
1603 * Setup io state after auto buffer registration.
1604 *
1605 * Must be called after ublk_auto_buf_register() is done.
1606 * Caller must hold io->lock in batch context.
1607 */
ublk_auto_buf_io_setup(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io,struct io_uring_cmd * cmd,enum auto_buf_reg_res res)1608 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1609 struct request *req, struct ublk_io *io,
1610 struct io_uring_cmd *cmd,
1611 enum auto_buf_reg_res res)
1612 {
1613 if (res == AUTO_BUF_REG_OK) {
1614 io->task_registered_buffers = 1;
1615 io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1616 io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1617 }
1618 ublk_init_req_ref(ubq, io);
1619 __ublk_prep_compl_io_cmd(io, req);
1620 }
1621
1622 /* Register request bvec to io_uring for auto buffer registration. */
1623 static enum auto_buf_reg_res
ublk_auto_buf_register(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io,struct io_uring_cmd * cmd,unsigned int issue_flags)1624 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1625 struct ublk_io *io, struct io_uring_cmd *cmd,
1626 unsigned int issue_flags)
1627 {
1628 int ret;
1629
1630 ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1631 io->buf.auto_reg.index, issue_flags);
1632 if (ret) {
1633 if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1634 ublk_auto_buf_reg_fallback(ubq, req->tag);
1635 return AUTO_BUF_REG_FALLBACK;
1636 }
1637 ublk_end_request(req, BLK_STS_IOERR);
1638 return AUTO_BUF_REG_FAIL;
1639 }
1640
1641 return AUTO_BUF_REG_OK;
1642 }
1643
1644 /*
1645 * Dispatch IO to userspace with auto buffer registration.
1646 *
1647 * Only called in non-batch context from task work, io->lock not held.
1648 */
ublk_auto_buf_dispatch(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io,struct io_uring_cmd * cmd,unsigned int issue_flags)1649 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1650 struct request *req, struct ublk_io *io,
1651 struct io_uring_cmd *cmd,
1652 unsigned int issue_flags)
1653 {
1654 enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1655 issue_flags);
1656
1657 if (res != AUTO_BUF_REG_FAIL) {
1658 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1659 io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1660 }
1661 }
1662
ublk_start_io(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io)1663 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1664 struct ublk_io *io)
1665 {
1666 unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1667
1668 /* partially mapped, update io descriptor */
1669 if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1670 /*
1671 * Nothing mapped, retry until we succeed.
1672 *
1673 * We may never succeed in mapping any bytes here because
1674 * of OOM. TODO: reserve one buffer with single page pinned
1675 * for providing forward progress guarantee.
1676 */
1677 if (unlikely(!mapped_bytes)) {
1678 blk_mq_requeue_request(req, false);
1679 blk_mq_delay_kick_requeue_list(req->q,
1680 UBLK_REQUEUE_DELAY_MS);
1681 return false;
1682 }
1683
1684 ublk_get_iod(ubq, req->tag)->nr_sectors =
1685 mapped_bytes >> 9;
1686 }
1687
1688 return true;
1689 }
1690
ublk_dispatch_req(struct ublk_queue * ubq,struct request * req)1691 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1692 {
1693 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1694 int tag = req->tag;
1695 struct ublk_io *io = &ubq->ios[tag];
1696
1697 pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1698 __func__, ubq->q_id, req->tag, io->flags,
1699 ublk_get_iod(ubq, req->tag)->addr);
1700
1701 /*
1702 * Task is exiting if either:
1703 *
1704 * (1) current != io->task.
1705 * io_uring_cmd_complete_in_task() tries to run task_work
1706 * in a workqueue if cmd's task is PF_EXITING.
1707 *
1708 * (2) current->flags & PF_EXITING.
1709 */
1710 if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1711 __ublk_abort_rq(ubq, req);
1712 return;
1713 }
1714
1715 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1716 /*
1717 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1718 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1719 * and notify it.
1720 */
1721 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1722 pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1723 __func__, ubq->q_id, req->tag, io->flags);
1724 ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1725 issue_flags);
1726 return;
1727 }
1728
1729 if (!ublk_start_io(ubq, req, io))
1730 return;
1731
1732 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1733 ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1734 } else {
1735 ublk_init_req_ref(ubq, io);
1736 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1737 }
1738 }
1739
__ublk_batch_prep_dispatch(struct ublk_queue * ubq,const struct ublk_batch_io_data * data,unsigned short tag)1740 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1741 const struct ublk_batch_io_data *data,
1742 unsigned short tag)
1743 {
1744 struct ublk_device *ub = data->ub;
1745 struct ublk_io *io = &ubq->ios[tag];
1746 struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1747 enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1748 struct io_uring_cmd *cmd = data->cmd;
1749
1750 if (!ublk_start_io(ubq, req, io))
1751 return false;
1752
1753 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1754 res = ublk_auto_buf_register(ubq, req, io, cmd,
1755 data->issue_flags);
1756
1757 if (res == AUTO_BUF_REG_FAIL)
1758 return false;
1759 }
1760
1761 ublk_io_lock(io);
1762 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1763 ublk_io_unlock(io);
1764
1765 return true;
1766 }
1767
ublk_batch_prep_dispatch(struct ublk_queue * ubq,const struct ublk_batch_io_data * data,unsigned short * tag_buf,unsigned int len)1768 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1769 const struct ublk_batch_io_data *data,
1770 unsigned short *tag_buf,
1771 unsigned int len)
1772 {
1773 bool has_unused = false;
1774 unsigned int i;
1775
1776 for (i = 0; i < len; i++) {
1777 unsigned short tag = tag_buf[i];
1778
1779 if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1780 tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1781 has_unused = true;
1782 }
1783 }
1784
1785 return has_unused;
1786 }
1787
1788 /*
1789 * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1790 * Returns the new length after filtering.
1791 */
ublk_filter_unused_tags(unsigned short * tag_buf,unsigned int len)1792 static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1793 unsigned int len)
1794 {
1795 unsigned int i, j;
1796
1797 for (i = 0, j = 0; i < len; i++) {
1798 if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1799 if (i != j)
1800 tag_buf[j] = tag_buf[i];
1801 j++;
1802 }
1803 }
1804
1805 return j;
1806 }
1807
/* Capacity, in tags, of the on-stack buffer of __ublk_batch_dispatch() */
#define MAX_NR_TAG 128
/*
 * Pull one batch of tags from ubq->evts_fifo, prepare each io for the
 * ublk server and post the surviving tags through @fcmd.
 *
 * Returns a negative value when buffer selection fails (-ENOBUFS when no
 * buffer address was selected), or when copying the tags or posting the
 * CQE fails; in the latter case the prepared ios are rolled back and
 * their tags are put back into the FIFO. When every pulled tag was
 * dropped during preparation, the number of pulled tags is returned.
 * Otherwise the result of ublk_batch_fetch_post_cqe() is returned.
 */
static int __ublk_batch_dispatch(struct ublk_queue *ubq,
				 const struct ublk_batch_io_data *data,
				 struct ublk_batch_fetch_cmd *fcmd)
{
	const unsigned int tag_sz = sizeof(unsigned short);
	unsigned short tag_buf[MAX_NR_TAG];
	struct io_br_sel sel;
	size_t len = 0;
	bool needs_filter;
	int ret;

	WARN_ON_ONCE(data->cmd != fcmd->cmd);

	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
					 data->issue_flags);
	if (sel.val < 0)
		return sel.val;
	if (!sel.addr)
		return -ENOBUFS;

	/* single reader needn't lock and sizeof(kfifo element) is 2 bytes */
	/* len turns from a byte count into a tag count, capped by tag_buf */
	len = min(len, sizeof(tag_buf)) / tag_sz;
	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);

	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
	/* Filter out unused tags before posting to userspace */
	if (unlikely(needs_filter)) {
		int new_len = ublk_filter_unused_tags(tag_buf, len);

		/* return actual length if all are failed or requeued */
		if (!new_len) {
			/* release the selected buffer */
			sel.val = 0;
			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
						&sel, data->issue_flags));
			return len;
		}
		len = new_len;
	}

	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
	if (unlikely(ret < 0)) {
		int i, res;

		/*
		 * Undo prep state for all IOs since userspace never received them.
		 * This restores IOs to pre-prepared state so they can be cleanly
		 * re-prepared when tags are pulled from FIFO again.
		 */
		for (i = 0; i < len; i++) {
			struct ublk_io *io = &ubq->ios[tag_buf[i]];
			/* -1: no auto registered buffer to unregister */
			int index = -1;

			ublk_io_lock(io);
			if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
				index = io->buf.auto_reg.index;
			io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
			io->flags |= UBLK_IO_FLAG_ACTIVE;
			ublk_io_unlock(io);

			/* unregister only after io->lock has been dropped */
			if (index != -1)
				io_buffer_unregister_bvec(data->cmd, index,
						data->issue_flags);
		}

		/* put the tags back so that they get dispatched again later */
		res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
			tag_buf, len, &ubq->evts_lock);

		pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
				"tags(%d %zu) ret %d\n", __func__, res, len,
				ret);
	}
	return ret;
}
1884
/*
 * Try to make a fetch command the active one of @ubq.
 *
 * Returns NULL when ubq->active_fcmd is already set. Otherwise the first
 * entry of ubq->fcmd_head (NULL if the list is empty) is stored in
 * ubq->active_fcmd and returned.
 *
 * The caller must hold ubq->evts_lock.
 */
static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
		struct ublk_queue *ubq)
{
	struct ublk_batch_fetch_cmd *fcmd;

	lockdep_assert_held(&ubq->evts_lock);

	/*
	 * Order updating ubq->evts_fifo against checking ubq->active_fcmd.
	 *
	 * This pairs with the smp_mb() in ublk_batch_dispatch().
	 *
	 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
	 * are visible in ublk_batch_dispatch() thanks to the barrier pairing.
	 */
	smp_mb();
	if (READ_ONCE(ubq->active_fcmd)) {
		fcmd = NULL;
	} else {
		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
				struct ublk_batch_fetch_cmd, node);
		WRITE_ONCE(ubq->active_fcmd, fcmd);
	}
	return fcmd;
}
1910
/*
 * Task work callback of a batch fetch command: run ublk_batch_dispatch()
 * for the queue and fetch command recorded in the command's pdu.
 */
static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct ublk_queue *ubq = pdu->ubq;
	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
	struct ublk_batch_io_data data = {
		.ub = ubq->dev,
		.cmd = fcmd->cmd,
		.issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS,
	};

	/* this fetch command is expected to be the queue's active one */
	WARN_ON_ONCE(ubq->active_fcmd != fcmd);

	ublk_batch_dispatch(ubq, &data, fcmd);
}
1927
/*
 * Drain ubq->evts_fifo through @fcmd.
 *
 * Batches are dispatched until the FIFO is empty or a batch returns a
 * value <= 0. A negative result tears the fetch buffer down through
 * ublk_batch_deinit_fetch_buf(). Otherwise @fcmd is released and, if
 * more tags have shown up in the meantime, a fetch command is acquired
 * again: the same one is reused inline for a bounded number of rounds,
 * any other case is deferred to task work.
 */
static void
ublk_batch_dispatch(struct ublk_queue *ubq,
		    const struct ublk_batch_io_data *data,
		    struct ublk_batch_fetch_cmd *fcmd)
{
	struct ublk_batch_fetch_cmd *new_fcmd;
	unsigned tried = 0;
	int ret = 0;

again:
	while (!ublk_io_evts_empty(ubq)) {
		ret = __ublk_batch_dispatch(ubq, data, fcmd);
		if (ret <= 0)
			break;
	}

	if (ret < 0) {
		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
		return;
	}

	__ublk_release_fcmd(ubq);
	/*
	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
	 * checking ubq->evts_fifo.
	 *
	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
	 */
	smp_mb();
	if (likely(ublk_io_evts_empty(ubq)))
		return;

	/* new tags were queued after the drain loop: try to take over again */
	spin_lock(&ubq->evts_lock);
	new_fcmd = __ublk_acquire_fcmd(ubq);
	spin_unlock(&ubq->evts_lock);

	/* another command is already active, or none is available */
	if (!new_fcmd)
		return;

	/* Avoid lockup by allowing to handle at most 32 batches */
	if (new_fcmd == fcmd && tried++ < 32)
		goto again;

	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
}
1973
/*
 * Task-work callback of a per-io uring_cmd: dispatch the request that
 * ublk_queue_cmd() stored in the command's pdu.
 */
static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct ublk_uring_cmd_pdu *pdu =
		ublk_get_uring_cmd_pdu(io_uring_cmd_from_tw(tw_req));

	ublk_dispatch_req(pdu->ubq, pdu->req);
}
1982
/*
 * Queue the tag of @rq into the queue's event fifo.  When @last is set, also
 * try to acquire a fetch command and schedule its task work to dispatch the
 * accumulated tags.
 */
static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
{
	struct ublk_batch_fetch_cmd *fcmd;
	unsigned short tag = rq->tag;

	spin_lock(&ubq->evts_lock);
	kfifo_put(&ubq->evts_fifo, tag);
	fcmd = last ? __ublk_acquire_fcmd(ubq) : NULL;
	spin_unlock(&ubq->evts_lock);

	if (!fcmd)
		return;

	io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
}
1997
ublk_queue_cmd(struct ublk_queue * ubq,struct request * rq)1998 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1999 {
2000 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2001 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2002
2003 pdu->req = rq;
2004 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2005 }
2006
/*
 * Task-work callback for a request list: walk the chain stored in the pdu,
 * unlink each request and dispatch it on the queue of its hardware context.
 */
static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	struct request *cur = ublk_get_uring_cmd_pdu(cmd)->req_list;

	do {
		/* fetch the successor before the link is cleared */
		struct request *following = cur->rq_next;

		cur->rq_next = NULL;
		ublk_dispatch_req(cur->mq_hctx->driver_data, cur);
		cur = following;
	} while (cur);
}
2021
ublk_queue_cmd_list(struct ublk_io * io,struct rq_list * l)2022 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2023 {
2024 struct io_uring_cmd *cmd = io->cmd;
2025 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2026
2027 pdu->req_list = rq_list_peek(l);
2028 rq_list_init(l);
2029 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2030 }
2031
ublk_timeout(struct request * rq)2032 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2033 {
2034 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2035 pid_t tgid = ubq->dev->ublksrv_tgid;
2036 struct task_struct *p;
2037 struct pid *pid;
2038
2039 if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2040 return BLK_EH_RESET_TIMER;
2041
2042 if (unlikely(!tgid))
2043 return BLK_EH_RESET_TIMER;
2044
2045 rcu_read_lock();
2046 pid = find_vpid(tgid);
2047 p = pid_task(pid, PIDTYPE_PID);
2048 if (p)
2049 send_sig(SIGKILL, p, 0);
2050 rcu_read_unlock();
2051 return BLK_EH_DONE;
2052 }
2053
ublk_prep_req(struct ublk_queue * ubq,struct request * rq,bool check_cancel)2054 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2055 bool check_cancel)
2056 {
2057 blk_status_t res;
2058
2059 if (unlikely(READ_ONCE(ubq->fail_io)))
2060 return BLK_STS_TARGET;
2061
2062 /* With recovery feature enabled, force_abort is set in
2063 * ublk_stop_dev() before calling del_gendisk(). We have to
2064 * abort all requeued and new rqs here to let del_gendisk()
2065 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
2066 * to avoid UAF on io_uring ctx.
2067 *
2068 * Note: force_abort is guaranteed to be seen because it is set
2069 * before request queue is unqiuesced.
2070 */
2071 if (ublk_nosrv_should_queue_io(ubq) &&
2072 unlikely(READ_ONCE(ubq->force_abort)))
2073 return BLK_STS_IOERR;
2074
2075 if (check_cancel && unlikely(ubq->canceling))
2076 return BLK_STS_IOERR;
2077
2078 /* fill iod to slot in io cmd buffer */
2079 res = ublk_setup_iod(ubq, rq);
2080 if (unlikely(res != BLK_STS_OK))
2081 return BLK_STS_IOERR;
2082
2083 blk_mq_start_request(rq);
2084 return BLK_STS_OK;
2085 }
2086
2087 /*
2088 * Common helper for queue_rq that handles request preparation and
2089 * cancellation checks. Returns status and sets should_queue to indicate
2090 * whether the caller should proceed with queuing the request.
2091 */
__ublk_queue_rq_common(struct ublk_queue * ubq,struct request * rq,bool * should_queue)2092 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2093 struct request *rq,
2094 bool *should_queue)
2095 {
2096 blk_status_t res;
2097
2098 res = ublk_prep_req(ubq, rq, false);
2099 if (res != BLK_STS_OK) {
2100 *should_queue = false;
2101 return res;
2102 }
2103
2104 /*
2105 * ->canceling has to be handled after ->force_abort and ->fail_io
2106 * is dealt with, otherwise this request may not be failed in case
2107 * of recovery, and cause hang when deleting disk
2108 */
2109 if (unlikely(ubq->canceling)) {
2110 *should_queue = false;
2111 __ublk_abort_rq(ubq, rq);
2112 return BLK_STS_OK;
2113 }
2114
2115 *should_queue = true;
2116 return BLK_STS_OK;
2117 }
2118
ublk_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)2119 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2120 const struct blk_mq_queue_data *bd)
2121 {
2122 struct ublk_queue *ubq = hctx->driver_data;
2123 struct request *rq = bd->rq;
2124 bool should_queue;
2125 blk_status_t res;
2126
2127 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2128 if (!should_queue)
2129 return res;
2130
2131 ublk_queue_cmd(ubq, rq);
2132 return BLK_STS_OK;
2133 }
2134
ublk_batch_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)2135 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2136 const struct blk_mq_queue_data *bd)
2137 {
2138 struct ublk_queue *ubq = hctx->driver_data;
2139 struct request *rq = bd->rq;
2140 bool should_queue;
2141 blk_status_t res;
2142
2143 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2144 if (!should_queue)
2145 return res;
2146
2147 ublk_batch_queue_cmd(ubq, rq, bd->last);
2148 return BLK_STS_OK;
2149 }
2150
ublk_belong_to_same_batch(const struct ublk_io * io,const struct ublk_io * io2)2151 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2152 const struct ublk_io *io2)
2153 {
2154 return (io_uring_cmd_ctx_handle(io->cmd) ==
2155 io_uring_cmd_ctx_handle(io2->cmd)) &&
2156 (io->task == io2->task);
2157 }
2158
ublk_commit_rqs(struct blk_mq_hw_ctx * hctx)2159 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2160 {
2161 struct ublk_queue *ubq = hctx->driver_data;
2162 struct ublk_batch_fetch_cmd *fcmd;
2163
2164 spin_lock(&ubq->evts_lock);
2165 fcmd = __ublk_acquire_fcmd(ubq);
2166 spin_unlock(&ubq->evts_lock);
2167
2168 if (fcmd)
2169 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2170 }
2171
ublk_queue_rqs(struct rq_list * rqlist)2172 static void ublk_queue_rqs(struct rq_list *rqlist)
2173 {
2174 struct rq_list requeue_list = { };
2175 struct rq_list submit_list = { };
2176 struct ublk_io *io = NULL;
2177 struct request *req;
2178
2179 while ((req = rq_list_pop(rqlist))) {
2180 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2181 struct ublk_io *this_io = &this_q->ios[req->tag];
2182
2183 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2184 rq_list_add_tail(&requeue_list, req);
2185 continue;
2186 }
2187
2188 if (io && !ublk_belong_to_same_batch(io, this_io) &&
2189 !rq_list_empty(&submit_list))
2190 ublk_queue_cmd_list(io, &submit_list);
2191 io = this_io;
2192 rq_list_add_tail(&submit_list, req);
2193 }
2194
2195 if (!rq_list_empty(&submit_list))
2196 ublk_queue_cmd_list(io, &submit_list);
2197 *rqlist = requeue_list;
2198 }
2199
ublk_batch_queue_cmd_list(struct ublk_queue * ubq,struct rq_list * l)2200 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2201 {
2202 unsigned short tags[MAX_NR_TAG];
2203 struct ublk_batch_fetch_cmd *fcmd;
2204 struct request *rq;
2205 unsigned cnt = 0;
2206
2207 spin_lock(&ubq->evts_lock);
2208 rq_list_for_each(l, rq) {
2209 tags[cnt++] = (unsigned short)rq->tag;
2210 if (cnt >= MAX_NR_TAG) {
2211 kfifo_in(&ubq->evts_fifo, tags, cnt);
2212 cnt = 0;
2213 }
2214 }
2215 if (cnt)
2216 kfifo_in(&ubq->evts_fifo, tags, cnt);
2217 fcmd = __ublk_acquire_fcmd(ubq);
2218 spin_unlock(&ubq->evts_lock);
2219
2220 rq_list_init(l);
2221 if (fcmd)
2222 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2223 }
2224
ublk_batch_queue_rqs(struct rq_list * rqlist)2225 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2226 {
2227 struct rq_list requeue_list = { };
2228 struct rq_list submit_list = { };
2229 struct ublk_queue *ubq = NULL;
2230 struct request *req;
2231
2232 while ((req = rq_list_pop(rqlist))) {
2233 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2234
2235 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2236 rq_list_add_tail(&requeue_list, req);
2237 continue;
2238 }
2239
2240 if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2241 ublk_batch_queue_cmd_list(ubq, &submit_list);
2242 ubq = this_q;
2243 rq_list_add_tail(&submit_list, req);
2244 }
2245
2246 if (!rq_list_empty(&submit_list))
2247 ublk_batch_queue_cmd_list(ubq, &submit_list);
2248 *rqlist = requeue_list;
2249 }
2250
ublk_init_hctx(struct blk_mq_hw_ctx * hctx,void * driver_data,unsigned int hctx_idx)2251 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2252 unsigned int hctx_idx)
2253 {
2254 struct ublk_device *ub = driver_data;
2255 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2256
2257 hctx->driver_data = ubq;
2258 return 0;
2259 }
2260
2261 static const struct blk_mq_ops ublk_mq_ops = {
2262 .queue_rq = ublk_queue_rq,
2263 .queue_rqs = ublk_queue_rqs,
2264 .init_hctx = ublk_init_hctx,
2265 .timeout = ublk_timeout,
2266 };
2267
2268 static const struct blk_mq_ops ublk_batch_mq_ops = {
2269 .commit_rqs = ublk_commit_rqs,
2270 .queue_rq = ublk_batch_queue_rq,
2271 .queue_rqs = ublk_batch_queue_rqs,
2272 .init_hctx = ublk_init_hctx,
2273 .timeout = ublk_timeout,
2274 };
2275
ublk_queue_reinit(struct ublk_device * ub,struct ublk_queue * ubq)2276 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2277 {
2278 int i;
2279
2280 ubq->nr_io_ready = 0;
2281
2282 for (i = 0; i < ubq->q_depth; i++) {
2283 struct ublk_io *io = &ubq->ios[i];
2284
2285 /*
2286 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
2287 * io->cmd
2288 */
2289 io->flags &= UBLK_IO_FLAG_CANCELED;
2290 io->cmd = NULL;
2291 io->buf.addr = 0;
2292
2293 /*
2294 * old task is PF_EXITING, put it now
2295 *
2296 * It could be NULL in case of closing one quiesced
2297 * device.
2298 */
2299 if (io->task) {
2300 put_task_struct(io->task);
2301 io->task = NULL;
2302 }
2303
2304 WARN_ON_ONCE(refcount_read(&io->ref));
2305 WARN_ON_ONCE(io->task_registered_buffers);
2306 }
2307 }
2308
ublk_ch_open(struct inode * inode,struct file * filp)2309 static int ublk_ch_open(struct inode *inode, struct file *filp)
2310 {
2311 struct ublk_device *ub = container_of(inode->i_cdev,
2312 struct ublk_device, cdev);
2313
2314 if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2315 return -EBUSY;
2316 filp->private_data = ub;
2317 ub->ublksrv_tgid = current->tgid;
2318 return 0;
2319 }
2320
ublk_reset_ch_dev(struct ublk_device * ub)2321 static void ublk_reset_ch_dev(struct ublk_device *ub)
2322 {
2323 int i;
2324
2325 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2326 ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2327
2328 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2329 ub->mm = NULL;
2330 ub->nr_queue_ready = 0;
2331 ub->unprivileged_daemons = false;
2332 ub->ublksrv_tgid = -1;
2333 }
2334
ublk_get_disk(struct ublk_device * ub)2335 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2336 {
2337 struct gendisk *disk;
2338
2339 spin_lock(&ub->lock);
2340 disk = ub->ub_disk;
2341 if (disk)
2342 get_device(disk_to_dev(disk));
2343 spin_unlock(&ub->lock);
2344
2345 return disk;
2346 }
2347
/* Drop the device reference of @disk; a NULL @disk is silently ignored */
static void ublk_put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
2353
ublk_partition_scan_work(struct work_struct * work)2354 static void ublk_partition_scan_work(struct work_struct *work)
2355 {
2356 struct ublk_device *ub =
2357 container_of(work, struct ublk_device, partition_scan_work);
2358 /* Hold disk reference to prevent UAF during concurrent teardown */
2359 struct gendisk *disk = ublk_get_disk(ub);
2360
2361 if (!disk)
2362 return;
2363
2364 if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2365 &disk->state)))
2366 goto out;
2367
2368 mutex_lock(&disk->open_mutex);
2369 bdev_disk_changed(disk, false);
2370 mutex_unlock(&disk->open_mutex);
2371 out:
2372 ublk_put_disk(disk);
2373 }
2374
2375 /*
2376 * Use this function to ensure that ->canceling is consistently set for
2377 * the device and all queues. Do not set these flags directly.
2378 *
2379 * Caller must ensure that:
2380 * - cancel_mutex is held. This ensures that there is no concurrent
2381 * access to ub->canceling and no concurrent writes to ubq->canceling.
2382 * - there are no concurrent reads of ubq->canceling from the queue_rq
2383 * path. This can be done by quiescing the queue, or through other
2384 * means.
2385 */
ublk_set_canceling(struct ublk_device * ub,bool canceling)2386 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2387 __must_hold(&ub->cancel_mutex)
2388 {
2389 int i;
2390
2391 ub->canceling = canceling;
2392 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2393 ublk_get_queue(ub, i)->canceling = canceling;
2394 }
2395
ublk_check_and_reset_active_ref(struct ublk_device * ub)2396 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2397 {
2398 int i, j;
2399
2400 if (!ublk_dev_need_req_ref(ub))
2401 return false;
2402
2403 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2404 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2405
2406 for (j = 0; j < ubq->q_depth; j++) {
2407 struct ublk_io *io = &ubq->ios[j];
2408 unsigned int refs = refcount_read(&io->ref) +
2409 io->task_registered_buffers;
2410
2411 /*
2412 * UBLK_REFCOUNT_INIT or zero means no active
2413 * reference
2414 */
2415 if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2416 return true;
2417
2418 /* reset to zero if the io hasn't active references */
2419 refcount_set(&io->ref, 0);
2420 io->task_registered_buffers = 0;
2421 }
2422 }
2423 return false;
2424 }
2425
/*
 * Delayed work scheduled from ublk_ch_release(): finish the release of the
 * ublk char device.
 *
 * The work re-arms itself while any io still has active references.  Once
 * none are left, outstanding requests are aborted, the device is moved to
 * its "no server" state (stopped, quiesced or failing io, depending on the
 * recovery flags), the per-open state is reset, UB_STATE_OPEN is cleared and
 * the device reference taken in ublk_ch_release() is dropped.
 */
static void ublk_ch_release_work_fn(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, exit_work.work);
	struct gendisk *disk;
	int i;

	/*
	 * For zero-copy and auto buffer register modes, I/O references
	 * might not be dropped naturally when the daemon is killed, but
	 * io_uring guarantees that registered bvec kernel buffers are
	 * unregistered finally when freeing io_uring context, then the
	 * active references are dropped.
	 *
	 * Wait until active references are dropped for avoiding use-after-free
	 *
	 * registered buffer may be unregistered in io_uring's release handler,
	 * so have to wait by scheduling work function for avoiding the two
	 * file release dependency.
	 */
	if (ublk_check_and_reset_active_ref(ub)) {
		schedule_delayed_work(&ub->exit_work, 1);
		return;
	}

	/*
	 * disk isn't attached yet, either device isn't live, or it has
	 * been removed already, so we needn't to do anything
	 */
	disk = ublk_get_disk(ub);
	if (!disk)
		goto out;

	/*
	 * All uring_cmd are done now, so abort any request outstanding to
	 * the ublk server
	 *
	 * This can be done in lockless way because ublk server has been
	 * gone
	 *
	 * More importantly, we have to provide forward progress guarantee
	 * without holding ub->mutex, otherwise control task grabbing
	 * ub->mutex triggers deadlock
	 *
	 * All requests may be inflight, so ->canceling may not be set, set
	 * it now.
	 */
	mutex_lock(&ub->cancel_mutex);
	ublk_set_canceling(ub, true);
	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_abort_queue(ub, ublk_get_queue(ub, i));
	mutex_unlock(&ub->cancel_mutex);
	blk_mq_kick_requeue_list(disk->queue);

	/*
	 * All inflight requests have been completed or requeued and any new
	 * request will be failed or requeued via `->canceling` now, so it is
	 * fine to grab ub->mutex now.
	 */
	mutex_lock(&ub->mutex);

	/* double check after grabbing lock */
	if (!ub->ub_disk)
		goto unlock;

	/*
	 * Transition the device to the nosrv state. What exactly this
	 * means depends on the recovery flags
	 */
	if (ublk_nosrv_should_stop_dev(ub)) {
		/*
		 * Allow any pending/future I/O to pass through quickly
		 * with an error. This is needed because del_gendisk
		 * waits for all pending I/O to complete
		 */
		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);

		ublk_stop_dev_unlocked(ub);
	} else {
		if (ublk_nosrv_dev_should_queue_io(ub)) {
			/* ->canceling is set and all requests are aborted */
			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
		} else {
			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
		}
	}
unlock:
	mutex_unlock(&ub->mutex);
	ublk_put_disk(disk);

	/* all uring_cmd has been done now, reset device & ubq */
	ublk_reset_ch_dev(ub);
out:
	/* allow the char device to be opened again */
	clear_bit(UB_STATE_OPEN, &ub->state);

	/* put the reference grabbed in ublk_ch_release() */
	ublk_put_device(ub);
}
2527
/*
 * Release handler of the ublk char device.  The actual teardown is deferred
 * to ublk_ch_release_work_fn(), which is scheduled here as delayed work with
 * no delay.  Always returns 0.
 */
static int ublk_ch_release(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = filp->private_data;

	/*
	 * Grab ublk device reference, so it won't be gone until we are
	 * really released from work function.
	 */
	ublk_get_device(ub);

	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
	schedule_delayed_work(&ub->exit_work, 0);
	return 0;
}
2542
2543 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
ublk_ch_mmap(struct file * filp,struct vm_area_struct * vma)2544 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2545 {
2546 struct ublk_device *ub = filp->private_data;
2547 size_t sz = vma->vm_end - vma->vm_start;
2548 unsigned max_sz = ublk_max_cmd_buf_size();
2549 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2550 int q_id, ret = 0;
2551
2552 spin_lock(&ub->lock);
2553 if (!ub->mm)
2554 ub->mm = current->mm;
2555 if (current->mm != ub->mm)
2556 ret = -EINVAL;
2557 spin_unlock(&ub->lock);
2558
2559 if (ret)
2560 return ret;
2561
2562 if (vma->vm_flags & VM_WRITE)
2563 return -EPERM;
2564
2565 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2566 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2567 return -EINVAL;
2568
2569 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2570 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2571 __func__, q_id, current->pid, vma->vm_start,
2572 phys_off, (unsigned long)sz);
2573
2574 if (sz != ublk_queue_cmd_buf_size(ub))
2575 return -EINVAL;
2576
2577 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2578 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2579 }
2580
__ublk_fail_req(struct ublk_device * ub,struct ublk_io * io,struct request * req)2581 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2582 struct request *req)
2583 {
2584 WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2585 io->flags & UBLK_IO_FLAG_ACTIVE);
2586
2587 if (ublk_nosrv_should_reissue_outstanding(ub))
2588 blk_mq_requeue_request(req, false);
2589 else {
2590 io->res = -EIO;
2591 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2592 }
2593 }
2594
2595 /*
2596 * Request tag may just be filled to event kfifo, not get chance to
2597 * dispatch, abort these requests too
2598 */
ublk_abort_batch_queue(struct ublk_device * ub,struct ublk_queue * ubq)2599 static void ublk_abort_batch_queue(struct ublk_device *ub,
2600 struct ublk_queue *ubq)
2601 {
2602 unsigned short tag;
2603
2604 while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2605 struct request *req = blk_mq_tag_to_rq(
2606 ub->tag_set.tags[ubq->q_id], tag);
2607
2608 if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2609 __ublk_fail_req(ub, &ubq->ios[tag], req);
2610 }
2611 }
2612
2613 /*
2614 * Called from ublk char device release handler, when any uring_cmd is
2615 * done, meantime request queue is "quiesced" since all inflight requests
2616 * can't be completed because ublk server is dead.
2617 *
2618 * So no one can hold our request IO reference any more, simply ignore the
2619 * reference, and complete the request immediately
2620 */
ublk_abort_queue(struct ublk_device * ub,struct ublk_queue * ubq)2621 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2622 {
2623 int i;
2624
2625 for (i = 0; i < ubq->q_depth; i++) {
2626 struct ublk_io *io = &ubq->ios[i];
2627
2628 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2629 __ublk_fail_req(ub, io, io->req);
2630 }
2631
2632 if (ublk_support_batch_io(ubq))
2633 ublk_abort_batch_queue(ub, ubq);
2634 }
2635
ublk_start_cancel(struct ublk_device * ub)2636 static void ublk_start_cancel(struct ublk_device *ub)
2637 {
2638 struct gendisk *disk = ublk_get_disk(ub);
2639
2640 /* Our disk has been dead */
2641 if (!disk)
2642 return;
2643
2644 mutex_lock(&ub->cancel_mutex);
2645 if (ub->canceling)
2646 goto out;
2647 /*
2648 * Now we are serialized with ublk_queue_rq()
2649 *
2650 * Make sure that ubq->canceling is set when queue is frozen,
2651 * because ublk_queue_rq() has to rely on this flag for avoiding to
2652 * touch completed uring_cmd
2653 */
2654 blk_mq_quiesce_queue(disk->queue);
2655 ublk_set_canceling(ub, true);
2656 blk_mq_unquiesce_queue(disk->queue);
2657 out:
2658 mutex_unlock(&ub->cancel_mutex);
2659 ublk_put_disk(disk);
2660 }
2661
/*
 * Cancel the uring_cmd of io slot @tag on @ubq by completing it with
 * UBLK_IO_RES_ABORT.
 *
 * Nothing is done when the io is not ACTIVE, when the request of this tag
 * has been started, or when the io has already been marked CANCELED.  The
 * CANCELED flag is tested and set under ubq->cancel_lock so that the
 * command is completed at most once.
 */
static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
		unsigned int issue_flags)
{
	struct ublk_io *io = &ubq->ios[tag];
	struct ublk_device *ub = ubq->dev;
	struct request *req;
	bool done;

	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
		return;

	/*
	 * Don't try to cancel this command if the request is started for
	 * avoiding race between io_uring_cmd_done() and
	 * io_uring_cmd_complete_in_task().
	 *
	 * Either the started request will be aborted via __ublk_abort_rq(),
	 * then this uring_cmd is canceled next time, or it will be done in
	 * task work function ublk_dispatch_req() because io_uring guarantees
	 * that ublk_dispatch_req() is always called
	 */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
	if (req && blk_mq_request_started(req) && req->tag == tag)
		return;

	/* claim the cancellation; only the first caller gets done == false */
	spin_lock(&ubq->cancel_lock);
	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
	if (!done)
		io->flags |= UBLK_IO_FLAG_CANCELED;
	spin_unlock(&ubq->cancel_lock);

	if (!done)
		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
}
2696
/*
 * Cancel a batch fetch command if it hasn't been claimed by another path.
 *
 * An fcmd can only be cancelled if:
 * 1. It's not the active_fcmd (which is currently being processed)
 * 2. It's still on the list (!list_empty check) - once removed from the list,
 *    the fcmd is considered claimed and will be freed by whoever removed it
 *
 * Use list_del_init() so subsequent list_empty() checks work correctly.
 *
 * When the command is cancelled here, it is completed with
 * UBLK_IO_RES_ABORT and then freed.
 */
static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
				  struct ublk_batch_fetch_cmd *fcmd,
				  unsigned int issue_flags)
{
	bool done;

	/* decide and unlink under evts_lock so that only one path claims it */
	spin_lock(&ubq->evts_lock);
	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
	if (done)
		list_del_init(&fcmd->node);
	spin_unlock(&ubq->evts_lock);

	if (done) {
		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
		ublk_batch_free_fcmd(fcmd);
	}
}
2724
/*
 * Cancel the batch fetch commands of @ubq.
 *
 * Under evts_lock the queue is marked force_abort and all queued fetch
 * commands are moved to a private list; the active fetch command, if any,
 * is put back on ubq->fcmd_head.  Each command left on the private list is
 * then passed to ublk_batch_cancel_cmd().
 */
static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
{
	struct ublk_batch_fetch_cmd *fcmd;
	LIST_HEAD(fcmd_list);

	spin_lock(&ubq->evts_lock);
	ubq->force_abort = true;
	list_splice_init(&ubq->fcmd_head, &fcmd_list);
	fcmd = READ_ONCE(ubq->active_fcmd);
	/* keep the active command on the queue's own list */
	if (fcmd)
		list_move(&fcmd->node, &ubq->fcmd_head);
	spin_unlock(&ubq->evts_lock);

	/* ublk_batch_cancel_cmd() unlinks the entry it cancels */
	while (!list_empty(&fcmd_list)) {
		fcmd = list_first_entry(&fcmd_list,
				struct ublk_batch_fetch_cmd, node);
		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
	}
}
2744
ublk_batch_cancel_fn(struct io_uring_cmd * cmd,unsigned int issue_flags)2745 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2746 unsigned int issue_flags)
2747 {
2748 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2749 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2750 struct ublk_queue *ubq = pdu->ubq;
2751
2752 ublk_start_cancel(ubq->dev);
2753
2754 ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2755 }
2756
2757 /*
2758 * The ublk char device won't be closed when calling cancel fn, so both
2759 * ublk device and queue are guaranteed to be live
2760 *
2761 * Two-stage cancel:
2762 *
2763 * - make every active uring_cmd done in ->cancel_fn()
2764 *
2765 * - aborting inflight ublk IO requests in ublk char device release handler,
2766 * which depends on 1st stage because device can only be closed iff all
2767 * uring_cmd are done
2768 *
2769 * Do _not_ try to acquire ub->mutex before all inflight requests are
2770 * aborted, otherwise deadlock may be caused.
2771 */
ublk_uring_cmd_cancel_fn(struct io_uring_cmd * cmd,unsigned int issue_flags)2772 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2773 unsigned int issue_flags)
2774 {
2775 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2776 struct ublk_queue *ubq = pdu->ubq;
2777 struct task_struct *task;
2778 struct ublk_io *io;
2779
2780 if (WARN_ON_ONCE(!ubq))
2781 return;
2782
2783 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2784 return;
2785
2786 task = io_uring_cmd_get_task(cmd);
2787 io = &ubq->ios[pdu->tag];
2788 if (WARN_ON_ONCE(task && task != io->task))
2789 return;
2790
2791 ublk_start_cancel(ubq->dev);
2792
2793 WARN_ON_ONCE(io->cmd != cmd);
2794 ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2795 }
2796
ublk_queue_ready(const struct ublk_queue * ubq)2797 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2798 {
2799 return ubq->nr_io_ready == ubq->q_depth;
2800 }
2801
ublk_dev_ready(const struct ublk_device * ub)2802 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2803 {
2804 return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2805 }
2806
ublk_cancel_queue(struct ublk_queue * ubq)2807 static void ublk_cancel_queue(struct ublk_queue *ubq)
2808 {
2809 int i;
2810
2811 if (ublk_support_batch_io(ubq)) {
2812 ublk_batch_cancel_queue(ubq);
2813 return;
2814 }
2815
2816 for (i = 0; i < ubq->q_depth; i++)
2817 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2818 }
2819
2820 /* Cancel all pending commands, must be called after del_gendisk() returns */
ublk_cancel_dev(struct ublk_device * ub)2821 static void ublk_cancel_dev(struct ublk_device *ub)
2822 {
2823 int i;
2824
2825 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2826 ublk_cancel_queue(ublk_get_queue(ub, i));
2827 }
2828
ublk_check_inflight_rq(struct request * rq,void * data)2829 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2830 {
2831 bool *idle = data;
2832
2833 if (blk_mq_request_started(rq)) {
2834 *idle = false;
2835 return false;
2836 }
2837 return true;
2838 }
2839
ublk_wait_tagset_rqs_idle(struct ublk_device * ub)2840 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2841 {
2842 bool idle;
2843
2844 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2845 while (true) {
2846 idle = true;
2847 blk_mq_tagset_busy_iter(&ub->tag_set,
2848 ublk_check_inflight_rq, &idle);
2849 if (idle)
2850 break;
2851 msleep(UBLK_REQUEUE_DELAY_MS);
2852 }
2853 }
2854
ublk_force_abort_dev(struct ublk_device * ub)2855 static void ublk_force_abort_dev(struct ublk_device *ub)
2856 {
2857 int i;
2858
2859 pr_devel("%s: force abort ub: dev_id %d state %s\n",
2860 __func__, ub->dev_info.dev_id,
2861 ub->dev_info.state == UBLK_S_DEV_LIVE ?
2862 "LIVE" : "QUIESCED");
2863 blk_mq_quiesce_queue(ub->ub_disk->queue);
2864 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2865 ublk_wait_tagset_rqs_idle(ub);
2866
2867 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2868 ublk_get_queue(ub, i)->force_abort = true;
2869 blk_mq_unquiesce_queue(ub->ub_disk->queue);
2870 /* We may have requeued some rqs in ublk_quiesce_queue() */
2871 blk_mq_kick_requeue_list(ub->ub_disk->queue);
2872 }
2873
ublk_detach_disk(struct ublk_device * ub)2874 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2875 {
2876 struct gendisk *disk;
2877
2878 /* Sync with ublk_abort_queue() by holding the lock */
2879 spin_lock(&ub->lock);
2880 disk = ub->ub_disk;
2881 ub->dev_info.state = UBLK_S_DEV_DEAD;
2882 ub->dev_info.ublksrv_pid = -1;
2883 ub->ub_disk = NULL;
2884 spin_unlock(&ub->lock);
2885
2886 return disk;
2887 }
2888
ublk_stop_dev_unlocked(struct ublk_device * ub)2889 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2890 __must_hold(&ub->mutex)
2891 {
2892 struct gendisk *disk;
2893
2894 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2895 return;
2896
2897 if (ublk_nosrv_dev_should_queue_io(ub))
2898 ublk_force_abort_dev(ub);
2899 del_gendisk(ub->ub_disk);
2900 disk = ublk_detach_disk(ub);
2901 put_disk(disk);
2902 }
2903
/*
 * Stop the device: take ub->mutex around ublk_stop_dev_unlocked(), then wait
 * for any pending partition scan work and cancel all pending commands.
 */
static void ublk_stop_dev(struct ublk_device *ub)
{
	mutex_lock(&ub->mutex);
	ublk_stop_dev_unlocked(ub);
	mutex_unlock(&ub->mutex);
	/* both of the following run without ub->mutex held */
	cancel_work_sync(&ub->partition_scan_work);
	ublk_cancel_dev(ub);
}
2912
2913 /* reset per-queue io flags */
ublk_queue_reset_io_flags(struct ublk_queue * ubq)2914 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
2915 {
2916 int j;
2917
2918 /* UBLK_IO_FLAG_CANCELED can be cleared now */
2919 spin_lock(&ubq->cancel_lock);
2920 for (j = 0; j < ubq->q_depth; j++)
2921 ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2922 ubq->canceling = false;
2923 spin_unlock(&ubq->cancel_lock);
2924 ubq->fail_io = false;
2925 }
2926
/*
 * Account one more ready io on queue @q_id; the device can only be started
 * after all IOs are ready.
 *
 * When the queue becomes fully ready its io flags are reset and the ready
 * queue count is bumped.  When all queues are ready, the device-level
 * canceling flag is cleared and ub->completion is completed.
 *
 * The device is marked as having unprivileged daemons as soon as one caller
 * lacks CAP_SYS_ADMIN.
 */
static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
	__must_hold(&ub->mutex)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	/* the capability check is skipped once the marker is already set */
	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
		ub->unprivileged_daemons = true;

	ubq->nr_io_ready++;

	/* Check if this specific queue is now fully ready */
	if (ublk_queue_ready(ubq)) {
		ub->nr_queue_ready++;

		/*
		 * Reset queue flags as soon as this queue is ready.
		 * This clears the canceling flag, allowing batch FETCH commands
		 * to succeed during recovery without waiting for all queues.
		 */
		ublk_queue_reset_io_flags(ubq);
	}

	/* Check if all queues are ready */
	if (ublk_dev_ready(ub)) {
		/*
		 * All queues ready - clear device-level canceling flag
		 * and complete the recovery/initialization.
		 */
		mutex_lock(&ub->cancel_mutex);
		ub->canceling = false;
		mutex_unlock(&ub->cancel_mutex);
		complete_all(&ub->completion);
	}
}
2962
/*
 * Validate the ioctl type encoded in @cmd_op.
 *
 * Type 'u' is always accepted. Type 0 is accepted only when
 * CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is enabled. Anything else is refused.
 *
 * Return: 0 if the opcode type is acceptable, -EOPNOTSUPP otherwise.
 */
static inline int ublk_check_cmd_op(u32 cmd_op)
{
	const u32 type = _IOC_TYPE(cmd_op);

	if (type == 'u')
		return 0;

	if (type == 0 && IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES))
		return 0;

	return -EOPNOTSUPP;
}
2975
ublk_set_auto_buf_reg(struct ublk_io * io,struct io_uring_cmd * cmd)2976 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2977 {
2978 struct ublk_auto_buf_reg buf;
2979
2980 buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2981
2982 if (buf.reserved0 || buf.reserved1)
2983 return -EINVAL;
2984
2985 if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2986 return -EINVAL;
2987 io->buf.auto_reg = buf;
2988 return 0;
2989 }
2990
/*
 * Drop UBLK_IO_FLAG_AUTO_BUF_REG from @io if it is set and, when @cmd
 * belongs to the io_ring_ctx recorded in io->buf_ctx_handle, report the
 * auto-registered buffer index through @buf_idx.
 *
 * @buf_idx is only written in that last case.
 */
static void ublk_clear_auto_buf_reg(struct ublk_io *io,
				    struct io_uring_cmd *cmd,
				    u16 *buf_idx)
{
	if (!(io->flags & UBLK_IO_FLAG_AUTO_BUF_REG))
		return;

	io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;

	/*
	 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` and
	 * `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
	 * `io_ring_ctx`.
	 *
	 * When this uring_cmd comes from a different io_ring_ctx than the
	 * one the buffer was registered with, unregistering the buffer is
	 * the ublk server's responsibility; otherwise this ublk request
	 * gets stuck.
	 */
	if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
		*buf_idx = io->buf.auto_reg.index;
}
3012
/*
 * Retire the previous auto buffer registration state of @io (see
 * ublk_clear_auto_buf_reg()), then load the new registration parameters
 * from @cmd's SQE.
 *
 * Return: the result of ublk_set_auto_buf_reg().
 */
static int ublk_handle_auto_buf_reg(struct ublk_io *io,
				    struct io_uring_cmd *cmd,
				    u16 *buf_idx)
{
	int ret;

	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
	ret = ublk_set_auto_buf_reg(io, cmd);

	return ret;
}
3020
3021 /* Once we return, `io->req` can't be used any more */
3022 static inline struct request *
ublk_fill_io_cmd(struct ublk_io * io,struct io_uring_cmd * cmd)3023 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3024 {
3025 struct request *req = io->req;
3026
3027 io->cmd = cmd;
3028 io->flags |= UBLK_IO_FLAG_ACTIVE;
3029 /* now this cmd slot is owned by ublk driver */
3030 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3031
3032 return req;
3033 }
3034
3035 static inline int
ublk_config_io_buf(const struct ublk_device * ub,struct ublk_io * io,struct io_uring_cmd * cmd,unsigned long buf_addr,u16 * buf_idx)3036 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3037 struct io_uring_cmd *cmd, unsigned long buf_addr,
3038 u16 *buf_idx)
3039 {
3040 if (ublk_dev_support_auto_buf_reg(ub))
3041 return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3042
3043 io->buf.addr = buf_addr;
3044 return 0;
3045 }
3046
ublk_prep_cancel(struct io_uring_cmd * cmd,unsigned int issue_flags,struct ublk_queue * ubq,unsigned int tag)3047 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3048 unsigned int issue_flags,
3049 struct ublk_queue *ubq, unsigned int tag)
3050 {
3051 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3052
3053 /*
3054 * Safe to refer to @ubq since ublk_queue won't be died until its
3055 * commands are completed
3056 */
3057 pdu->ubq = ubq;
3058 pdu->tag = tag;
3059 io_uring_cmd_mark_cancelable(cmd, issue_flags);
3060 }
3061
ublk_io_release(void * priv)3062 static void ublk_io_release(void *priv)
3063 {
3064 struct request *rq = priv;
3065 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3066 struct ublk_io *io = &ubq->ios[rq->tag];
3067
3068 /*
3069 * task_registered_buffers may be 0 if buffers were registered off task
3070 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3071 */
3072 if (current == io->task && io->task_registered_buffers)
3073 io->task_registered_buffers--;
3074 else
3075 ublk_put_req_ref(io, rq);
3076 }
3077
/*
 * Register the request identified by (@q_id, @tag) as io_uring buffer
 * @index, taking a request reference through __ublk_check_and_get_req().
 *
 * Return: 0 on success, -EINVAL if the device lacks zero copy support or
 * no suitable request could be obtained, otherwise the error returned by
 * io_buffer_register_bvec() (the request reference is dropped again).
 */
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
				struct ublk_device *ub,
				u16 q_id, u16 tag,
				struct ublk_io *io,
				unsigned int index, unsigned int issue_flags)
{
	struct request *rq;
	int err;

	if (!ublk_dev_support_zero_copy(ub))
		return -EINVAL;

	rq = __ublk_check_and_get_req(ub, q_id, tag, io);
	if (!rq)
		return -EINVAL;

	err = io_buffer_register_bvec(cmd, rq, ublk_io_release, index,
				      issue_flags);
	/* registration failed: give back the reference taken above */
	if (err)
		ublk_put_req_ref(io, rq);

	return err;
}
3103
3104 static int
ublk_daemon_register_io_buf(struct io_uring_cmd * cmd,struct ublk_device * ub,u16 q_id,u16 tag,struct ublk_io * io,unsigned index,unsigned issue_flags)3105 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3106 struct ublk_device *ub,
3107 u16 q_id, u16 tag, struct ublk_io *io,
3108 unsigned index, unsigned issue_flags)
3109 {
3110 unsigned new_registered_buffers;
3111 struct request *req = io->req;
3112 int ret;
3113
3114 /*
3115 * Ensure there are still references for ublk_sub_req_ref() to release.
3116 * If not, fall back on the thread-safe buffer registration.
3117 */
3118 new_registered_buffers = io->task_registered_buffers + 1;
3119 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3120 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3121 issue_flags);
3122
3123 if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3124 return -EINVAL;
3125
3126 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3127 issue_flags);
3128 if (ret)
3129 return ret;
3130
3131 io->task_registered_buffers = new_registered_buffers;
3132 return 0;
3133 }
3134
ublk_unregister_io_buf(struct io_uring_cmd * cmd,const struct ublk_device * ub,unsigned int index,unsigned int issue_flags)3135 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3136 const struct ublk_device *ub,
3137 unsigned int index, unsigned int issue_flags)
3138 {
3139 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3140 return -EINVAL;
3141
3142 return io_buffer_unregister_bvec(cmd, index, issue_flags);
3143 }
3144
/*
 * Validate the buffer address supplied with a fetch command.
 *
 * Return: 0 if @buf_addr is acceptable for this device, -EINVAL otherwise.
 */
static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
{
	/* User copy requires addr to be unset */
	if (!ublk_dev_need_map_io(ub))
		return buf_addr ? -EINVAL : 0;

	if (buf_addr)
		return 0;

	/*
	 * FETCH_RQ has to provide an IO buffer unless NEED GET DATA is
	 * enabled.
	 */
	return ublk_dev_need_get_data(ub) ? 0 : -EINVAL;
}
3160
/*
 * Bind @cmd to @io as its fetch command and set io->task.
 *
 * io->task is set to NULL for batch-io devices; otherwise it is set to the
 * current task with a task reference taken. @q_id is not used here.
 *
 * Return: 0 on success, -EBUSY if the device is already ready, -EINVAL if
 * the io slot is already active.
 */
static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
			struct ublk_io *io, u16 q_id)
{
	struct task_struct *owner = NULL;

	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
	if (ublk_dev_ready(ub))
		return -EBUSY;

	/* allow each command to be FETCHed at most once */
	if (io->flags & UBLK_IO_FLAG_ACTIVE)
		return -EINVAL;

	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);

	ublk_fill_io_cmd(io, cmd);

	if (!ublk_dev_support_batch_io(ub))
		owner = get_task_struct(current);
	WRITE_ONCE(io->task, owner);

	return 0;
}
3183
/*
 * Handle UBLK_IO_FETCH_REQ for one io slot: bind the command, configure
 * its buffer and mark the io ready, all under ub->mutex.
 *
 * Return: 0 on success or the first error from __ublk_fetch() /
 * ublk_config_io_buf().
 */
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
{
	int err;

	/*
	 * When handling FETCH command for setting up ublk uring queue,
	 * ub->mutex is the innermost lock, and we won't block for handling
	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
	 */
	mutex_lock(&ub->mutex);

	err = __ublk_fetch(cmd, ub, io, q_id);
	if (err)
		goto unlock;

	err = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
	if (err)
		goto unlock;

	ublk_mark_io_ready(ub, q_id);
unlock:
	mutex_unlock(&ub->mutex);
	return err;
}
3203
/*
 * Validate the buffer address supplied with COMMIT_AND_FETCH_REQ against
 * the device mode and the operation of the request held in io->req.
 *
 * Return: 0 if @buf_addr is acceptable, -EINVAL otherwise.
 */
static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
				       struct ublk_io *io, __u64 buf_addr)
{
	struct request *rq = io->req;

	if (!ublk_dev_need_map_io(ub)) {
		/*
		 * User copy requires addr to be unset when command is
		 * not zone append
		 */
		if (buf_addr && req_op(rq) != REQ_OP_ZONE_APPEND)
			return -EINVAL;
		return 0;
	}

	if (buf_addr)
		return 0;

	/*
	 * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA
	 * is not enabled or it is Read IO.
	 */
	if (!ublk_dev_need_get_data(ub) || req_op(rq) == REQ_OP_READ)
		return -EINVAL;

	return 0;
}
3227
ublk_need_complete_req(const struct ublk_device * ub,struct ublk_io * io)3228 static bool ublk_need_complete_req(const struct ublk_device *ub,
3229 struct ublk_io *io)
3230 {
3231 if (ublk_dev_need_req_ref(ub))
3232 return ublk_sub_req_ref(io);
3233 return true;
3234 }
3235
ublk_get_data(const struct ublk_queue * ubq,struct ublk_io * io,struct request * req)3236 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3237 struct request *req)
3238 {
3239 /*
3240 * We have handled UBLK_IO_NEED_GET_DATA command,
3241 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3242 * do the copy work.
3243 */
3244 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3245 /* update iod->addr because ublksrv may have passed a new io buffer */
3246 ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3247 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3248 __func__, ubq->q_id, req->tag, io->flags,
3249 ublk_get_iod(ubq, req->tag)->addr);
3250
3251 return ublk_start_io(ubq, req, io);
3252 }
3253
ublk_ch_uring_cmd_local(struct io_uring_cmd * cmd,unsigned int issue_flags)3254 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3255 unsigned int issue_flags)
3256 {
3257 /* May point to userspace-mapped memory */
3258 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3259 struct ublksrv_io_cmd);
3260 u16 buf_idx = UBLK_INVALID_BUF_IDX;
3261 struct ublk_device *ub = cmd->file->private_data;
3262 struct ublk_queue *ubq;
3263 struct ublk_io *io = NULL;
3264 u32 cmd_op = cmd->cmd_op;
3265 u16 q_id = READ_ONCE(ub_src->q_id);
3266 u16 tag = READ_ONCE(ub_src->tag);
3267 s32 result = READ_ONCE(ub_src->result);
3268 u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3269 struct request *req;
3270 int ret;
3271 bool compl;
3272
3273 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3274
3275 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3276 __func__, cmd->cmd_op, q_id, tag, result);
3277
3278 ret = ublk_check_cmd_op(cmd_op);
3279 if (ret)
3280 goto out;
3281
3282 /*
3283 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3284 * so no need to validate the q_id, tag, or task
3285 */
3286 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3287 return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3288
3289 ret = -EINVAL;
3290 if (q_id >= ub->dev_info.nr_hw_queues)
3291 goto out;
3292
3293 ubq = ublk_get_queue(ub, q_id);
3294
3295 if (tag >= ub->dev_info.queue_depth)
3296 goto out;
3297
3298 io = &ubq->ios[tag];
3299 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3300 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3301 ret = ublk_check_fetch_buf(ub, addr);
3302 if (ret)
3303 goto out;
3304 ret = ublk_fetch(cmd, ub, io, addr, q_id);
3305 if (ret)
3306 goto out;
3307
3308 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3309 return -EIOCBQUEUED;
3310 }
3311
3312 if (READ_ONCE(io->task) != current) {
3313 /*
3314 * ublk_register_io_buf() accesses only the io's refcount,
3315 * so can be handled on any task
3316 */
3317 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3318 return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3319 addr, issue_flags);
3320
3321 goto out;
3322 }
3323
3324 /* there is pending io cmd, something must be wrong */
3325 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3326 ret = -EBUSY;
3327 goto out;
3328 }
3329
3330 /*
3331 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3332 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
3333 */
3334 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3335 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3336 goto out;
3337
3338 switch (_IOC_NR(cmd_op)) {
3339 case UBLK_IO_REGISTER_IO_BUF:
3340 return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3341 issue_flags);
3342 case UBLK_IO_COMMIT_AND_FETCH_REQ:
3343 ret = ublk_check_commit_and_fetch(ub, io, addr);
3344 if (ret)
3345 goto out;
3346 io->res = result;
3347 req = ublk_fill_io_cmd(io, cmd);
3348 ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3349 if (buf_idx != UBLK_INVALID_BUF_IDX)
3350 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3351 compl = ublk_need_complete_req(ub, io);
3352
3353 if (req_op(req) == REQ_OP_ZONE_APPEND)
3354 req->__sector = addr;
3355 if (compl)
3356 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3357
3358 if (ret)
3359 goto out;
3360 break;
3361 case UBLK_IO_NEED_GET_DATA:
3362 /*
3363 * ublk_get_data() may fail and fallback to requeue, so keep
3364 * uring_cmd active first and prepare for handling new requeued
3365 * request
3366 */
3367 req = ublk_fill_io_cmd(io, cmd);
3368 ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3369 WARN_ON_ONCE(ret);
3370 if (likely(ublk_get_data(ubq, io, req))) {
3371 __ublk_prep_compl_io_cmd(io, req);
3372 return UBLK_IO_RES_OK;
3373 }
3374 break;
3375 default:
3376 goto out;
3377 }
3378 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3379 return -EIOCBQUEUED;
3380
3381 out:
3382 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3383 __func__, cmd_op, tag, ret, io ? io->flags : 0);
3384 return ret;
3385 }
3386
/*
 * Look up the request for (@q_id, @tag) and take a reference on it.
 *
 * Return: the request, with a reference held, if it has been started,
 * still carries @tag and has data; NULL otherwise (no reference is kept).
 */
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		u16 q_id, u16 tag, struct ublk_io *io)
{
	/*
	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
	 * which would overwrite it with io->cmd
	 */
	struct request *rq = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);

	if (!rq || !ublk_get_req_ref(io))
		return NULL;

	if (likely(blk_mq_request_started(rq) && rq->tag == tag) &&
	    ublk_rq_has_data(rq))
		return rq;

	/* not usable: drop the reference taken above */
	ublk_put_req_ref(io, rq);
	return NULL;
}
3414
/*
 * Task-work callback: run the uring_cmd through ublk_ch_uring_cmd_local()
 * with IO_URING_CMD_TASK_WORK_ISSUE_FLAGS and complete it unless the
 * handler queued it.
 */
static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	int res;

	res = ublk_ch_uring_cmd_local(cmd, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
	if (res == -EIOCBQUEUED)
		return;

	io_uring_cmd_done(cmd, res, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
}
3424
ublk_ch_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)3425 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3426 {
3427 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3428 ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3429 return 0;
3430 }
3431
3432 /* well-implemented server won't run into unlocked */
3433 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3434 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3435 return -EIOCBQUEUED;
3436 }
3437
3438 return ublk_ch_uring_cmd_local(cmd, issue_flags);
3439 }
3440
ublk_batch_buf_addr(const struct ublk_batch_io * uc,const struct ublk_elem_header * elem)3441 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3442 const struct ublk_elem_header *elem)
3443 {
3444 const void *buf = elem;
3445
3446 if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3447 return *(const __u64 *)(buf + sizeof(*elem));
3448 return 0;
3449 }
3450
ublk_batch_zone_lba(const struct ublk_batch_io * uc,const struct ublk_elem_header * elem)3451 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3452 const struct ublk_elem_header *elem)
3453 {
3454 const void *buf = elem;
3455
3456 if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3457 return *(const __u64 *)(buf + sizeof(*elem) +
3458 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3459 return -1;
3460 }
3461
3462 static struct ublk_auto_buf_reg
ublk_batch_auto_buf_reg(const struct ublk_batch_io * uc,const struct ublk_elem_header * elem)3463 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3464 const struct ublk_elem_header *elem)
3465 {
3466 struct ublk_auto_buf_reg reg = {
3467 .index = elem->buf_index,
3468 .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3469 UBLK_AUTO_BUF_REG_FALLBACK : 0,
3470 };
3471
3472 return reg;
3473 }
3474
3475 /*
3476 * 48 can hold any type of buffer element(8, 16 and 24 bytes) because
3477 * it is the least common multiple(LCM) of 8, 16 and 24
3478 */
3479 #define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10)
3480 struct ublk_batch_io_iter {
3481 void __user *uaddr;
3482 unsigned done, total;
3483 unsigned char elem_bytes;
3484 /* copy to this buffer from user space */
3485 unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3486 };
3487
3488 static inline int
__ublk_walk_cmd_buf(struct ublk_queue * ubq,struct ublk_batch_io_iter * iter,const struct ublk_batch_io_data * data,unsigned bytes,int (* cb)(struct ublk_queue * q,const struct ublk_batch_io_data * data,const struct ublk_elem_header * elem))3489 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3490 struct ublk_batch_io_iter *iter,
3491 const struct ublk_batch_io_data *data,
3492 unsigned bytes,
3493 int (*cb)(struct ublk_queue *q,
3494 const struct ublk_batch_io_data *data,
3495 const struct ublk_elem_header *elem))
3496 {
3497 unsigned int i;
3498 int ret = 0;
3499
3500 for (i = 0; i < bytes; i += iter->elem_bytes) {
3501 const struct ublk_elem_header *elem =
3502 (const struct ublk_elem_header *)&iter->buf[i];
3503
3504 if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3505 ret = -EINVAL;
3506 break;
3507 }
3508
3509 ret = cb(ubq, data, elem);
3510 if (unlikely(ret))
3511 break;
3512 }
3513
3514 iter->done += i;
3515 return ret;
3516 }
3517
ublk_walk_cmd_buf(struct ublk_batch_io_iter * iter,const struct ublk_batch_io_data * data,int (* cb)(struct ublk_queue * q,const struct ublk_batch_io_data * data,const struct ublk_elem_header * elem))3518 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3519 const struct ublk_batch_io_data *data,
3520 int (*cb)(struct ublk_queue *q,
3521 const struct ublk_batch_io_data *data,
3522 const struct ublk_elem_header *elem))
3523 {
3524 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3525 int ret = 0;
3526
3527 while (iter->done < iter->total) {
3528 unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3529
3530 if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3531 pr_warn("ublk%d: read batch cmd buffer failed\n",
3532 data->ub->dev_info.dev_id);
3533 return -EFAULT;
3534 }
3535
3536 ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3537 if (ret)
3538 return ret;
3539 }
3540 return 0;
3541 }
3542
ublk_batch_unprep_io(struct ublk_queue * ubq,const struct ublk_batch_io_data * data,const struct ublk_elem_header * elem)3543 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3544 const struct ublk_batch_io_data *data,
3545 const struct ublk_elem_header *elem)
3546 {
3547 struct ublk_io *io = &ubq->ios[elem->tag];
3548
3549 /*
3550 * If queue was ready before this decrement, it won't be anymore,
3551 * so we need to decrement the queue ready count and restore the
3552 * canceling flag to prevent new requests from being queued.
3553 */
3554 if (ublk_queue_ready(ubq)) {
3555 data->ub->nr_queue_ready--;
3556 spin_lock(&ubq->cancel_lock);
3557 ubq->canceling = true;
3558 spin_unlock(&ubq->cancel_lock);
3559 }
3560 ubq->nr_io_ready--;
3561
3562 ublk_io_lock(io);
3563 io->flags = 0;
3564 ublk_io_unlock(io);
3565 return 0;
3566 }
3567
ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter * iter,const struct ublk_batch_io_data * data)3568 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3569 const struct ublk_batch_io_data *data)
3570 {
3571 int ret;
3572
3573 /* Re-process only what we've already processed, starting from beginning */
3574 iter->total = iter->done;
3575 iter->done = 0;
3576
3577 ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3578 WARN_ON_ONCE(ret);
3579 }
3580
ublk_batch_prep_io(struct ublk_queue * ubq,const struct ublk_batch_io_data * data,const struct ublk_elem_header * elem)3581 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3582 const struct ublk_batch_io_data *data,
3583 const struct ublk_elem_header *elem)
3584 {
3585 struct ublk_io *io = &ubq->ios[elem->tag];
3586 const struct ublk_batch_io *uc = &data->header;
3587 union ublk_io_buf buf = { 0 };
3588 int ret;
3589
3590 if (ublk_dev_support_auto_buf_reg(data->ub))
3591 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3592 else if (ublk_dev_need_map_io(data->ub)) {
3593 buf.addr = ublk_batch_buf_addr(uc, elem);
3594
3595 ret = ublk_check_fetch_buf(data->ub, buf.addr);
3596 if (ret)
3597 return ret;
3598 }
3599
3600 ublk_io_lock(io);
3601 ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3602 if (!ret)
3603 io->buf = buf;
3604 ublk_io_unlock(io);
3605
3606 if (!ret)
3607 ublk_mark_io_ready(data->ub, ubq->q_id);
3608
3609 return ret;
3610 }
3611
ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data * data)3612 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3613 {
3614 const struct ublk_batch_io *uc = &data->header;
3615 struct io_uring_cmd *cmd = data->cmd;
3616 struct ublk_batch_io_iter iter = {
3617 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3618 .total = uc->nr_elem * uc->elem_bytes,
3619 .elem_bytes = uc->elem_bytes,
3620 };
3621 int ret;
3622
3623 mutex_lock(&data->ub->mutex);
3624 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3625
3626 if (ret && iter.done)
3627 ublk_batch_revert_prep_cmd(&iter, data);
3628 mutex_unlock(&data->ub->mutex);
3629 return ret;
3630 }
3631
ublk_batch_commit_io_check(const struct ublk_queue * ubq,struct ublk_io * io,union ublk_io_buf * buf)3632 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3633 struct ublk_io *io,
3634 union ublk_io_buf *buf)
3635 {
3636 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3637 return -EBUSY;
3638
3639 /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3640 if (ublk_need_map_io(ubq) && !buf->addr)
3641 return -EINVAL;
3642 return 0;
3643 }
3644
ublk_batch_commit_io(struct ublk_queue * ubq,const struct ublk_batch_io_data * data,const struct ublk_elem_header * elem)3645 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3646 const struct ublk_batch_io_data *data,
3647 const struct ublk_elem_header *elem)
3648 {
3649 struct ublk_io *io = &ubq->ios[elem->tag];
3650 const struct ublk_batch_io *uc = &data->header;
3651 u16 buf_idx = UBLK_INVALID_BUF_IDX;
3652 union ublk_io_buf buf = { 0 };
3653 struct request *req = NULL;
3654 bool auto_reg = false;
3655 bool compl = false;
3656 int ret;
3657
3658 if (ublk_dev_support_auto_buf_reg(data->ub)) {
3659 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3660 auto_reg = true;
3661 } else if (ublk_dev_need_map_io(data->ub))
3662 buf.addr = ublk_batch_buf_addr(uc, elem);
3663
3664 ublk_io_lock(io);
3665 ret = ublk_batch_commit_io_check(ubq, io, &buf);
3666 if (!ret) {
3667 io->res = elem->result;
3668 io->buf = buf;
3669 req = ublk_fill_io_cmd(io, data->cmd);
3670
3671 if (auto_reg)
3672 ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3673 compl = ublk_need_complete_req(data->ub, io);
3674 }
3675 ublk_io_unlock(io);
3676
3677 if (unlikely(ret)) {
3678 pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3679 __func__, data->ub->dev_info.dev_id, ubq->q_id,
3680 elem->tag, ret);
3681 return ret;
3682 }
3683
3684 if (buf_idx != UBLK_INVALID_BUF_IDX)
3685 io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3686 if (req_op(req) == REQ_OP_ZONE_APPEND)
3687 req->__sector = ublk_batch_zone_lba(uc, elem);
3688 if (compl)
3689 __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3690 return 0;
3691 }
3692
ublk_handle_batch_commit_cmd(struct ublk_batch_io_data * data)3693 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3694 {
3695 const struct ublk_batch_io *uc = &data->header;
3696 struct io_uring_cmd *cmd = data->cmd;
3697 struct ublk_batch_io_iter iter = {
3698 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3699 .total = uc->nr_elem * uc->elem_bytes,
3700 .elem_bytes = uc->elem_bytes,
3701 };
3702 DEFINE_IO_COMP_BATCH(iob);
3703 int ret;
3704
3705 data->iob = &iob;
3706 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3707
3708 if (iob.complete)
3709 iob.complete(&iob);
3710
3711 return iter.done == 0 ? ret : iter.done;
3712 }
3713
ublk_check_batch_cmd_flags(const struct ublk_batch_io * uc)3714 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3715 {
3716 unsigned elem_bytes = sizeof(struct ublk_elem_header);
3717
3718 if (uc->flags & ~UBLK_BATCH_F_ALL)
3719 return -EINVAL;
3720
3721 /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
3722 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3723 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3724 return -EINVAL;
3725
3726 elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3727 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3728 if (uc->elem_bytes != elem_bytes)
3729 return -EINVAL;
3730 return 0;
3731 }
3732
ublk_check_batch_cmd(const struct ublk_batch_io_data * data)3733 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3734 {
3735 const struct ublk_batch_io *uc = &data->header;
3736
3737 if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3738 return -EINVAL;
3739
3740 if (uc->nr_elem > data->ub->dev_info.queue_depth)
3741 return -E2BIG;
3742
3743 if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3744 !ublk_dev_is_zoned(data->ub))
3745 return -EINVAL;
3746
3747 if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3748 !ublk_dev_need_map_io(data->ub))
3749 return -EINVAL;
3750
3751 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3752 !ublk_dev_support_auto_buf_reg(data->ub))
3753 return -EINVAL;
3754
3755 return ublk_check_batch_cmd_flags(uc);
3756 }
3757
ublk_batch_attach(struct ublk_queue * ubq,struct ublk_batch_io_data * data,struct ublk_batch_fetch_cmd * fcmd)3758 static int ublk_batch_attach(struct ublk_queue *ubq,
3759 struct ublk_batch_io_data *data,
3760 struct ublk_batch_fetch_cmd *fcmd)
3761 {
3762 struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3763 bool free = false;
3764 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3765
3766 spin_lock(&ubq->evts_lock);
3767 if (unlikely(ubq->force_abort || ubq->canceling)) {
3768 free = true;
3769 } else {
3770 list_add_tail(&fcmd->node, &ubq->fcmd_head);
3771 new_fcmd = __ublk_acquire_fcmd(ubq);
3772 }
3773 spin_unlock(&ubq->evts_lock);
3774
3775 if (unlikely(free)) {
3776 ublk_batch_free_fcmd(fcmd);
3777 return -ENODEV;
3778 }
3779
3780 pdu->ubq = ubq;
3781 pdu->fcmd = fcmd;
3782 io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3783
3784 if (!new_fcmd)
3785 goto out;
3786
3787 /*
3788 * If the two fetch commands are originated from same io_ring_ctx,
3789 * run batch dispatch directly. Otherwise, schedule task work for
3790 * doing it.
3791 */
3792 if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3793 io_uring_cmd_ctx_handle(fcmd->cmd)) {
3794 data->cmd = new_fcmd->cmd;
3795 ublk_batch_dispatch(ubq, data, new_fcmd);
3796 } else {
3797 io_uring_cmd_complete_in_task(new_fcmd->cmd,
3798 ublk_batch_tw_cb);
3799 }
3800 out:
3801 return -EIOCBQUEUED;
3802 }
3803
ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data * data)3804 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3805 {
3806 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3807 struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3808
3809 if (!fcmd)
3810 return -ENOMEM;
3811
3812 return ublk_batch_attach(ubq, data, fcmd);
3813 }
3814
ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data * data)3815 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3816 {
3817 const struct ublk_batch_io *uc = &data->header;
3818
3819 if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3820 return -EINVAL;
3821
3822 if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3823 return -EINVAL;
3824
3825 if (uc->elem_bytes != sizeof(__u16))
3826 return -EINVAL;
3827
3828 if (uc->flags != 0)
3829 return -EINVAL;
3830
3831 return 0;
3832 }
3833
ublk_handle_non_batch_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)3834 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3835 unsigned int issue_flags)
3836 {
3837 const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3838 struct ublksrv_io_cmd);
3839 struct ublk_device *ub = cmd->file->private_data;
3840 unsigned tag = READ_ONCE(ub_cmd->tag);
3841 unsigned q_id = READ_ONCE(ub_cmd->q_id);
3842 unsigned index = READ_ONCE(ub_cmd->addr);
3843 struct ublk_queue *ubq;
3844 struct ublk_io *io;
3845
3846 if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3847 return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3848
3849 if (q_id >= ub->dev_info.nr_hw_queues)
3850 return -EINVAL;
3851
3852 if (tag >= ub->dev_info.queue_depth)
3853 return -EINVAL;
3854
3855 if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3856 return -EOPNOTSUPP;
3857
3858 ubq = ublk_get_queue(ub, q_id);
3859 io = &ubq->ios[tag];
3860 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3861 issue_flags);
3862 }
3863
ublk_ch_batch_io_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)3864 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3865 unsigned int issue_flags)
3866 {
3867 const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3868 struct ublk_batch_io);
3869 struct ublk_device *ub = cmd->file->private_data;
3870 struct ublk_batch_io_data data = {
3871 .ub = ub,
3872 .cmd = cmd,
3873 .header = (struct ublk_batch_io) {
3874 .q_id = READ_ONCE(uc->q_id),
3875 .flags = READ_ONCE(uc->flags),
3876 .nr_elem = READ_ONCE(uc->nr_elem),
3877 .elem_bytes = READ_ONCE(uc->elem_bytes),
3878 },
3879 .issue_flags = issue_flags,
3880 };
3881 u32 cmd_op = cmd->cmd_op;
3882 int ret = -EINVAL;
3883
3884 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3885 ublk_batch_cancel_fn(cmd, issue_flags);
3886 return 0;
3887 }
3888
3889 switch (cmd_op) {
3890 case UBLK_U_IO_PREP_IO_CMDS:
3891 ret = ublk_check_batch_cmd(&data);
3892 if (ret)
3893 goto out;
3894 ret = ublk_handle_batch_prep_cmd(&data);
3895 break;
3896 case UBLK_U_IO_COMMIT_IO_CMDS:
3897 ret = ublk_check_batch_cmd(&data);
3898 if (ret)
3899 goto out;
3900 ret = ublk_handle_batch_commit_cmd(&data);
3901 break;
3902 case UBLK_U_IO_FETCH_IO_CMDS:
3903 ret = ublk_validate_batch_fetch_cmd(&data);
3904 if (ret)
3905 goto out;
3906 ret = ublk_handle_batch_fetch_cmd(&data);
3907 break;
3908 default:
3909 ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
3910 break;
3911 }
3912 out:
3913 return ret;
3914 }
3915
ublk_check_ubuf_dir(const struct request * req,int ubuf_dir)3916 static inline bool ublk_check_ubuf_dir(const struct request *req,
3917 int ubuf_dir)
3918 {
3919 /* copy ubuf to request pages */
3920 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
3921 ubuf_dir == ITER_SOURCE)
3922 return true;
3923
3924 /* copy request pages to ubuf */
3925 if ((req_op(req) == REQ_OP_WRITE ||
3926 req_op(req) == REQ_OP_ZONE_APPEND) &&
3927 ubuf_dir == ITER_DEST)
3928 return true;
3929
3930 return false;
3931 }
3932
3933 static ssize_t
ublk_user_copy(struct kiocb * iocb,struct iov_iter * iter,int dir)3934 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
3935 {
3936 struct ublk_device *ub = iocb->ki_filp->private_data;
3937 struct ublk_queue *ubq;
3938 struct request *req;
3939 struct ublk_io *io;
3940 unsigned data_len;
3941 bool is_integrity;
3942 bool on_daemon;
3943 size_t buf_off;
3944 u16 tag, q_id;
3945 ssize_t ret;
3946
3947 if (!user_backed_iter(iter))
3948 return -EACCES;
3949
3950 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
3951 return -EACCES;
3952
3953 tag = ublk_pos_to_tag(iocb->ki_pos);
3954 q_id = ublk_pos_to_hwq(iocb->ki_pos);
3955 buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
3956 is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
3957
3958 if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
3959 return -EINVAL;
3960
3961 if (q_id >= ub->dev_info.nr_hw_queues)
3962 return -EINVAL;
3963
3964 ubq = ublk_get_queue(ub, q_id);
3965 if (!ublk_dev_support_user_copy(ub))
3966 return -EACCES;
3967
3968 if (tag >= ub->dev_info.queue_depth)
3969 return -EINVAL;
3970
3971 io = &ubq->ios[tag];
3972 on_daemon = current == READ_ONCE(io->task);
3973 if (on_daemon) {
3974 /* On daemon, io can't be completed concurrently, so skip ref */
3975 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3976 return -EINVAL;
3977
3978 req = io->req;
3979 if (!ublk_rq_has_data(req))
3980 return -EINVAL;
3981 } else {
3982 req = __ublk_check_and_get_req(ub, q_id, tag, io);
3983 if (!req)
3984 return -EINVAL;
3985 }
3986
3987 if (is_integrity) {
3988 struct blk_integrity *bi = &req->q->limits.integrity;
3989
3990 data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
3991 } else {
3992 data_len = blk_rq_bytes(req);
3993 }
3994 if (buf_off > data_len) {
3995 ret = -EINVAL;
3996 goto out;
3997 }
3998
3999 if (!ublk_check_ubuf_dir(req, dir)) {
4000 ret = -EACCES;
4001 goto out;
4002 }
4003
4004 if (is_integrity)
4005 ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4006 else
4007 ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4008
4009 out:
4010 if (!on_daemon)
4011 ublk_put_req_ref(io, req);
4012 return ret;
4013 }
4014
/* ->read_iter of the ublk char device: user copy with @to as ITER_DEST */
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	return ublk_user_copy(iocb, to, ITER_DEST);
}
4019
/* ->write_iter of the ublk char device: user copy with @from as ITER_SOURCE */
static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return ublk_user_copy(iocb, from, ITER_SOURCE);
}
4024
/*
 * File operations of the ublk char device; ublk_add_chdev() installs them
 * when ublk_dev_support_batch_io() is false.
 */
static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};
4034
/*
 * File operations of the ublk char device for batch io; ublk_add_chdev()
 * installs them when ublk_dev_support_batch_io() is true.  Only ->uring_cmd
 * differs from ublk_ch_fops.
 */
static const struct file_operations ublk_ch_batch_io_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_batch_io_uring_cmd,
	.mmap = ublk_ch_mmap,
};
4044
__ublk_deinit_queue(struct ublk_device * ub,struct ublk_queue * ubq)4045 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4046 {
4047 int size, i;
4048
4049 size = ublk_queue_cmd_buf_size(ub);
4050
4051 for (i = 0; i < ubq->q_depth; i++) {
4052 struct ublk_io *io = &ubq->ios[i];
4053 if (io->task)
4054 put_task_struct(io->task);
4055 WARN_ON_ONCE(refcount_read(&io->ref));
4056 WARN_ON_ONCE(io->task_registered_buffers);
4057 }
4058
4059 if (ubq->io_cmd_buf)
4060 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4061
4062 if (ublk_dev_support_batch_io(ub))
4063 ublk_io_evts_deinit(ubq);
4064
4065 kvfree(ubq);
4066 }
4067
ublk_deinit_queue(struct ublk_device * ub,int q_id)4068 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4069 {
4070 struct ublk_queue *ubq = ub->queues[q_id];
4071
4072 if (!ubq)
4073 return;
4074
4075 __ublk_deinit_queue(ub, ubq);
4076 ub->queues[q_id] = NULL;
4077 }
4078
ublk_get_queue_numa_node(struct ublk_device * ub,int q_id)4079 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4080 {
4081 unsigned int cpu;
4082
4083 /* Find first CPU mapped to this queue */
4084 for_each_possible_cpu(cpu) {
4085 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4086 return cpu_to_node(cpu);
4087 }
4088
4089 return NUMA_NO_NODE;
4090 }
4091
ublk_init_queue(struct ublk_device * ub,int q_id)4092 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4093 {
4094 int depth = ub->dev_info.queue_depth;
4095 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4096 struct ublk_queue *ubq;
4097 struct page *page;
4098 int numa_node;
4099 int size, i, ret;
4100
4101 /* Determine NUMA node based on queue's CPU affinity */
4102 numa_node = ublk_get_queue_numa_node(ub, q_id);
4103
4104 /* Allocate queue structure on local NUMA node */
4105 ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4106 numa_node);
4107 if (!ubq)
4108 return -ENOMEM;
4109
4110 spin_lock_init(&ubq->cancel_lock);
4111 ubq->flags = ub->dev_info.flags;
4112 ubq->q_id = q_id;
4113 ubq->q_depth = depth;
4114 size = ublk_queue_cmd_buf_size(ub);
4115
4116 /* Allocate I/O command buffer on local NUMA node */
4117 page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4118 if (!page) {
4119 kvfree(ubq);
4120 return -ENOMEM;
4121 }
4122 ubq->io_cmd_buf = page_address(page);
4123
4124 for (i = 0; i < ubq->q_depth; i++)
4125 spin_lock_init(&ubq->ios[i].lock);
4126
4127 if (ublk_dev_support_batch_io(ub)) {
4128 ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4129 if (ret)
4130 goto fail;
4131 INIT_LIST_HEAD(&ubq->fcmd_head);
4132 }
4133 ub->queues[q_id] = ubq;
4134 ubq->dev = ub;
4135
4136 return 0;
4137 fail:
4138 __ublk_deinit_queue(ub, ubq);
4139 return ret;
4140 }
4141
ublk_deinit_queues(struct ublk_device * ub)4142 static void ublk_deinit_queues(struct ublk_device *ub)
4143 {
4144 int i;
4145
4146 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4147 ublk_deinit_queue(ub, i);
4148 }
4149
ublk_init_queues(struct ublk_device * ub)4150 static int ublk_init_queues(struct ublk_device *ub)
4151 {
4152 int i, ret;
4153
4154 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4155 ret = ublk_init_queue(ub, i);
4156 if (ret)
4157 goto fail;
4158 }
4159
4160 init_completion(&ub->completion);
4161 return 0;
4162
4163 fail:
4164 ublk_deinit_queues(ub);
4165 return ret;
4166 }
4167
ublk_alloc_dev_number(struct ublk_device * ub,int idx)4168 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4169 {
4170 int i = idx;
4171 int err;
4172
4173 spin_lock(&ublk_idr_lock);
4174 /* allocate id, if @id >= 0, we're requesting that specific id */
4175 if (i >= 0) {
4176 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4177 if (err == -ENOSPC)
4178 err = -EEXIST;
4179 } else {
4180 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4181 GFP_NOWAIT);
4182 }
4183 spin_unlock(&ublk_idr_lock);
4184
4185 if (err >= 0)
4186 ub->ub_number = err;
4187
4188 return err;
4189 }
4190
/*
 * Remove ub->ub_number from ublk_index_idr and wake everybody sleeping on
 * ublk_idr_wq; both steps happen under ublk_idr_lock.
 */
static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}
4198
/*
 * ->release callback of the char device's struct device (installed by
 * ublk_add_chdev()).  Frees the tag set, all queues and the device number,
 * destroys the mutexes and finally frees the ublk_device itself.
 */
static void ublk_cdev_rel(struct device *dev)
{
	/* @dev is embedded in struct ublk_device as cdev_dev */
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
}
4210
ublk_add_chdev(struct ublk_device * ub)4211 static int ublk_add_chdev(struct ublk_device *ub)
4212 {
4213 struct device *dev = &ub->cdev_dev;
4214 int minor = ub->ub_number;
4215 int ret;
4216
4217 dev->parent = ublk_misc.this_device;
4218 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4219 dev->class = &ublk_chr_class;
4220 dev->release = ublk_cdev_rel;
4221 device_initialize(dev);
4222
4223 ret = dev_set_name(dev, "ublkc%d", minor);
4224 if (ret)
4225 goto fail;
4226
4227 if (ublk_dev_support_batch_io(ub))
4228 cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4229 else
4230 cdev_init(&ub->cdev, &ublk_ch_fops);
4231 ret = cdev_device_add(&ub->cdev, dev);
4232 if (ret)
4233 goto fail;
4234
4235 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4236 unprivileged_ublks_added++;
4237 return 0;
4238 fail:
4239 put_device(dev);
4240 return ret;
4241 }
4242
4243 /* align max io buffer size with PAGE_SIZE */
ublk_align_max_io_size(struct ublk_device * ub)4244 static void ublk_align_max_io_size(struct ublk_device *ub)
4245 {
4246 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4247
4248 ub->dev_info.max_io_buf_bytes =
4249 round_down(max_io_bytes, PAGE_SIZE);
4250 }
4251
ublk_add_tag_set(struct ublk_device * ub)4252 static int ublk_add_tag_set(struct ublk_device *ub)
4253 {
4254 if (ublk_dev_support_batch_io(ub))
4255 ub->tag_set.ops = &ublk_batch_mq_ops;
4256 else
4257 ub->tag_set.ops = &ublk_mq_ops;
4258 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4259 ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4260 ub->tag_set.numa_node = NUMA_NO_NODE;
4261 ub->tag_set.driver_data = ub;
4262 return blk_mq_alloc_tag_set(&ub->tag_set);
4263 }
4264
/*
 * Stop the device, delete its char device and drop one device reference.
 * For UBLK_F_UNPRIVILEGED_DEV devices the unprivileged device counter is
 * decremented as well.
 */
static void ublk_remove(struct ublk_device *ub)
{
	bool unprivileged;

	ublk_stop_dev(ub);
	cdev_device_del(&ub->cdev, &ub->cdev_dev);
	/* sample the flag before the put below; @ub is not touched afterwards */
	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
	ublk_put_device(ub);

	if (unprivileged)
		unprivileged_ublks_added--;
}
4277
ublk_get_device_from_id(int idx)4278 static struct ublk_device *ublk_get_device_from_id(int idx)
4279 {
4280 struct ublk_device *ub = NULL;
4281
4282 if (idx < 0)
4283 return NULL;
4284
4285 spin_lock(&ublk_idr_lock);
4286 ub = idr_find(&ublk_index_idr, idx);
4287 if (ub)
4288 ub = ublk_get_device(ub);
4289 spin_unlock(&ublk_idr_lock);
4290
4291 return ub;
4292 }
4293
ublk_validate_user_pid(struct ublk_device * ub,pid_t ublksrv_pid)4294 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4295 {
4296 rcu_read_lock();
4297 ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4298 rcu_read_unlock();
4299
4300 return ub->ublksrv_tgid == ublksrv_pid;
4301 }
4302
ublk_ctrl_start_dev(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)4303 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4304 const struct ublksrv_ctrl_cmd *header)
4305 {
4306 const struct ublk_param_basic *p = &ub->params.basic;
4307 int ublksrv_pid = (int)header->data[0];
4308 struct queue_limits lim = {
4309 .logical_block_size = 1 << p->logical_bs_shift,
4310 .physical_block_size = 1 << p->physical_bs_shift,
4311 .io_min = 1 << p->io_min_shift,
4312 .io_opt = 1 << p->io_opt_shift,
4313 .max_hw_sectors = p->max_sectors,
4314 .chunk_sectors = p->chunk_sectors,
4315 .virt_boundary_mask = p->virt_boundary_mask,
4316 .max_segments = USHRT_MAX,
4317 .max_segment_size = UINT_MAX,
4318 .dma_alignment = 3,
4319 };
4320 struct gendisk *disk;
4321 int ret = -EINVAL;
4322
4323 if (ublksrv_pid <= 0)
4324 return -EINVAL;
4325 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4326 return -EINVAL;
4327
4328 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4329 const struct ublk_param_discard *pd = &ub->params.discard;
4330
4331 lim.discard_alignment = pd->discard_alignment;
4332 lim.discard_granularity = pd->discard_granularity;
4333 lim.max_hw_discard_sectors = pd->max_discard_sectors;
4334 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4335 lim.max_discard_segments = pd->max_discard_segments;
4336 }
4337
4338 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4339 const struct ublk_param_zoned *p = &ub->params.zoned;
4340
4341 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4342 return -EOPNOTSUPP;
4343
4344 lim.features |= BLK_FEAT_ZONED;
4345 lim.max_active_zones = p->max_active_zones;
4346 lim.max_open_zones = p->max_open_zones;
4347 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4348 }
4349
4350 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4351 lim.features |= BLK_FEAT_WRITE_CACHE;
4352 if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4353 lim.features |= BLK_FEAT_FUA;
4354 }
4355
4356 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4357 lim.features |= BLK_FEAT_ROTATIONAL;
4358
4359 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4360 lim.dma_alignment = ub->params.dma.alignment;
4361
4362 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4363 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4364 lim.max_segment_size = ub->params.seg.max_segment_size;
4365 lim.max_segments = ub->params.seg.max_segments;
4366 }
4367
4368 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4369 const struct ublk_param_integrity *p = &ub->params.integrity;
4370 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4371
4372 lim.max_integrity_segments =
4373 p->max_integrity_segments ?: USHRT_MAX;
4374 lim.integrity = (struct blk_integrity) {
4375 .flags = ublk_integrity_flags(p->flags),
4376 .csum_type = ublk_integrity_csum_type(p->csum_type),
4377 .metadata_size = p->metadata_size,
4378 .pi_offset = p->pi_offset,
4379 .interval_exp = p->interval_exp,
4380 .tag_size = p->tag_size,
4381 .pi_tuple_size = pi_tuple_size,
4382 };
4383 }
4384
4385 if (wait_for_completion_interruptible(&ub->completion) != 0)
4386 return -EINTR;
4387
4388 if (!ublk_validate_user_pid(ub, ublksrv_pid))
4389 return -EINVAL;
4390
4391 mutex_lock(&ub->mutex);
4392 /* device may become not ready in case of F_BATCH */
4393 if (!ublk_dev_ready(ub)) {
4394 ret = -EINVAL;
4395 goto out_unlock;
4396 }
4397 if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4398 test_bit(UB_STATE_USED, &ub->state)) {
4399 ret = -EEXIST;
4400 goto out_unlock;
4401 }
4402
4403 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4404 if (IS_ERR(disk)) {
4405 ret = PTR_ERR(disk);
4406 goto out_unlock;
4407 }
4408 sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4409 disk->fops = &ub_fops;
4410 disk->private_data = ub;
4411
4412 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4413 ub->ub_disk = disk;
4414
4415 ublk_apply_params(ub);
4416
4417 /*
4418 * Suppress partition scan to avoid potential IO hang.
4419 *
4420 * If ublk server error occurs during partition scan, the IO may
4421 * wait while holding ub->mutex, which can deadlock with other
4422 * operations that need the mutex. Defer partition scan to async
4423 * work.
4424 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4425 * permanently.
4426 */
4427 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4428
4429 ublk_get_device(ub);
4430 ub->dev_info.state = UBLK_S_DEV_LIVE;
4431
4432 if (ublk_dev_is_zoned(ub)) {
4433 ret = ublk_revalidate_disk_zones(ub);
4434 if (ret)
4435 goto out_put_cdev;
4436 }
4437
4438 ret = add_disk(disk);
4439 if (ret)
4440 goto out_put_cdev;
4441
4442 set_bit(UB_STATE_USED, &ub->state);
4443
4444 /* Skip partition scan if disabled by user */
4445 if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4446 clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4447 } else {
4448 /* Schedule async partition scan for trusted daemons */
4449 if (!ub->unprivileged_daemons)
4450 schedule_work(&ub->partition_scan_work);
4451 }
4452
4453 out_put_cdev:
4454 if (ret) {
4455 ublk_detach_disk(ub);
4456 ublk_put_device(ub);
4457 }
4458 if (ret)
4459 put_disk(disk);
4460 out_unlock:
4461 mutex_unlock(&ub->mutex);
4462 return ret;
4463 }
4464
ublk_ctrl_get_queue_affinity(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)4465 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4466 const struct ublksrv_ctrl_cmd *header)
4467 {
4468 void __user *argp = (void __user *)(unsigned long)header->addr;
4469 cpumask_var_t cpumask;
4470 unsigned long queue;
4471 unsigned int retlen;
4472 unsigned int i;
4473 int ret;
4474
4475 if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4476 return -EINVAL;
4477 if (header->len & (sizeof(unsigned long)-1))
4478 return -EINVAL;
4479 if (!header->addr)
4480 return -EINVAL;
4481
4482 queue = header->data[0];
4483 if (queue >= ub->dev_info.nr_hw_queues)
4484 return -EINVAL;
4485
4486 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4487 return -ENOMEM;
4488
4489 for_each_possible_cpu(i) {
4490 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4491 cpumask_set_cpu(i, cpumask);
4492 }
4493
4494 ret = -EFAULT;
4495 retlen = min_t(unsigned short, header->len, cpumask_size());
4496 if (copy_to_user(argp, cpumask, retlen))
4497 goto out_free_cpumask;
4498 if (retlen != header->len &&
4499 clear_user(argp + retlen, header->len - retlen))
4500 goto out_free_cpumask;
4501
4502 ret = 0;
4503 out_free_cpumask:
4504 free_cpumask_var(cpumask);
4505 return ret;
4506 }
4507
/* Debug dump (pr_devel) of device id, flags, queue count and queue depth */
static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags);
	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
			info->nr_hw_queues, info->queue_depth);
}
4515
/*
 * ADD_DEV control command: validate the device info supplied by userspace,
 * allocate a ublk_device with its device number, tag set and queues, copy
 * the negotiated device info back to userspace and add the char device.
 *
 * @header: control command; addr/len describe a struct
 *          ublksrv_ctrl_dev_info buffer, dev_id is the requested id
 *          (U32_MAX: let the driver pick one)
 *
 * Returns 0 on success or a negative errno.  Allocation and registration
 * run under ublk_ctl_mutex.
 */
static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	/* ADD_DEV is not a per-queue command: queue_id must be (u16)-1 */
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
			__func__, header->queue_id);
		return -EINVAL;
	}

	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
		return -EINVAL;

	/*
	 * CAP_SYS_ADMIN callers always create privileged devices; everybody
	 * else has to ask for UBLK_F_UNPRIVILEGED_DEV explicitly.
	 */
	if (capable(CAP_SYS_ADMIN))
		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
		return -EPERM;

	/* forbid nonsense combinations of recovery flags */
	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
	case 0:
	case UBLK_F_USER_RECOVERY:
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
		break;
	default:
		pr_warn("%s: invalid recovery flags %llx\n", __func__,
			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
		return -EINVAL;
	}

	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
		return -EINVAL;
	}

	/*
	 * unprivileged device can't be trusted, but RECOVERY and
	 * RECOVERY_REISSUE still may hang error handling, so can't
	 * support recovery features for unprivileged ublk now
	 *
	 * TODO: provide forward progress for RECOVERY handler, so that
	 * unprivileged device can benefit from it
	 */
	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
				UBLK_F_USER_RECOVERY);

		/*
		 * For USER_COPY, we depends on userspace to fill request
		 * buffer by pwrite() to ublk char device, which can't be
		 * used for unprivileged device
		 *
		 * Same with zero copy or auto buffer register.
		 */
		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
					UBLK_F_AUTO_BUF_REG))
			return -EINVAL;
	}

	/* User copy is required to access integrity buffer */
	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
		return -EINVAL;

	/* the created device is always owned by current user */
	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);

	/* the id in the command header and in the info buffer must agree */
	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
			__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
		pr_warn("%s: dev id is too large. Max supported is %d\n",
			__func__, UBLK_MAX_UBLKS - 1);
		return -EINVAL;
	}

	ublk_dump_dev_info(&info);

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	/* enforce the limit on the number of unprivileged devices */
	ret = -EACCES;
	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
	    unprivileged_ublks_added >= unprivileged_ublks_max)
		goto out_unlock;

	ret = -ENOMEM;
	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->lock);
	mutex_init(&ub->cancel_mutex);
	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);

	/* a dev_id of U32_MAX becomes negative here: any free id is taken */
	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * 64bit flags will be copied back to userspace as feature
	 * negotiation result, so have to clear flags which driver
	 * doesn't support yet, then userspace can get correct flags
	 * (features) to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	/* features the driver always provides, requested or not */
	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
		UBLK_F_URING_CMD_COMP_IN_TASK |
		UBLK_F_PER_IO_DAEMON |
		UBLK_F_BUF_REG_OFF_DAEMON |
		UBLK_F_SAFE_STOP_DEV;

	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;

	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
				UBLK_F_AUTO_BUF_REG))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/*
	 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
	 * returning write_append_lba, which is only allowed in case of
	 * user copy or zero copy
	 */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	/* never expose more hardware queues than there are CPU ids */
	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_tag_set;

	/* report the negotiated device info back to userspace */
	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_deinit_queues;

	/*
	 * Add the char dev so that ublksrv daemon can be setup.
	 * ublk_add_chdev() will cleanup everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

	/* error unwind, in reverse order of setup */
out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}
4709
ublk_idr_freed(int id)4710 static inline bool ublk_idr_freed(int id)
4711 {
4712 void *ptr;
4713
4714 spin_lock(&ublk_idr_lock);
4715 ptr = idr_find(&ublk_index_idr, id);
4716 spin_unlock(&ublk_idr_lock);
4717
4718 return ptr == NULL;
4719 }
4720
/*
 * DEL_DEV control command.
 *
 * @p_ub: in: device reference held by the caller; set to NULL once this
 *        function has consumed that reference
 * @wait: when true, sleep (interruptibly) until the device number has been
 *        removed from the idr
 *
 * Returns 0, the error of mutex_lock_killable(), or -EINTR when the wait
 * was interrupted.
 */
static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
{
	struct ublk_device *ub = *p_ub;
	/* remember the number now: @ub is put before we wait on it */
	int dev_nr = ub->ub_number;
	int err;

	err = mutex_lock_killable(&ublk_ctl_mutex);
	if (err)
		return err;

	/* remove the device only once, however often DEL_DEV is issued */
	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
		ublk_remove(ub);
		set_bit(UB_STATE_DELETED, &ub->state);
	}

	/* Mark the reference as consumed */
	*p_ub = NULL;
	ublk_put_device(ub);
	mutex_unlock(&ublk_ctl_mutex);

	if (!wait)
		return 0;

	/*
	 * Wait until the idr is removed, so the number can be reused as soon
	 * as the DEL_DEV command has returned.
	 *
	 * If the wait is interrupted by the user, a later delete command
	 * sees one of two situations:
	 *
	 * - the device number isn't freed yet: the device won't and needn't
	 *   be deleted again, since UB_STATE_DELETED is set, and it will be
	 *   released after the last reference is dropped
	 *
	 * - the device number is freed already: the device is no longer
	 *   found via ublk_get_device_from_id()
	 */
	if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(dev_nr)))
		return -EINTR;

	return 0;
}
4759
/* Debug dump (pr_devel) of a control command's opcode and header fields */
static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
				      const struct ublksrv_ctrl_cmd *header)
{
	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}
4767
/* Control-path entry for stopping a device: plain wrapper of ublk_stop_dev() */
static void ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
}
4772
ublk_ctrl_try_stop_dev(struct ublk_device * ub)4773 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4774 {
4775 struct gendisk *disk;
4776 int ret = 0;
4777
4778 disk = ublk_get_disk(ub);
4779 if (!disk)
4780 return -ENODEV;
4781
4782 mutex_lock(&disk->open_mutex);
4783 if (disk_openers(disk) > 0) {
4784 ret = -EBUSY;
4785 goto unlock;
4786 }
4787 ub->block_open = true;
4788 /* release open_mutex as del_gendisk() will reacquire it */
4789 mutex_unlock(&disk->open_mutex);
4790
4791 ublk_ctrl_stop_dev(ub);
4792 goto out;
4793
4794 unlock:
4795 mutex_unlock(&disk->open_mutex);
4796 out:
4797 ublk_put_disk(disk);
4798 return ret;
4799 }
4800
ublk_ctrl_get_dev_info(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)4801 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4802 const struct ublksrv_ctrl_cmd *header)
4803 {
4804 struct task_struct *p;
4805 struct pid *pid;
4806 struct ublksrv_ctrl_dev_info dev_info;
4807 pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4808 void __user *argp = (void __user *)(unsigned long)header->addr;
4809
4810 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4811 return -EINVAL;
4812
4813 memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4814 dev_info.ublksrv_pid = -1;
4815
4816 if (init_ublksrv_tgid > 0) {
4817 rcu_read_lock();
4818 pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4819 p = pid_task(pid, PIDTYPE_TGID);
4820 if (p) {
4821 int vnr = task_tgid_vnr(p);
4822
4823 if (vnr)
4824 dev_info.ublksrv_pid = vnr;
4825 }
4826 rcu_read_unlock();
4827 }
4828
4829 if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4830 return -EFAULT;
4831
4832 return 0;
4833 }
4834
4835 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
ublk_ctrl_fill_params_devt(struct ublk_device * ub)4836 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4837 {
4838 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4839 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4840
4841 if (ub->ub_disk) {
4842 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4843 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4844 } else {
4845 ub->params.devt.disk_major = 0;
4846 ub->params.devt.disk_minor = 0;
4847 }
4848 ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4849 }
4850
ublk_ctrl_get_params(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)4851 static int ublk_ctrl_get_params(struct ublk_device *ub,
4852 const struct ublksrv_ctrl_cmd *header)
4853 {
4854 void __user *argp = (void __user *)(unsigned long)header->addr;
4855 struct ublk_params_header ph;
4856 int ret;
4857
4858 if (header->len <= sizeof(ph) || !header->addr)
4859 return -EINVAL;
4860
4861 if (copy_from_user(&ph, argp, sizeof(ph)))
4862 return -EFAULT;
4863
4864 if (ph.len > header->len || !ph.len)
4865 return -EINVAL;
4866
4867 if (ph.len > sizeof(struct ublk_params))
4868 ph.len = sizeof(struct ublk_params);
4869
4870 mutex_lock(&ub->mutex);
4871 ublk_ctrl_fill_params_devt(ub);
4872 if (copy_to_user(argp, &ub->params, ph.len))
4873 ret = -EFAULT;
4874 else
4875 ret = 0;
4876 mutex_unlock(&ub->mutex);
4877
4878 return ret;
4879 }
4880
ublk_ctrl_set_params(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)4881 static int ublk_ctrl_set_params(struct ublk_device *ub,
4882 const struct ublksrv_ctrl_cmd *header)
4883 {
4884 void __user *argp = (void __user *)(unsigned long)header->addr;
4885 struct ublk_params_header ph;
4886 int ret = -EFAULT;
4887
4888 if (header->len <= sizeof(ph) || !header->addr)
4889 return -EINVAL;
4890
4891 if (copy_from_user(&ph, argp, sizeof(ph)))
4892 return -EFAULT;
4893
4894 if (ph.len > header->len || !ph.len || !ph.types)
4895 return -EINVAL;
4896
4897 if (ph.len > sizeof(struct ublk_params))
4898 ph.len = sizeof(struct ublk_params);
4899
4900 mutex_lock(&ub->mutex);
4901 if (test_bit(UB_STATE_USED, &ub->state)) {
4902 /*
4903 * Parameters can only be changed when device hasn't
4904 * been started yet
4905 */
4906 ret = -EACCES;
4907 } else if (copy_from_user(&ub->params, argp, ph.len)) {
4908 ret = -EFAULT;
4909 } else {
4910 /* clear all we don't support yet */
4911 ub->params.types &= UBLK_PARAM_TYPE_ALL;
4912 ret = ublk_validate_params(ub);
4913 if (ret)
4914 ub->params.types = 0;
4915 }
4916 mutex_unlock(&ub->mutex);
4917
4918 return ret;
4919 }
4920
ublk_ctrl_start_recovery(struct ublk_device * ub)4921 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
4922 {
4923 int ret = -EINVAL;
4924
4925 mutex_lock(&ub->mutex);
4926 if (ublk_nosrv_should_stop_dev(ub))
4927 goto out_unlock;
4928 /*
4929 * START_RECOVERY is only allowd after:
4930 *
4931 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
4932 * and related io_uring ctx is freed so file struct of /dev/ublkcX is
4933 * released.
4934 *
4935 * and one of the following holds
4936 *
4937 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
4938 * (a)has quiesced request queue
4939 * (b)has requeued every inflight rqs whose io_flags is ACTIVE
4940 * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
4941 * (d)has completed/camceled all ioucmds owned by ther dying process
4942 *
4943 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
4944 * quiesced, but all I/O is being immediately errored
4945 */
4946 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
4947 ret = -EBUSY;
4948 goto out_unlock;
4949 }
4950 pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
4951 init_completion(&ub->completion);
4952 ret = 0;
4953 out_unlock:
4954 mutex_unlock(&ub->mutex);
4955 return ret;
4956 }
4957
ublk_ctrl_end_recovery(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)4958 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
4959 const struct ublksrv_ctrl_cmd *header)
4960 {
4961 int ublksrv_pid = (int)header->data[0];
4962 int ret = -EINVAL;
4963
4964 pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
4965 header->dev_id);
4966
4967 if (wait_for_completion_interruptible(&ub->completion))
4968 return -EINTR;
4969
4970 pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
4971 header->dev_id);
4972
4973 if (!ublk_validate_user_pid(ub, ublksrv_pid))
4974 return -EINVAL;
4975
4976 mutex_lock(&ub->mutex);
4977 if (ublk_nosrv_should_stop_dev(ub))
4978 goto out_unlock;
4979
4980 if (!ublk_dev_in_recoverable_state(ub)) {
4981 ret = -EBUSY;
4982 goto out_unlock;
4983 }
4984 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4985 ub->dev_info.state = UBLK_S_DEV_LIVE;
4986 pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
4987 __func__, ublksrv_pid, header->dev_id);
4988 blk_mq_kick_requeue_list(ub->ub_disk->queue);
4989 ret = 0;
4990 out_unlock:
4991 mutex_unlock(&ub->mutex);
4992 return ret;
4993 }
4994
ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd * header)4995 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
4996 {
4997 void __user *argp = (void __user *)(unsigned long)header->addr;
4998 u64 features = UBLK_F_ALL;
4999
5000 if (header->len != UBLK_FEATURES_LEN || !header->addr)
5001 return -EINVAL;
5002
5003 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5004 return -EFAULT;
5005
5006 return 0;
5007 }
5008
ublk_ctrl_set_size(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)5009 static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5010 {
5011 struct ublk_param_basic *p = &ub->params.basic;
5012 u64 new_size = header->data[0];
5013
5014 mutex_lock(&ub->mutex);
5015 p->dev_sectors = new_size;
5016 set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5017 mutex_unlock(&ub->mutex);
5018 }
5019
/* Cursor handed to ublk_count_busy_req() through the tagset iterator. */
struct count_busy {
	/* queue whose requests are being counted */
	const struct ublk_queue *ubq;
	/* number of matching requests seen so far */
	unsigned int nr_busy;
};
5024
ublk_count_busy_req(struct request * rq,void * data)5025 static bool ublk_count_busy_req(struct request *rq, void *data)
5026 {
5027 struct count_busy *idle = data;
5028
5029 if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5030 idle->nr_busy += 1;
5031 return true;
5032 }
5033
5034 /* uring_cmd is guaranteed to be active if the associated request is idle */
ubq_has_idle_io(const struct ublk_queue * ubq)5035 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5036 {
5037 struct count_busy data = {
5038 .ubq = ubq,
5039 };
5040
5041 blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5042 return data.nr_busy < ubq->q_depth;
5043 }
5044
5045 /* Wait until each hw queue has at least one idle IO */
ublk_wait_for_idle_io(struct ublk_device * ub,unsigned int timeout_ms)5046 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5047 unsigned int timeout_ms)
5048 {
5049 unsigned int elapsed = 0;
5050 int ret;
5051
5052 /*
5053 * For UBLK_F_BATCH_IO ublk server can get notified with existing
5054 * or new fetch command, so needn't wait any more
5055 */
5056 if (ublk_dev_support_batch_io(ub))
5057 return 0;
5058
5059 while (elapsed < timeout_ms && !signal_pending(current)) {
5060 unsigned int queues_cancelable = 0;
5061 int i;
5062
5063 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5064 struct ublk_queue *ubq = ublk_get_queue(ub, i);
5065
5066 queues_cancelable += !!ubq_has_idle_io(ubq);
5067 }
5068
5069 /*
5070 * Each queue needs at least one active command for
5071 * notifying ublk server
5072 */
5073 if (queues_cancelable == ub->dev_info.nr_hw_queues)
5074 break;
5075
5076 msleep(UBLK_REQUEUE_DELAY_MS);
5077 elapsed += UBLK_REQUEUE_DELAY_MS;
5078 }
5079
5080 if (signal_pending(current))
5081 ret = -EINTR;
5082 else if (elapsed >= timeout_ms)
5083 ret = -EBUSY;
5084 else
5085 ret = 0;
5086
5087 return ret;
5088 }
5089
ublk_ctrl_quiesce_dev(struct ublk_device * ub,const struct ublksrv_ctrl_cmd * header)5090 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5091 const struct ublksrv_ctrl_cmd *header)
5092 {
5093 /* zero means wait forever */
5094 u64 timeout_ms = header->data[0];
5095 struct gendisk *disk;
5096 int ret = -ENODEV;
5097
5098 if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5099 return -EOPNOTSUPP;
5100
5101 mutex_lock(&ub->mutex);
5102 disk = ublk_get_disk(ub);
5103 if (!disk)
5104 goto unlock;
5105 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5106 goto put_disk;
5107
5108 ret = 0;
5109 /* already in expected state */
5110 if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5111 goto put_disk;
5112
5113 /* Mark the device as canceling */
5114 mutex_lock(&ub->cancel_mutex);
5115 blk_mq_quiesce_queue(disk->queue);
5116 ublk_set_canceling(ub, true);
5117 blk_mq_unquiesce_queue(disk->queue);
5118 mutex_unlock(&ub->cancel_mutex);
5119
5120 if (!timeout_ms)
5121 timeout_ms = UINT_MAX;
5122 ret = ublk_wait_for_idle_io(ub, timeout_ms);
5123
5124 put_disk:
5125 ublk_put_disk(disk);
5126 unlock:
5127 mutex_unlock(&ub->mutex);
5128
5129 /* Cancel pending uring_cmd */
5130 if (!ret)
5131 ublk_cancel_dev(ub);
5132 return ret;
5133 }
5134
5135 /*
5136 * All control commands are sent via /dev/ublk-control, so we have to check
5137 * the destination device's permission
5138 */
ublk_char_dev_permission(struct ublk_device * ub,const char * dev_path,int mask)5139 static int ublk_char_dev_permission(struct ublk_device *ub,
5140 const char *dev_path, int mask)
5141 {
5142 int err;
5143 struct path path;
5144 struct kstat stat;
5145
5146 err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5147 if (err)
5148 return err;
5149
5150 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5151 if (err)
5152 goto exit;
5153
5154 err = -EPERM;
5155 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5156 goto exit;
5157
5158 err = inode_permission(&nop_mnt_idmap,
5159 d_backing_inode(path.dentry), mask);
5160 exit:
5161 path_put(&path);
5162 return err;
5163 }
5164
/*
 * Check whether the caller may issue @cmd_op against @ub.
 *
 * @ub:     target device
 * @cmd_op: control command opcode
 * @header: kernel copy of the control command; on a successful path
 *          check, ->addr and ->len are advanced past the device path so
 *          that they describe the command's real payload
 *
 * Devices without UBLK_F_UNPRIVILEGED_DEV require CAP_SYS_ADMIN and, for
 * everything but UBLK_CMD_GET_DEV_INFO2, need no further check.  In all
 * other cases the user buffer must start with the char device path
 * (->dev_path_len bytes), which is checked with
 * ublk_char_dev_permission() using a read or read/write mask depending
 * on the command.
 *
 * Return: 0 if permitted, -EPERM/-EINVAL or the error from
 * memdup_user_nul() / ublk_char_dev_permission() otherwise.
 */
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
{
	void __user *upath = (void __user *)(unsigned long)header->addr;
	char *path;
	int mask;
	int ret;

	if (!(ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)) {
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		/*
		 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
		 * char_dev_path in payload too, since userspace may not
		 * know if the specified device is created as unprivileged
		 * mode.
		 */
		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
			return 0;
	}

	/*
	 * User has to provide the char device path for unprivileged ublk
	 *
	 * header->addr always points to the dev path buffer, and
	 * header->dev_path_len records length of dev path buffer.
	 */
	if (!header->dev_path_len || header->dev_path_len > PATH_MAX ||
	    header->len < header->dev_path_len)
		return -EINVAL;

	path = memdup_user_nul(upath, header->dev_path_len);
	if (IS_ERR(path))
		return PTR_ERR(path);

	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
	case UBLK_CMD_GET_QUEUE_AFFINITY:
	case UBLK_CMD_GET_PARAMS:
	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
		/* query commands only need read access */
		mask = MAY_READ;
		break;
	case UBLK_CMD_START_DEV:
	case UBLK_CMD_STOP_DEV:
	case UBLK_CMD_ADD_DEV:
	case UBLK_CMD_DEL_DEV:
	case UBLK_CMD_SET_PARAMS:
	case UBLK_CMD_START_USER_RECOVERY:
	case UBLK_CMD_END_USER_RECOVERY:
	case UBLK_CMD_UPDATE_SIZE:
	case UBLK_CMD_QUIESCE_DEV:
	case UBLK_CMD_TRY_STOP_DEV:
		mask = MAY_READ | MAY_WRITE;
		break;
	default:
		/* commands not listed above are rejected */
		ret = -EINVAL;
		goto free_path;
	}

	ret = ublk_char_dev_permission(ub, path, mask);
	if (!ret) {
		/* skip the path so handlers see only their own payload */
		header->len -= header->dev_path_len;
		header->addr += header->dev_path_len;
	}
	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
		 __func__, ub->ub_number, cmd_op,
		 ub->dev_info.owner_uid, ub->dev_info.owner_gid,
		 path, ret);
free_path:
	kfree(path);
	return ret;
}
5241
ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)5242 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5243 {
5244 switch (_IOC_NR(cmd_op)) {
5245 case UBLK_CMD_GET_QUEUE_AFFINITY:
5246 case UBLK_CMD_GET_DEV_INFO:
5247 case UBLK_CMD_GET_DEV_INFO2:
5248 case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5249 return false;
5250 default:
5251 return true;
5252 }
5253 }
5254
ublk_ctrl_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)5255 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5256 unsigned int issue_flags)
5257 {
5258 /* May point to userspace-mapped memory */
5259 const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5260 struct ublksrv_ctrl_cmd);
5261 struct ublksrv_ctrl_cmd header;
5262 struct ublk_device *ub = NULL;
5263 u32 cmd_op = cmd->cmd_op;
5264 int ret = -EINVAL;
5265
5266 if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5267 issue_flags & IO_URING_F_NONBLOCK)
5268 return -EAGAIN;
5269
5270 if (!(issue_flags & IO_URING_F_SQE128))
5271 return -EINVAL;
5272
5273 header.dev_id = READ_ONCE(ub_src->dev_id);
5274 header.queue_id = READ_ONCE(ub_src->queue_id);
5275 header.len = READ_ONCE(ub_src->len);
5276 header.addr = READ_ONCE(ub_src->addr);
5277 header.data[0] = READ_ONCE(ub_src->data[0]);
5278 header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5279 ublk_ctrl_cmd_dump(cmd_op, &header);
5280
5281 ret = ublk_check_cmd_op(cmd_op);
5282 if (ret)
5283 goto out;
5284
5285 if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5286 ret = ublk_ctrl_get_features(&header);
5287 goto out;
5288 }
5289
5290 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5291 ret = -ENODEV;
5292 ub = ublk_get_device_from_id(header.dev_id);
5293 if (!ub)
5294 goto out;
5295
5296 ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5297 if (ret)
5298 goto put_dev;
5299 }
5300
5301 switch (_IOC_NR(cmd_op)) {
5302 case UBLK_CMD_START_DEV:
5303 ret = ublk_ctrl_start_dev(ub, &header);
5304 break;
5305 case UBLK_CMD_STOP_DEV:
5306 ublk_ctrl_stop_dev(ub);
5307 ret = 0;
5308 break;
5309 case UBLK_CMD_GET_DEV_INFO:
5310 case UBLK_CMD_GET_DEV_INFO2:
5311 ret = ublk_ctrl_get_dev_info(ub, &header);
5312 break;
5313 case UBLK_CMD_ADD_DEV:
5314 ret = ublk_ctrl_add_dev(&header);
5315 break;
5316 case UBLK_CMD_DEL_DEV:
5317 ret = ublk_ctrl_del_dev(&ub, true);
5318 break;
5319 case UBLK_CMD_DEL_DEV_ASYNC:
5320 ret = ublk_ctrl_del_dev(&ub, false);
5321 break;
5322 case UBLK_CMD_GET_QUEUE_AFFINITY:
5323 ret = ublk_ctrl_get_queue_affinity(ub, &header);
5324 break;
5325 case UBLK_CMD_GET_PARAMS:
5326 ret = ublk_ctrl_get_params(ub, &header);
5327 break;
5328 case UBLK_CMD_SET_PARAMS:
5329 ret = ublk_ctrl_set_params(ub, &header);
5330 break;
5331 case UBLK_CMD_START_USER_RECOVERY:
5332 ret = ublk_ctrl_start_recovery(ub);
5333 break;
5334 case UBLK_CMD_END_USER_RECOVERY:
5335 ret = ublk_ctrl_end_recovery(ub, &header);
5336 break;
5337 case UBLK_CMD_UPDATE_SIZE:
5338 ublk_ctrl_set_size(ub, &header);
5339 ret = 0;
5340 break;
5341 case UBLK_CMD_QUIESCE_DEV:
5342 ret = ublk_ctrl_quiesce_dev(ub, &header);
5343 break;
5344 case UBLK_CMD_TRY_STOP_DEV:
5345 ret = ublk_ctrl_try_stop_dev(ub);
5346 break;
5347 default:
5348 ret = -EOPNOTSUPP;
5349 break;
5350 }
5351
5352 put_dev:
5353 if (ub)
5354 ublk_put_device(ub);
5355 out:
5356 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5357 __func__, ret, cmd_op, header.dev_id, header.queue_id);
5358 return ret;
5359 }
5360
5361 static const struct file_operations ublk_ctl_fops = {
5362 .open = nonseekable_open,
5363 .uring_cmd = ublk_ctrl_uring_cmd,
5364 .owner = THIS_MODULE,
5365 .llseek = noop_llseek,
5366 };
5367
5368 static struct miscdevice ublk_misc = {
5369 .minor = MISC_DYNAMIC_MINOR,
5370 .name = "ublk-control",
5371 .fops = &ublk_ctl_fops,
5372 };
5373
ublk_init(void)5374 static int __init ublk_init(void)
5375 {
5376 int ret;
5377
5378 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5379 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5380 /*
5381 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5382 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5383 */
5384 BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5385 UBLKSRV_IO_INTEGRITY_FLAG);
5386 BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5387
5388 init_waitqueue_head(&ublk_idr_wq);
5389
5390 ret = misc_register(&ublk_misc);
5391 if (ret)
5392 return ret;
5393
5394 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5395 if (ret)
5396 goto unregister_mis;
5397
5398 ret = class_register(&ublk_chr_class);
5399 if (ret)
5400 goto free_chrdev_region;
5401
5402 return 0;
5403
5404 free_chrdev_region:
5405 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5406 unregister_mis:
5407 misc_deregister(&ublk_misc);
5408 return ret;
5409 }
5410
ublk_exit(void)5411 static void __exit ublk_exit(void)
5412 {
5413 struct ublk_device *ub;
5414 int id;
5415
5416 idr_for_each_entry(&ublk_index_idr, ub, id)
5417 ublk_remove(ub);
5418
5419 class_unregister(&ublk_chr_class);
5420 misc_deregister(&ublk_misc);
5421
5422 idr_destroy(&ublk_index_idr);
5423 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5424 }
5425
/* Module entry and exit points. */
module_init(ublk_init);
module_exit(ublk_exit);
5428
ublk_set_max_unprivileged_ublks(const char * buf,const struct kernel_param * kp)5429 static int ublk_set_max_unprivileged_ublks(const char *buf,
5430 const struct kernel_param *kp)
5431 {
5432 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5433 }
5434
ublk_get_max_unprivileged_ublks(char * buf,const struct kernel_param * kp)5435 static int ublk_get_max_unprivileged_ublks(char *buf,
5436 const struct kernel_param *kp)
5437 {
5438 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5439 }
5440
5441 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5442 .set = ublk_set_max_unprivileged_ublks,
5443 .get = ublk_get_max_unprivileged_ublks,
5444 };
5445
5446 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5447 &unprivileged_ublks_max, 0644);
5448 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");
5449
5450 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5451 MODULE_DESCRIPTION("Userspace block device");
5452 MODULE_LICENSE("GPL");
5453