1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Userspace block device - a block device whose IO is handled from userspace
4 *
5 * It makes full use of io_uring passthrough commands for communicating with
6 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7 *
8 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9 *
10 * (part of code stolen from loop.c)
11 */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <linux/maple_tree.h>
50 #include <linux/xarray.h>
51 #include <uapi/linux/fs.h>
52 #include <uapi/linux/ublk_cmd.h>
53
54 #define UBLK_MINORS (1U << MINORBITS)
55
56 #define UBLK_INVALID_BUF_IDX ((u16)-1)
57
58 /* private ioctl command mirror */
59 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
60 #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
61 #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
62 #define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
63 #define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF)
64 #define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF)
65
66 /* Default max shmem buffer size: 4GB (may be increased in the future) */
67 #define UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32)
68
69 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
70 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
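
/*
 * Illustrative sketch (hypothetical helper, not used by the driver): with
 * UBLK_F_CMD_IOCTL_ENCODE, uring_cmd opcodes arrive ioctl-encoded (UBLK_U_*);
 * the private mirrors above are simply their _IOC_NR() values, e.g.
 * UBLK_U_CMD_UPDATE_SIZE decodes to UBLK_CMD_UPDATE_SIZE.
 */
static inline unsigned int ublk_ioctl_cmd_nr_sketch(u32 cmd_op)
{
	/* strip the ioctl direction/type/size encoding, keep the number */
	return _IOC_NR(cmd_op);
}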
71
72 /* All UBLK_F_* have to be included into UBLK_F_ALL */
73 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
74 | UBLK_F_URING_CMD_COMP_IN_TASK \
75 | UBLK_F_NEED_GET_DATA \
76 | UBLK_F_USER_RECOVERY \
77 | UBLK_F_USER_RECOVERY_REISSUE \
78 | UBLK_F_UNPRIVILEGED_DEV \
79 | UBLK_F_CMD_IOCTL_ENCODE \
80 | UBLK_F_USER_COPY \
81 | UBLK_F_ZONED \
82 | UBLK_F_USER_RECOVERY_FAIL_IO \
83 | UBLK_F_UPDATE_SIZE \
84 | UBLK_F_AUTO_BUF_REG \
85 | UBLK_F_QUIESCE \
86 | UBLK_F_PER_IO_DAEMON \
87 | UBLK_F_BUF_REG_OFF_DAEMON \
88 | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
89 | UBLK_F_SAFE_STOP_DEV \
90 | UBLK_F_BATCH_IO \
91 | UBLK_F_NO_AUTO_PART_SCAN \
92 | UBLK_F_SHMEM_ZC)
93
94 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
95 | UBLK_F_USER_RECOVERY_REISSUE \
96 | UBLK_F_USER_RECOVERY_FAIL_IO)
97
98 /* All UBLK_PARAM_TYPE_* should be included here */
99 #define UBLK_PARAM_TYPE_ALL \
100 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
101 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
102 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
103 UBLK_PARAM_TYPE_INTEGRITY)
104
105 #define UBLK_BATCH_F_ALL \
106 (UBLK_BATCH_F_HAS_ZONE_LBA | \
107 UBLK_BATCH_F_HAS_BUF_ADDR | \
108 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
109
110 /* ublk batch fetch uring_cmd */
111 struct ublk_batch_fetch_cmd {
112 struct list_head node;
113 struct io_uring_cmd *cmd;
114 unsigned short buf_group;
115 };
116
117 struct ublk_uring_cmd_pdu {
118 /*
119	 * Temporarily store requests of the same batch so they can be queued
120	 * to the daemon context.
121	 *
122	 * This would normally be stored in the request payload, but we want
123	 * to avoid extra pre-allocation, and the uring_cmd payload is always
124	 * free for us
125 */
126 union {
127 struct request *req;
128 struct request *req_list;
129 };
130
131 /*
132 * The following two are valid in this cmd whole lifetime, and
133 * setup in ublk uring_cmd handler
134 */
135 struct ublk_queue *ubq;
136
137 union {
138 u16 tag;
139 struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
140 };
141 };
142
143 struct ublk_batch_io_data {
144 struct ublk_device *ub;
145 struct io_uring_cmd *cmd;
146 struct ublk_batch_io header;
147 unsigned int issue_flags;
148 struct io_comp_batch *iob;
149 };
150
151 /*
152 * io command is active: the sqe cmd has been received and its cqe isn't done yet
153 *
154 * If the flag is set, the io command is owned by the ublk driver and is waiting
155 * for an incoming blk-mq request from the ublk block device.
156 *
157 * If the flag is cleared, the io command has been completed and is owned by the
158 * ublk server.
159 */
160 #define UBLK_IO_FLAG_ACTIVE 0x01
161
162 /*
163 * The IO command has been completed via cqe, is being handled by ublksrv, and
164 * has not been committed yet.
165 *
166 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used for
167 * cross verification.
168 */
169 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
170
171 /*
172 * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command needs to fetch the
173 * data buffer address from ublksrv.
174 *
175 * Bio data can then be copied into this data buffer for a WRITE request
176 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
177 */
178 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
179
180 /*
181 * request buffer is registered automatically, so we have to unregister it
182 * before completing this request.
183 *
184 * io_uring will unregister buffer automatically for us during exiting.
185 */
186 #define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
187
188 /* atomic RW with ubq->cancel_lock */
189 #define UBLK_IO_FLAG_CANCELED 0x80000000
190
191 /*
192 * Initialize refcount to a large number to include any registered buffers.
193 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
194 * any buffers registered on the io daemon task.
195 */
196 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
197
198 /* used for UBLK_F_BATCH_IO only */
199 #define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1)
200
201 union ublk_io_buf {
202 __u64 addr;
203 struct ublk_auto_buf_reg auto_reg;
204 };
205
206 struct ublk_io {
207 union ublk_io_buf buf;
208 unsigned int flags;
209 int res;
210
211 union {
212 /* valid if UBLK_IO_FLAG_ACTIVE is set */
213 struct io_uring_cmd *cmd;
214 /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
215 struct request *req;
216 };
217
218 struct task_struct *task;
219
220 /*
221 * The number of uses of this I/O by the ublk server
222 * if user copy or zero copy are enabled:
223 * - UBLK_REFCOUNT_INIT from dispatch to the server
224 * until UBLK_IO_COMMIT_AND_FETCH_REQ
225 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
226 * - 1 for each io_uring registered buffer not registered on task
227 * The I/O can only be completed once all references are dropped.
228 * User copy and buffer registration operations are only permitted
229 * if the reference count is nonzero.
230 */
231 refcount_t ref;
232 /* Count of buffers registered on task and not yet unregistered */
233 unsigned task_registered_buffers;
234
235 void *buf_ctx_handle;
236 spinlock_t lock;
237 } ____cacheline_aligned_in_smp;
238
239 struct ublk_queue {
240 int q_id;
241 int q_depth;
242
243 unsigned long flags;
244 struct ublksrv_io_desc *io_cmd_buf;
245
246 bool force_abort;
247 bool canceling;
248 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
249 spinlock_t cancel_lock;
250 struct ublk_device *dev;
251 u32 nr_io_ready;
252
253 /*
254 * For supporting UBLK_F_BATCH_IO only.
255 *
256	 * Inflight ublk request tags are saved in this fifo.
257	 *
258	 * There are multiple writers, from ublk_queue_rq() or ublk_queue_rqs(),
259	 * so the lock is required when storing a request tag into the fifo.
260	 *
261	 * Only one reader fetches requests from the task work function to the
262	 * ublk server, so there is no need to grab the lock on the reader side
263	 * (see the illustrative sketch after this struct).
264 *
265 * Batch I/O State Management:
266 *
267 * The batch I/O system uses implicit state management based on the
268 * combination of three key variables below.
269 *
270 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
271 * No fetch commands available, events queue in evts_fifo
272 *
273 * - READY: !list_empty(&fcmd_head) && !active_fcmd
274 * Fetch commands available but none processing events
275 *
276 * - ACTIVE: active_fcmd
277 * One fetch command actively processing events from evts_fifo
278 *
279 * Key Invariants:
280 * - At most one active_fcmd at any time (single reader)
281 * - active_fcmd is always from fcmd_head list when non-NULL
282 * - evts_fifo can be read locklessly by the single active reader
283 * - All state transitions require evts_lock protection
284 * - Multiple writers to evts_fifo require lock protection
285 */
286 struct {
287 DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
288 spinlock_t evts_lock;
289
290 /* List of fetch commands available to process events */
291 struct list_head fcmd_head;
292
293 /* Currently active fetch command (NULL = none active) */
294 struct ublk_batch_fetch_cmd *active_fcmd;
295 }____cacheline_aligned_in_smp;
296
297 struct ublk_io ios[] __counted_by(q_depth);
298 };
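
/*
 * Illustrative sketch only (hypothetical helpers, not called by the driver):
 * the locking pattern intended for evts_fifo above. Multiple producers push
 * request tags under evts_lock, while the single active fetch command drains
 * the fifo locklessly.
 */
static inline void ublk_evts_push_sketch(struct ublk_queue *ubq, unsigned short tag)
{
	spin_lock(&ubq->evts_lock);
	kfifo_in(&ubq->evts_fifo, &tag, 1);
	spin_unlock(&ubq->evts_lock);
}

static inline bool ublk_evts_pop_sketch(struct ublk_queue *ubq, unsigned short *tag)
{
	/* single reader (the active fetch command), so no locking is needed */
	return kfifo_out(&ubq->evts_fifo, tag, 1) == 1;
}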
299
300 /* Maple tree value: maps a PFN range to buffer location */
301 struct ublk_buf_range {
302 unsigned short buf_index;
303 unsigned short flags;
304 unsigned int base_offset; /* byte offset within buffer */
305 };
306
307 struct ublk_device {
308 struct gendisk *ub_disk;
309
310 struct ublksrv_ctrl_dev_info dev_info;
311
312 struct blk_mq_tag_set tag_set;
313
314 struct cdev cdev;
315 struct device cdev_dev;
316
317 #define UB_STATE_OPEN 0
318 #define UB_STATE_USED 1
319 #define UB_STATE_DELETED 2
320 unsigned long state;
321 int ub_number;
322
323 struct mutex mutex;
324
325 spinlock_t lock;
326 struct mm_struct *mm;
327
328 struct ublk_params params;
329
330 struct completion completion;
331 u32 nr_queue_ready;
332 bool unprivileged_daemons;
333 struct mutex cancel_mutex;
334 bool canceling;
335 pid_t ublksrv_tgid;
336 struct delayed_work exit_work;
337 struct work_struct partition_scan_work;
338
339 bool block_open; /* protected by open_mutex */
340
341 /* shared memory zero copy */
342 struct maple_tree buf_tree;
343 struct ida buf_ida;
344
345 struct ublk_queue *queues[];
346 };
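
/*
 * Illustrative sketch (hypothetical helper): buf_tree above maps PFN ranges
 * to struct ublk_buf_range entries for shmem zero copy. A lookup by PFN might
 * look like this; the actual matching logic lives in ublk_try_buf_match(),
 * declared below and defined later in this file.
 */
static inline struct ublk_buf_range *ublk_buf_range_lookup_sketch(struct ublk_device *ub,
								  unsigned long pfn)
{
	/* returns the value stored for the range covering 'pfn', or NULL */
	return mtree_load(&ub->buf_tree, pfn);
}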
347
348 /* header of ublk_params */
349 struct ublk_params_header {
350 __u32 len;
351 __u32 types;
352 };
353
354 static void ublk_io_release(void *priv);
355 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
356 static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
357 u32 *buf_idx, u32 *buf_off);
358 static void ublk_buf_cleanup(struct ublk_device *ub);
359 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
360 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
361 u16 q_id, u16 tag, struct ublk_io *io);
362 static inline unsigned int ublk_req_build_flags(struct request *req);
363 static void ublk_batch_dispatch(struct ublk_queue *ubq,
364 const struct ublk_batch_io_data *data,
365 struct ublk_batch_fetch_cmd *fcmd);
366
367 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
368 {
369 return ub->dev_info.flags & UBLK_F_BATCH_IO;
370 }
371
372 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
373 {
374 return ubq->flags & UBLK_F_BATCH_IO;
375 }
376
377 static inline void ublk_io_lock(struct ublk_io *io)
378 {
379 spin_lock(&io->lock);
380 }
381
382 static inline void ublk_io_unlock(struct ublk_io *io)
383 {
384 spin_unlock(&io->lock);
385 }
386
387 /* Initialize the event queue */
388 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
389 int numa_node)
390 {
391 spin_lock_init(&q->evts_lock);
392 return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
393 }
394
395 /* Check if event queue is empty */
396 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
397 {
398 return kfifo_is_empty(&q->evts_fifo);
399 }
400
401 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
402 {
403 WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
404 kfifo_free(&q->evts_fifo);
405 }
406
407 static inline struct ublksrv_io_desc *
408 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
409 {
410 return &ubq->io_cmd_buf[tag];
411 }
412
413 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
414 {
415 return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
416 }
417
418 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
419 {
420 return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
421 }
422
423 static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
424 {
425 return ubq->flags & UBLK_F_SHMEM_ZC;
426 }
427
428 static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
429 unsigned int tag)
430 {
431 return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
432 }
433
434 static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
435 {
436 return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
437 }
438
439 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
440 {
441 return ubq->flags & UBLK_F_AUTO_BUF_REG;
442 }
443
444 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
445 {
446 return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
447 }
448
449 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
450 {
451 return ubq->flags & UBLK_F_USER_COPY;
452 }
453
454 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
455 {
456 return ub->dev_info.flags & UBLK_F_USER_COPY;
457 }
458
459 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
460 {
461 return ub->dev_info.flags & UBLK_F_ZONED;
462 }
463
464 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
465 {
466 return ubq->flags & UBLK_F_ZONED;
467 }
468
469 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
470 {
471 return ub->dev_info.flags & UBLK_F_INTEGRITY;
472 }
473
474 #ifdef CONFIG_BLK_DEV_ZONED
475
476 struct ublk_zoned_report_desc {
477 __u64 sector;
478 __u32 operation;
479 __u32 nr_zones;
480 };
481
482 static DEFINE_XARRAY(ublk_zoned_report_descs);
483
484 static int ublk_zoned_insert_report_desc(const struct request *req,
485 struct ublk_zoned_report_desc *desc)
486 {
487 return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
488 desc, GFP_KERNEL);
489 }
490
491 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
492 const struct request *req)
493 {
494 return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
495 }
496
497 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
498 const struct request *req)
499 {
500 return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
501 }
502
503 static int ublk_get_nr_zones(const struct ublk_device *ub)
504 {
505 const struct ublk_param_basic *p = &ub->params.basic;
506
507 /* Zone size is a power of 2 */
508 return p->dev_sectors >> ilog2(p->chunk_sectors);
509 }
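
/*
 * Worked example for the calculation above (hypothetical values, never
 * called): a 16 GiB zoned device with 64 MiB zones.
 */
static inline unsigned int ublk_get_nr_zones_sketch(void)
{
	const __u64 dev_sectors = 1ULL << 25;	/* 16 GiB in 512-byte sectors */
	const __u32 chunk_sectors = 1U << 17;	/* 64 MiB zone size in sectors */

	/* (1 << 25) >> ilog2(1 << 17) == 256 zones */
	return dev_sectors >> ilog2(chunk_sectors);
}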
510
511 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
512 {
513 return blk_revalidate_disk_zones(ub->ub_disk);
514 }
515
516 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
517 {
518 const struct ublk_param_zoned *p = &ub->params.zoned;
519 int nr_zones;
520
521 if (!ublk_dev_is_zoned(ub))
522 return -EINVAL;
523
524 if (!p->max_zone_append_sectors)
525 return -EINVAL;
526
527 nr_zones = ublk_get_nr_zones(ub);
528
529 if (p->max_active_zones > nr_zones)
530 return -EINVAL;
531
532 if (p->max_open_zones > nr_zones)
533 return -EINVAL;
534
535 return 0;
536 }
537
538 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
539 {
540 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
541 }
542
543 /* Based on virtblk_alloc_report_buffer */
544 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
545 unsigned int nr_zones, size_t *buflen)
546 {
547 struct request_queue *q = ublk->ub_disk->queue;
548 size_t bufsize;
549 void *buf;
550
551 nr_zones = min_t(unsigned int, nr_zones,
552 ublk->ub_disk->nr_zones);
553
554 bufsize = nr_zones * sizeof(struct blk_zone);
555 bufsize =
556 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
557
558 while (bufsize >= sizeof(struct blk_zone)) {
559 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
560 if (buf) {
561 *buflen = bufsize;
562 return buf;
563 }
564 bufsize >>= 1;
565 }
566
567 *buflen = 0;
568 return NULL;
569 }
570
571 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
572 unsigned int nr_zones, struct blk_report_zones_args *args)
573 {
574 struct ublk_device *ub = disk->private_data;
575 unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
576 unsigned int first_zone = sector >> ilog2(zone_size_sectors);
577 unsigned int done_zones = 0;
578 unsigned int max_zones_per_request;
579 int ret;
580 struct blk_zone *buffer;
581 size_t buffer_length;
582
583 nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
584 nr_zones);
585
586 buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
587 if (!buffer)
588 return -ENOMEM;
589
590 max_zones_per_request = buffer_length / sizeof(struct blk_zone);
591
592 while (done_zones < nr_zones) {
593 unsigned int remaining_zones = nr_zones - done_zones;
594 unsigned int zones_in_request =
595 min_t(unsigned int, remaining_zones, max_zones_per_request);
596 struct request *req;
597 struct ublk_zoned_report_desc desc;
598 blk_status_t status;
599
600 memset(buffer, 0, buffer_length);
601
602 req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
603 if (IS_ERR(req)) {
604 ret = PTR_ERR(req);
605 goto out;
606 }
607
608 desc.operation = UBLK_IO_OP_REPORT_ZONES;
609 desc.sector = sector;
610 desc.nr_zones = zones_in_request;
611 ret = ublk_zoned_insert_report_desc(req, &desc);
612 if (ret)
613 goto free_req;
614
615 ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
616 if (ret)
617 goto erase_desc;
618
619 status = blk_execute_rq(req, 0);
620 ret = blk_status_to_errno(status);
621 erase_desc:
622 ublk_zoned_erase_report_desc(req);
623 free_req:
624 blk_mq_free_request(req);
625 if (ret)
626 goto out;
627
628 for (unsigned int i = 0; i < zones_in_request; i++) {
629 struct blk_zone *zone = buffer + i;
630
631 /* A zero length zone means no more zones in this response */
632 if (!zone->len)
633 break;
634
635 ret = disk_report_zone(disk, zone, i, args);
636 if (ret)
637 goto out;
638
639 done_zones++;
640 sector += zone_size_sectors;
641
642 }
643 }
644
645 ret = done_zones;
646
647 out:
648 kvfree(buffer);
649 return ret;
650 }
651
652 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
653 struct request *req)
654 {
655 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
656 struct ublk_io *io = &ubq->ios[req->tag];
657 struct ublk_zoned_report_desc *desc;
658 u32 ublk_op;
659
660 switch (req_op(req)) {
661 case REQ_OP_ZONE_OPEN:
662 ublk_op = UBLK_IO_OP_ZONE_OPEN;
663 break;
664 case REQ_OP_ZONE_CLOSE:
665 ublk_op = UBLK_IO_OP_ZONE_CLOSE;
666 break;
667 case REQ_OP_ZONE_FINISH:
668 ublk_op = UBLK_IO_OP_ZONE_FINISH;
669 break;
670 case REQ_OP_ZONE_RESET:
671 ublk_op = UBLK_IO_OP_ZONE_RESET;
672 break;
673 case REQ_OP_ZONE_APPEND:
674 ublk_op = UBLK_IO_OP_ZONE_APPEND;
675 break;
676 case REQ_OP_ZONE_RESET_ALL:
677 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
678 break;
679 case REQ_OP_DRV_IN:
680 desc = ublk_zoned_get_report_desc(req);
681 if (!desc)
682 return BLK_STS_IOERR;
683 ublk_op = desc->operation;
684 switch (ublk_op) {
685 case UBLK_IO_OP_REPORT_ZONES:
686 iod->op_flags = ublk_op | ublk_req_build_flags(req);
687 iod->nr_zones = desc->nr_zones;
688 iod->start_sector = desc->sector;
689 return BLK_STS_OK;
690 default:
691 return BLK_STS_IOERR;
692 }
693 case REQ_OP_DRV_OUT:
694 /* We do not support drv_out */
695 return BLK_STS_NOTSUPP;
696 default:
697 return BLK_STS_IOERR;
698 }
699
700 iod->op_flags = ublk_op | ublk_req_build_flags(req);
701 iod->nr_sectors = blk_rq_sectors(req);
702 iod->start_sector = blk_rq_pos(req);
703 iod->addr = io->buf.addr;
704
705 return BLK_STS_OK;
706 }
707
708 #else
709
710 #define ublk_report_zones (NULL)
711
712 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
713 {
714 return -EOPNOTSUPP;
715 }
716
717 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
718 {
719 }
720
721 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
722 {
723 return 0;
724 }
725
726 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
727 struct request *req)
728 {
729 return BLK_STS_NOTSUPP;
730 }
731
732 #endif
733
734 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
735 bool need_map, struct io_comp_batch *iob);
736
737 static dev_t ublk_chr_devt;
738 static const struct class ublk_chr_class = {
739 .name = "ublk-char",
740 };
741
742 static DEFINE_IDR(ublk_index_idr);
743 static DEFINE_SPINLOCK(ublk_idr_lock);
744 static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
745
746 static DEFINE_MUTEX(ublk_ctl_mutex);
747
748 static struct ublk_batch_fetch_cmd *
749 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
750 {
751 struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
752
753 if (fcmd) {
754 fcmd->cmd = cmd;
755 fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
756 }
757 return fcmd;
758 }
759
760 static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
761 {
762 kfree(fcmd);
763 }
764
765 static void __ublk_release_fcmd(struct ublk_queue *ubq)
766 {
767 WRITE_ONCE(ubq->active_fcmd, NULL);
768 }
769
770 /*
771 * Nothing can move on, so clear ->active_fcmd, and the caller should stop
772 * dispatching
773 */
774 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
775 const struct ublk_batch_io_data *data,
776 struct ublk_batch_fetch_cmd *fcmd,
777 int res)
778 {
779 spin_lock(&ubq->evts_lock);
780 list_del_init(&fcmd->node);
781 WARN_ON_ONCE(fcmd != ubq->active_fcmd);
782 __ublk_release_fcmd(ubq);
783 spin_unlock(&ubq->evts_lock);
784
785 io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
786 ublk_batch_free_fcmd(fcmd);
787 }
788
789 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
790 struct io_br_sel *sel,
791 unsigned int issue_flags)
792 {
793 if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
794 return -ENOBUFS;
795 return 0;
796 }
797
798 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
799 void __user *buf, const u16 *tag_buf,
800 unsigned int len)
801 {
802 if (copy_to_user(buf, tag_buf, len))
803 return -EFAULT;
804 return len;
805 }
806
807 #define UBLK_MAX_UBLKS UBLK_MINORS
808
809 /*
810 * Maximum number of unprivileged ublk devices that may be added
811 *
812 * This can be extended to a per-user limit in the future, or even controlled
813 * by cgroup.
814 */
815 static unsigned int unprivileged_ublks_max = 64;
816 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
817
818 static struct miscdevice ublk_misc;
819
820 static inline unsigned ublk_pos_to_hwq(loff_t pos)
821 {
822 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
823 UBLK_QID_BITS_MASK;
824 }
825
826 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
827 {
828 return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
829 }
830
831 static inline unsigned ublk_pos_to_tag(loff_t pos)
832 {
833 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
834 UBLK_TAG_BITS_MASK;
835 }
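
/*
 * Illustrative sketch (hypothetical helper, not used by the driver): the
 * inverse of the three decoders above, i.e. how a ublk server would build
 * the pread()/pwrite() offset used for user copy of a given queue/tag.
 */
static inline loff_t ublk_user_copy_pos_sketch(unsigned int q_id, unsigned int tag,
					       unsigned int buf_off)
{
	return UBLKSRV_IO_BUF_OFFSET +
		((loff_t)q_id << UBLK_QID_OFF) +
		((loff_t)tag << UBLK_TAG_OFF) +
		buf_off;
}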
836
837 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
838 {
839 const struct ublk_param_basic *p = &ub->params.basic;
840
841 if (p->attrs & UBLK_ATTR_READ_ONLY)
842 set_disk_ro(ub->ub_disk, true);
843
844 set_capacity(ub->ub_disk, p->dev_sectors);
845 }
846
847 static int ublk_integrity_flags(u32 flags)
848 {
849 int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
850
851 if (flags & LBMD_PI_CAP_INTEGRITY) {
852 flags &= ~LBMD_PI_CAP_INTEGRITY;
853 ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
854 }
855 if (flags & LBMD_PI_CAP_REFTAG) {
856 flags &= ~LBMD_PI_CAP_REFTAG;
857 ret_flags |= BLK_INTEGRITY_REF_TAG;
858 }
859 return flags ? -EINVAL : ret_flags;
860 }
861
862 static int ublk_integrity_pi_tuple_size(u8 csum_type)
863 {
864 switch (csum_type) {
865 case LBMD_PI_CSUM_NONE:
866 return 0;
867 case LBMD_PI_CSUM_IP:
868 case LBMD_PI_CSUM_CRC16_T10DIF:
869 return 8;
870 case LBMD_PI_CSUM_CRC64_NVME:
871 return 16;
872 default:
873 return -EINVAL;
874 }
875 }
876
877 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
878 {
879 switch (csum_type) {
880 case LBMD_PI_CSUM_NONE:
881 return BLK_INTEGRITY_CSUM_NONE;
882 case LBMD_PI_CSUM_IP:
883 return BLK_INTEGRITY_CSUM_IP;
884 case LBMD_PI_CSUM_CRC16_T10DIF:
885 return BLK_INTEGRITY_CSUM_CRC;
886 case LBMD_PI_CSUM_CRC64_NVME:
887 return BLK_INTEGRITY_CSUM_CRC64;
888 default:
889 WARN_ON_ONCE(1);
890 return BLK_INTEGRITY_CSUM_NONE;
891 }
892 }
893
894 static int ublk_validate_params(const struct ublk_device *ub)
895 {
896 /* basic param is the only one which must be set */
897 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
898 const struct ublk_param_basic *p = &ub->params.basic;
899
900 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
901 return -EINVAL;
902
903 /*
904 * 256M is a reasonable upper bound for physical block size,
905 * io_min and io_opt; it aligns with the maximum physical
906 * block size possible in NVMe.
907 */
908 if (p->physical_bs_shift > ilog2(SZ_256M))
909 return -EINVAL;
910
911 if (p->io_min_shift > ilog2(SZ_256M))
912 return -EINVAL;
913
914 if (p->io_opt_shift > ilog2(SZ_256M))
915 return -EINVAL;
916
917 if (p->logical_bs_shift > p->physical_bs_shift)
918 return -EINVAL;
919
920 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
921 return -EINVAL;
922
923 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
924 return -EINVAL;
925 } else
926 return -EINVAL;
927
928 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
929 const struct ublk_param_discard *p = &ub->params.discard;
930
931 /* So far, only support single segment discard */
932 if (p->max_discard_sectors && p->max_discard_segments != 1)
933 return -EINVAL;
934
935 if (!p->discard_granularity)
936 return -EINVAL;
937 }
938
939 /* dev_t is read-only */
940 if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
941 return -EINVAL;
942
943 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
944 return ublk_dev_param_zoned_validate(ub);
945 else if (ublk_dev_is_zoned(ub))
946 return -EINVAL;
947
948 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
949 const struct ublk_param_dma_align *p = &ub->params.dma;
950
951 if (p->alignment >= PAGE_SIZE)
952 return -EINVAL;
953
954 if (!is_power_of_2(p->alignment + 1))
955 return -EINVAL;
956 }
957
958 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
959 const struct ublk_param_segment *p = &ub->params.seg;
960
961 if (!is_power_of_2(p->seg_boundary_mask + 1))
962 return -EINVAL;
963
964 if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
965 return -EINVAL;
966 if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
967 return -EINVAL;
968 }
969
970 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
971 const struct ublk_param_integrity *p = &ub->params.integrity;
972 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
973 int flags = ublk_integrity_flags(p->flags);
974
975 if (!ublk_dev_support_integrity(ub))
976 return -EINVAL;
977 if (flags < 0)
978 return flags;
979 if (pi_tuple_size < 0)
980 return pi_tuple_size;
981 if (!p->metadata_size)
982 return -EINVAL;
983 if (p->csum_type == LBMD_PI_CSUM_NONE &&
984 p->flags & LBMD_PI_CAP_REFTAG)
985 return -EINVAL;
986 if (p->pi_offset + pi_tuple_size > p->metadata_size)
987 return -EINVAL;
988 if (p->interval_exp < SECTOR_SHIFT ||
989 p->interval_exp > ub->params.basic.logical_bs_shift)
990 return -EINVAL;
991 }
992
993 return 0;
994 }
995
996 static void ublk_apply_params(struct ublk_device *ub)
997 {
998 ublk_dev_param_basic_apply(ub);
999
1000 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
1001 ublk_dev_param_zoned_apply(ub);
1002 }
1003
1004 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
1005 {
1006 return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
1007 !ublk_support_auto_buf_reg(ubq);
1008 }
1009
1010 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
1011 {
1012 return !ublk_dev_support_user_copy(ub) &&
1013 !ublk_dev_support_zero_copy(ub) &&
1014 !ublk_dev_support_auto_buf_reg(ub);
1015 }
1016
1017 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
1018 {
1019 /*
1020	 * read()/write() is involved in user copy, so a request reference
1021	 * has to be grabbed.
1022	 *
1023	 * For zero copy, the request buffer needs to be registered in the
1024	 * io_uring buffer table, so a reference is needed.
1025	 *
1026	 * For auto buffer registration, the ublk server may still issue
1027	 * UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
1028	 * so a reference is required too.
1029 */
1030 return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
1031 ublk_support_auto_buf_reg(ubq);
1032 }
1033
1034 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
1035 {
1036 return ublk_dev_support_user_copy(ub) ||
1037 ublk_dev_support_zero_copy(ub) ||
1038 ublk_dev_support_auto_buf_reg(ub);
1039 }
1040
1041 /*
1042 * ublk IO Reference Counting Design
1043 * ==================================
1044 *
1045 * For user-copy and zero-copy modes, ublk uses a split reference model with
1046 * two counters that together track IO lifetime:
1047 *
1048 * - io->ref: refcount for off-task buffer registrations and user-copy ops
1049 * - io->task_registered_buffers: count of buffers registered on the IO task
1050 *
1051 * Key Invariant:
1052 * --------------
1053 * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1054 * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1055 * when no active references exist. After IO completion, both counters become
1056 * zero. For I/Os not currently dispatched to the ublk server, both ref and
1057 * task_registered_buffers are 0.
1058 *
1059 * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1060 * exit to determine if all references have been released.
1061 *
1062 * Why Split Counters:
1063 * -------------------
1064 * Buffers registered on the IO daemon task can use the lightweight
1065 * task_registered_buffers counter (simple increment/decrement) instead of
1066 * atomic refcount operations. The ublk_io_release() callback checks if
1067 * current == io->task to decide which counter to update.
1068 *
1069 * This optimization only applies before IO completion. At completion,
1070 * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1071 * After that, all subsequent buffer unregistrations must use the atomic ref
1072 * since they may be releasing the last reference.
1073 *
1074 * Reference Lifecycle:
1075 * --------------------
1076 * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1077 *
1078 * 2. During IO processing:
1079 * - On-task buffer reg: task_registered_buffers++ (no ref change)
1080 * - Off-task buffer reg: ref++ via ublk_get_req_ref()
1081 * - Buffer unregister callback (ublk_io_release):
1082 * * If on-task: task_registered_buffers--
1083 * * If off-task: ref-- via ublk_put_req_ref()
1084 *
1085 * 3. ublk_sub_req_ref() at IO completion:
1086 * - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1087 * - Subtracts sub_refs from ref and zeroes task_registered_buffers
1088 * - This effectively collapses task_registered_buffers into the atomic ref,
1089 * accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1090 * buffers that were already counted
1091 *
1092 * Example (zero-copy, register on-task, unregister off-task):
1093 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1094 * - Register buffer on-task: task_registered_buffers = 1
1095 * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1096 * - Completion via ublk_sub_req_ref():
1097 * sub_refs = UBLK_REFCOUNT_INIT - 1,
1098 * ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1099 *
1100 * Example (auto buffer registration):
1101 * Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1102 *
1103 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1104 * - Buffer unregister: task_registered_buffers-- (becomes 0)
1105 * - Completion via ublk_sub_req_ref():
1106 * sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1107 *
1108 * Example (zero-copy, ublk server killed):
1109 * When daemon is killed, io_uring cleanup unregisters buffers off-task.
1110 * ublk_check_and_reset_active_ref() waits for the invariant to hold.
1111 *
1112 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1113 * - Register buffer on-task: task_registered_buffers = 1
1114 * - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1115 * ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1116 * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1117 * - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by
1118 * ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1119 * and abort pending requests
1120 *
1121 * Batch IO Special Case:
1122 * ----------------------
1123 * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1124 * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1125 * task_registered_buffers counter still tracks registered buffers for the
1126 * invariant check, even though the callback doesn't decrement it.
1127 *
1128 * Note: updating task_registered_buffers is protected by io->lock.
1129 */
1130 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1131 struct ublk_io *io)
1132 {
1133 if (ublk_need_req_ref(ubq))
1134 refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1135 }
1136
1137 static inline bool ublk_get_req_ref(struct ublk_io *io)
1138 {
1139 return refcount_inc_not_zero(&io->ref);
1140 }
1141
1142 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1143 {
1144 if (!refcount_dec_and_test(&io->ref))
1145 return;
1146
1147 /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1148 __ublk_complete_rq(req, io, false, NULL);
1149 }
1150
1151 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1152 {
1153 unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1154
1155 io->task_registered_buffers = 0;
1156 return refcount_sub_and_test(sub_refs, &io->ref);
1157 }
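
/*
 * Illustrative sketch (hypothetical, never called): the split-counter
 * collapse described in the design comment above, following its zero-copy
 * example of one buffer registered on-task and unregistered off-task.
 */
static inline bool ublk_split_ref_sketch(struct ublk_io *io)
{
	/* dispatch: ref = UBLK_REFCOUNT_INIT, no task-registered buffers */
	refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
	io->task_registered_buffers = 0;

	/* register a buffer on the io task: cheap non-atomic counter */
	io->task_registered_buffers++;

	/* unregister off-task: atomic drop on the shared refcount */
	refcount_dec(&io->ref);

	/* completion: collapse task_registered_buffers into the refcount */
	return ublk_sub_req_ref(io);	/* true: all references dropped */
}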
1158
1159 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1160 {
1161 return ubq->flags & UBLK_F_NEED_GET_DATA;
1162 }
1163
1164 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1165 {
1166 return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1167 }
1168
1169 /* Called in slow path only, keep it noinline for trace purpose */
1170 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1171 {
1172 if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1173 return ub;
1174 return NULL;
1175 }
1176
1177 /* Called in slow path only, keep it noinline for trace purpose */
1178 static noinline void ublk_put_device(struct ublk_device *ub)
1179 {
1180 put_device(&ub->cdev_dev);
1181 }
1182
1183 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1184 int qid)
1185 {
1186 return dev->queues[qid];
1187 }
1188
1189 static inline bool ublk_rq_has_data(const struct request *rq)
1190 {
1191 return bio_has_data(rq->bio);
1192 }
1193
1194 static inline struct ublksrv_io_desc *
1195 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1196 {
1197 return ublk_get_queue(ub, q_id)->io_cmd_buf;
1198 }
1199
1200 static inline int __ublk_queue_cmd_buf_size(int depth)
1201 {
1202 return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1203 }
1204
1205 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1206 {
1207 return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1208 }
1209
1210 static int ublk_max_cmd_buf_size(void)
1211 {
1212 return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1213 }
1214
1215 /*
1216 * Should I/O outstanding to the ublk server be reissued when the server
1217 * exits? If not, outstanding I/O will get errors.
1218 */
1219 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1220 {
1221 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1222 (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1223 }
1224
1225 /*
1226 * Should I/O issued while there is no ublk server be queued? If not, I/O
1227 * issued while there is no ublk server will get errors.
1228 */
1229 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1230 {
1231 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1232 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1233 }
1234
1235 /*
1236 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1237 * of the device flags for smaller cache footprint - better for fast
1238 * paths.
1239 */
1240 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1241 {
1242 return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1243 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1244 }
1245
1246 /*
1247 * Should ublk devices be stopped (i.e. no recovery possible) when the
1248 * ublk server exits? If not, devices can be used again by a future
1249 * incarnation of a ublk server via the start_recovery/end_recovery
1250 * commands.
1251 */
1252 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1253 {
1254 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1255 }
1256
1257 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1258 {
1259 return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1260 ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1261 }
1262
1263 static void ublk_free_disk(struct gendisk *disk)
1264 {
1265 struct ublk_device *ub = disk->private_data;
1266
1267 clear_bit(UB_STATE_USED, &ub->state);
1268 ublk_put_device(ub);
1269 }
1270
1271 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1272 unsigned int *owner_gid)
1273 {
1274 kuid_t uid;
1275 kgid_t gid;
1276
1277 current_uid_gid(&uid, &gid);
1278
1279 *owner_uid = from_kuid(&init_user_ns, uid);
1280 *owner_gid = from_kgid(&init_user_ns, gid);
1281 }
1282
1283 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1284 {
1285 struct ublk_device *ub = disk->private_data;
1286
1287 if (capable(CAP_SYS_ADMIN))
1288 return 0;
1289
1290 /*
1291	 * If this is an unprivileged device, only the owner can open
1292	 * the disk. Otherwise it could be a trap set by a malicious
1293	 * user who deliberately grants this disk's privileges to other
1294	 * users.
1295	 *
1296	 * This is also reasonable given that anyone can create an
1297	 * unprivileged device without needing anyone else's grant.
1298 */
1299 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1300 unsigned int curr_uid, curr_gid;
1301
1302 ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1303
1304 if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1305 ub->dev_info.owner_gid)
1306 return -EPERM;
1307 }
1308
1309 if (ub->block_open)
1310 return -ENXIO;
1311
1312 return 0;
1313 }
1314
1315 static const struct block_device_operations ub_fops = {
1316 .owner = THIS_MODULE,
1317 .open = ublk_open,
1318 .free_disk = ublk_free_disk,
1319 .report_zones = ublk_report_zones,
1320 };
1321
1322 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1323 struct iov_iter *uiter, int dir, size_t *done)
1324 {
1325 unsigned len;
1326 void *bv_buf;
1327 size_t copied;
1328
1329 if (*offset >= bv->bv_len) {
1330 *offset -= bv->bv_len;
1331 return true;
1332 }
1333
1334 len = bv->bv_len - *offset;
1335 bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1336 /*
1337 * Bio pages may originate from slab caches without a usercopy region
1338 * (e.g. jbd2 frozen metadata buffers). This is the same data that
1339 * the loop driver writes to its backing file — no exposure risk.
1340 * The bvec length is always trusted, so the size check in
1341 * check_copy_size() is not needed either. Use the unchecked
1342 * helpers to avoid false positives on slab pages.
1343 */
1344 if (dir == ITER_DEST)
1345 copied = _copy_to_iter(bv_buf, len, uiter);
1346 else
1347 copied = _copy_from_iter(bv_buf, len, uiter);
1348
1349 kunmap_local(bv_buf);
1350
1351 *done += copied;
1352 if (copied < len)
1353 return false;
1354
1355 *offset = 0;
1356 return true;
1357 }
1358
1359 /*
1360 * Copy data between request pages and io_iter, and 'offset'
1361 * is the start point of linear offset of request.
1362 */
1363 static size_t ublk_copy_user_pages(const struct request *req,
1364 unsigned offset, struct iov_iter *uiter, int dir)
1365 {
1366 struct req_iterator iter;
1367 struct bio_vec bv;
1368 size_t done = 0;
1369
1370 rq_for_each_segment(bv, req, iter) {
1371 if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1372 break;
1373 }
1374 return done;
1375 }
1376
1377 #ifdef CONFIG_BLK_DEV_INTEGRITY
1378 static size_t ublk_copy_user_integrity(const struct request *req,
1379 unsigned offset, struct iov_iter *uiter, int dir)
1380 {
1381 size_t done = 0;
1382 struct bio *bio = req->bio;
1383 struct bvec_iter iter;
1384 struct bio_vec iv;
1385
1386 if (!blk_integrity_rq(req))
1387 return 0;
1388
1389 bio_for_each_integrity_vec(iv, bio, iter) {
1390 if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1391 break;
1392 }
1393
1394 return done;
1395 }
1396 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1397 static size_t ublk_copy_user_integrity(const struct request *req,
1398 unsigned offset, struct iov_iter *uiter, int dir)
1399 {
1400 return 0;
1401 }
1402 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1403
1404 static inline bool ublk_need_map_req(const struct request *req)
1405 {
1406 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1407 }
1408
1409 static inline bool ublk_need_unmap_req(const struct request *req)
1410 {
1411 return ublk_rq_has_data(req) &&
1412 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1413 }
1414
1415 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1416 const struct request *req,
1417 const struct ublk_io *io)
1418 {
1419 const unsigned int rq_bytes = blk_rq_bytes(req);
1420
1421 if (!ublk_need_map_io(ubq))
1422 return rq_bytes;
1423
1424 /*
1425	 * No zero copy: we delay copying WRITE request data into the ublksrv
1426	 * context, and the big benefit is that pinning pages in the current
1427	 * context is pretty fast, see ublk_pin_user_pages
1428 */
1429 if (ublk_need_map_req(req)) {
1430 struct iov_iter iter;
1431 const int dir = ITER_DEST;
1432
1433 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1434 return ublk_copy_user_pages(req, 0, &iter, dir);
1435 }
1436 return rq_bytes;
1437 }
1438
1439 static unsigned int ublk_unmap_io(bool need_map,
1440 const struct request *req,
1441 const struct ublk_io *io)
1442 {
1443 const unsigned int rq_bytes = blk_rq_bytes(req);
1444
1445 if (!need_map)
1446 return rq_bytes;
1447
1448 if (ublk_need_unmap_req(req)) {
1449 struct iov_iter iter;
1450 const int dir = ITER_SOURCE;
1451
1452 WARN_ON_ONCE(io->res > rq_bytes);
1453
1454 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1455 return ublk_copy_user_pages(req, 0, &iter, dir);
1456 }
1457 return rq_bytes;
1458 }
1459
1460 static inline unsigned int ublk_req_build_flags(struct request *req)
1461 {
1462 unsigned flags = 0;
1463
1464 if (req->cmd_flags & REQ_FAILFAST_DEV)
1465 flags |= UBLK_IO_F_FAILFAST_DEV;
1466
1467 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1468 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1469
1470 if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1471 flags |= UBLK_IO_F_FAILFAST_DRIVER;
1472
1473 if (req->cmd_flags & REQ_META)
1474 flags |= UBLK_IO_F_META;
1475
1476 if (req->cmd_flags & REQ_FUA)
1477 flags |= UBLK_IO_F_FUA;
1478
1479 if (req->cmd_flags & REQ_NOUNMAP)
1480 flags |= UBLK_IO_F_NOUNMAP;
1481
1482 if (req->cmd_flags & REQ_SWAP)
1483 flags |= UBLK_IO_F_SWAP;
1484
1485 if (blk_integrity_rq(req))
1486 flags |= UBLK_IO_F_INTEGRITY;
1487
1488 return flags;
1489 }
1490
1491 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1492 {
1493 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1494 struct ublk_io *io = &ubq->ios[req->tag];
1495 u32 ublk_op;
1496
1497 switch (req_op(req)) {
1498 case REQ_OP_READ:
1499 ublk_op = UBLK_IO_OP_READ;
1500 break;
1501 case REQ_OP_WRITE:
1502 ublk_op = UBLK_IO_OP_WRITE;
1503 break;
1504 case REQ_OP_FLUSH:
1505 ublk_op = UBLK_IO_OP_FLUSH;
1506 break;
1507 case REQ_OP_DISCARD:
1508 ublk_op = UBLK_IO_OP_DISCARD;
1509 break;
1510 case REQ_OP_WRITE_ZEROES:
1511 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1512 break;
1513 default:
1514 if (ublk_queue_is_zoned(ubq))
1515 return ublk_setup_iod_zoned(ubq, req);
1516 return BLK_STS_IOERR;
1517 }
1518
1519 /* need to translate since kernel may change */
1520 iod->op_flags = ublk_op | ublk_req_build_flags(req);
1521 iod->nr_sectors = blk_rq_sectors(req);
1522 iod->start_sector = blk_rq_pos(req);
1523
1524 /* Try shmem zero-copy match before setting addr */
1525 if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
1526 u32 buf_idx, buf_off;
1527
1528 if (ublk_try_buf_match(ubq->dev, req,
1529 &buf_idx, &buf_off)) {
1530 iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
1531 iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
1532 return BLK_STS_OK;
1533 }
1534 }
1535
1536 iod->addr = io->buf.addr;
1537
1538 return BLK_STS_OK;
1539 }
1540
1541 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1542 struct io_uring_cmd *ioucmd)
1543 {
1544 return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1545 }
1546
1547 static void ublk_end_request(struct request *req, blk_status_t error)
1548 {
1549 local_bh_disable();
1550 blk_mq_end_request(req, error);
1551 local_bh_enable();
1552 }
1553
1554 /* todo: handle partial completion */
1555 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1556 bool need_map, struct io_comp_batch *iob)
1557 {
1558 unsigned int unmapped_bytes;
1559 blk_status_t res = BLK_STS_OK;
1560 bool requeue;
1561
1562 /* failed read IO if nothing is read */
1563 if (!io->res && req_op(req) == REQ_OP_READ)
1564 io->res = -EIO;
1565
1566 if (io->res < 0) {
1567 res = errno_to_blk_status(io->res);
1568 goto exit;
1569 }
1570
1571 /*
1572	 * FLUSH, DISCARD and WRITE_ZEROES usually don't return a byte count, so
1573	 * end them directly.
1574	 *
1575	 * None of them need unmapping.
1576 */
1577 if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1578 req_op(req) != REQ_OP_DRV_IN)
1579 goto exit;
1580
1581 /* shmem zero copy: no data to unmap, pages already shared */
1582 if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
1583 goto exit;
1584
1585 /* for READ request, writing data in iod->addr to rq buffers */
1586 unmapped_bytes = ublk_unmap_io(need_map, req, io);
1587
1588 /*
1589	 * Extremely unlikely since the data was filled in just before.
1590	 *
1591	 * Simply adjust io->res for this unlikely case.
1592 */
1593 if (unlikely(unmapped_bytes < io->res))
1594 io->res = unmapped_bytes;
1595
1596 /*
1597 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1598 * happens off this path, then that will prevent ublk's blkdev_release()
1599 * from being called on current's task work, see fput() implementation.
1600 *
1601 * Otherwise, ublk server may not provide forward progress in case of
1602	 * reading the partition table from bdev_open() with disk->open_mutex
1603	 * held, causing a deadlock since we could already be holding
1604	 * disk->open_mutex here.
1605 *
1606 * Preferably we would not be doing IO with a mutex held that is also
1607 * used for release, but this work-around will suffice for now.
1608 */
1609 local_bh_disable();
1610 requeue = blk_update_request(req, BLK_STS_OK, io->res);
1611 local_bh_enable();
1612 if (requeue)
1613 blk_mq_requeue_request(req, true);
1614 else if (likely(!blk_should_fake_timeout(req->q))) {
1615 if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1616 return;
1617 __blk_mq_end_request(req, BLK_STS_OK);
1618 }
1619
1620 return;
1621 exit:
1622 ublk_end_request(req, res);
1623 }
1624
1625 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1626 struct request *req)
1627 {
1628 /* read cmd first because req will overwrite it */
1629 struct io_uring_cmd *cmd = io->cmd;
1630
1631 /* mark this cmd owned by ublksrv */
1632 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1633
1634 /*
1635 * clear ACTIVE since we are done with this sqe/cmd slot
1636 * We can only accept io cmd in case of being not active.
1637 */
1638 io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1639
1640 io->req = req;
1641 return cmd;
1642 }
1643
1644 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1645 int res, unsigned issue_flags)
1646 {
1647 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1648
1649 /* tell ublksrv one io request is coming */
1650 io_uring_cmd_done(cmd, res, issue_flags);
1651 }
1652
1653 #define UBLK_REQUEUE_DELAY_MS 3
1654
1655 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1656 struct request *rq)
1657 {
1658 /* We cannot process this rq so just requeue it. */
1659 if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1660 blk_mq_requeue_request(rq, false);
1661 else
1662 ublk_end_request(rq, BLK_STS_IOERR);
1663 }
1664
1665 static void
1666 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1667 {
1668 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1669
1670 iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1671 }
1672
1673 enum auto_buf_reg_res {
1674 AUTO_BUF_REG_FAIL,
1675 AUTO_BUF_REG_FALLBACK,
1676 AUTO_BUF_REG_OK,
1677 };
1678
1679 /*
1680 * Setup io state after auto buffer registration.
1681 *
1682 * Must be called after ublk_auto_buf_register() is done.
1683 * Caller must hold io->lock in batch context.
1684 */
1685 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1686 struct request *req, struct ublk_io *io,
1687 struct io_uring_cmd *cmd,
1688 enum auto_buf_reg_res res)
1689 {
1690 if (res == AUTO_BUF_REG_OK) {
1691 io->task_registered_buffers = 1;
1692 io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1693 io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1694 }
1695 ublk_init_req_ref(ubq, io);
1696 __ublk_prep_compl_io_cmd(io, req);
1697 }
1698
1699 /* Register request bvec to io_uring for auto buffer registration. */
1700 static enum auto_buf_reg_res
1701 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1702 struct ublk_io *io, struct io_uring_cmd *cmd,
1703 unsigned int issue_flags)
1704 {
1705 int ret;
1706
1707 ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1708 io->buf.auto_reg.index, issue_flags);
1709 if (ret) {
1710 if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1711 ublk_auto_buf_reg_fallback(ubq, req->tag);
1712 return AUTO_BUF_REG_FALLBACK;
1713 }
1714 ublk_end_request(req, BLK_STS_IOERR);
1715 return AUTO_BUF_REG_FAIL;
1716 }
1717
1718 return AUTO_BUF_REG_OK;
1719 }
1720
1721 /*
1722 * Dispatch IO to userspace with auto buffer registration.
1723 *
1724 * Only called in non-batch context from task work, io->lock not held.
1725 */
1726 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1727 struct request *req, struct ublk_io *io,
1728 struct io_uring_cmd *cmd,
1729 unsigned int issue_flags)
1730 {
1731 enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1732 issue_flags);
1733
1734 if (res != AUTO_BUF_REG_FAIL) {
1735 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1736 io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1737 }
1738 }
1739
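/*
 * Prepare a request for handing off to the ublk server.
 *
 * Shmem zero-copy requests need no copy since their pages are already
 * shared.  Otherwise map/copy the request data via ublk_map_io(); if
 * nothing could be mapped (e.g. under OOM) the request is requeued and
 * retried later, and if only part of it mapped, nr_sectors in the io
 * descriptor is shrunk accordingly.  Returns false when the request was
 * requeued and must not be dispatched now.
 */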
1740 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1741 struct ublk_io *io)
1742 {
1743 unsigned mapped_bytes;
1744
1745 /* shmem zero copy: skip data copy, pages already shared */
1746 if (ublk_iod_is_shmem_zc(ubq, req->tag))
1747 return true;
1748
1749 mapped_bytes = ublk_map_io(ubq, req, io);
1750
1751 /* partially mapped, update io descriptor */
1752 if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1753 /*
1754 * Nothing mapped, retry until we succeed.
1755 *
1756 * We may never succeed in mapping any bytes here because
1757 * of OOM. TODO: reserve one buffer with a single page pinned
1758 * to provide a forward-progress guarantee.
1759 */
1760 if (unlikely(!mapped_bytes)) {
1761 blk_mq_requeue_request(req, false);
1762 blk_mq_delay_kick_requeue_list(req->q,
1763 UBLK_REQUEUE_DELAY_MS);
1764 return false;
1765 }
1766
1767 ublk_get_iod(ubq, req->tag)->nr_sectors =
1768 mapped_bytes >> 9;
1769 }
1770
1771 return true;
1772 }
1773
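/*
 * Dispatch one request to the ublk server from task work (non-batch path):
 * abort it if the daemon task is exiting, ask the server for a data buffer
 * via UBLK_IO_RES_NEED_GET_DATA when required, otherwise start the IO and
 * complete the pending fetch uring_cmd with UBLK_IO_RES_OK (registering
 * the request buffer first in auto-buffer-register mode).
 */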
1774 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1775 {
1776 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1777 int tag = req->tag;
1778 struct ublk_io *io = &ubq->ios[tag];
1779
1780 pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1781 __func__, ubq->q_id, req->tag, io->flags,
1782 ublk_get_iod(ubq, req->tag)->addr);
1783
1784 /*
1785 * Task is exiting if either:
1786 *
1787 * (1) current != io->task.
1788 * io_uring_cmd_complete_in_task() tries to run task_work
1789 * in a workqueue if cmd's task is PF_EXITING.
1790 *
1791 * (2) current->flags & PF_EXITING.
1792 */
1793 if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1794 __ublk_abort_rq(ubq, req);
1795 return;
1796 }
1797
1798 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1799 /*
1800 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1801 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1802 * and notify it.
1803 */
1804 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1805 pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1806 __func__, ubq->q_id, req->tag, io->flags);
1807 ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1808 issue_flags);
1809 return;
1810 }
1811
1812 if (!ublk_start_io(ubq, req, io))
1813 return;
1814
1815 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1816 ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1817 } else {
1818 ublk_init_req_ref(ubq, io);
1819 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1820 }
1821 }
1822
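/*
 * Prepare one tagged request for batch delivery: start the IO, register
 * the request buffer when auto buffer registration is enabled, then set
 * up the per-io completion state under io->lock.  Returns false if the
 * request was requeued or buffer registration failed, in which case the
 * tag must not be reported to userspace.
 */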
1823 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1824 const struct ublk_batch_io_data *data,
1825 unsigned short tag)
1826 {
1827 struct ublk_device *ub = data->ub;
1828 struct ublk_io *io = &ubq->ios[tag];
1829 struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1830 enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1831 struct io_uring_cmd *cmd = data->cmd;
1832
1833 if (!ublk_start_io(ubq, req, io))
1834 return false;
1835
1836 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1837 res = ublk_auto_buf_register(ubq, req, io, cmd,
1838 data->issue_flags);
1839
1840 if (res == AUTO_BUF_REG_FAIL)
1841 return false;
1842 }
1843
1844 ublk_io_lock(io);
1845 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1846 ublk_io_unlock(io);
1847
1848 return true;
1849 }
1850
1851 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1852 const struct ublk_batch_io_data *data,
1853 unsigned short *tag_buf,
1854 unsigned int len)
1855 {
1856 bool has_unused = false;
1857 unsigned int i;
1858
1859 for (i = 0; i < len; i++) {
1860 unsigned short tag = tag_buf[i];
1861
1862 if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1863 tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1864 has_unused = true;
1865 }
1866 }
1867
1868 return has_unused;
1869 }
1870
1871 /*
1872 * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1873 * Returns the new length after filtering.
1874 */
1875 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1876 unsigned int len)
1877 {
1878 unsigned int i, j;
1879
1880 for (i = 0, j = 0; i < len; i++) {
1881 if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1882 if (i != j)
1883 tag_buf[j] = tag_buf[i];
1884 j++;
1885 }
1886 }
1887
1888 return j;
1889 }
1890
1891 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1892 const struct ublk_batch_io_data *data,
1893 unsigned short *tag_buf, size_t len, int ret)
1894 {
1895 int i, res;
1896
1897 /*
1898 * Undo prep state for all IOs since userspace never received them.
1899 * This restores IOs to pre-prepared state so they can be cleanly
1900 * re-prepared when tags are pulled from FIFO again.
1901 */
1902 for (i = 0; i < len; i++) {
1903 struct ublk_io *io = &ubq->ios[tag_buf[i]];
1904 int index = -1;
1905
1906 ublk_io_lock(io);
1907 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1908 index = io->buf.auto_reg.index;
1909 io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1910 io->flags |= UBLK_IO_FLAG_ACTIVE;
1911 ublk_io_unlock(io);
1912
1913 if (index != -1)
1914 io_buffer_unregister_bvec(data->cmd, index,
1915 data->issue_flags);
1916 }
1917
1918 res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1919 tag_buf, len, &ubq->evts_lock);
1920
1921 pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1922 "tags(%d %zu) ret %d\n", __func__, res, len,
1923 ret);
1924 }
1925
1926 #define MAX_NR_TAG 128
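/*
 * Deliver one batch of ready tags to the ublk server: select a buffer from
 * the fetch command's provided-buffer group, pull up to MAX_NR_TAG tags
 * from the event kfifo, prepare each request, then copy the surviving tags
 * into the buffer and post a multishot CQE.
 *
 * Illustrative view from the server side (a sketch, not defined here): the
 * selected buffer ends up holding a packed array of __u16 tags, so the
 * number of delivered tags is the reported byte count divided by
 * sizeof(__u16).
 */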
1927 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1928 const struct ublk_batch_io_data *data,
1929 struct ublk_batch_fetch_cmd *fcmd)
1930 {
1931 const unsigned int tag_sz = sizeof(unsigned short);
1932 unsigned short tag_buf[MAX_NR_TAG];
1933 struct io_br_sel sel;
1934 size_t len = 0;
1935 bool needs_filter;
1936 int ret;
1937
1938 WARN_ON_ONCE(data->cmd != fcmd->cmd);
1939
1940 sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1941 data->issue_flags);
1942 if (sel.val < 0)
1943 return sel.val;
1944 if (!sel.addr)
1945 return -ENOBUFS;
1946
1947 /* single reader, so no locking needed; each kfifo element is 2 bytes */
1948 len = min(len, sizeof(tag_buf)) / tag_sz;
1949 len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1950
1951 needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1952 /* Filter out unused tags before posting to userspace */
1953 if (unlikely(needs_filter)) {
1954 int new_len = ublk_filter_unused_tags(tag_buf, len);
1955
1956 /* return the number of consumed tags even if all failed or were requeued */
1957 if (!new_len) {
1958 /* release the selected buffer */
1959 sel.val = 0;
1960 WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1961 &sel, data->issue_flags));
1962 return len;
1963 }
1964 len = new_len;
1965 }
1966
1967 sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1968 ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1969 if (unlikely(ret < 0))
1970 ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1971 return ret;
1972 }
1973
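/*
 * Pick the next batch fetch command to dispatch queued tags on.  If another
 * fetch command is already active, return NULL; the active one will observe
 * the newly queued tags thanks to the barrier pairing below.  Otherwise the
 * first queued fetch command, if any, is marked active and returned.
 */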
1974 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1975 struct ublk_queue *ubq)
1976 {
1977 struct ublk_batch_fetch_cmd *fcmd;
1978
1979 lockdep_assert_held(&ubq->evts_lock);
1980
1981 /*
1982 * Order the update of ubq->evts_fifo against the check of ubq->active_fcmd.
1983 *
1984 * The pair is the smp_mb() in ublk_batch_dispatch().
1985 *
1986 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
1987 * will be visible in ublk_batch_dispatch() thanks to the barrier pairing.
1988 */
1989 smp_mb();
1990 if (READ_ONCE(ubq->active_fcmd)) {
1991 fcmd = NULL;
1992 } else {
1993 fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1994 struct ublk_batch_fetch_cmd, node);
1995 WRITE_ONCE(ubq->active_fcmd, fcmd);
1996 }
1997 return fcmd;
1998 }
1999
2000 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2001 {
2002 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
2003 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2004 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2005 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2006 struct ublk_batch_io_data data = {
2007 .ub = pdu->ubq->dev,
2008 .cmd = fcmd->cmd,
2009 .issue_flags = issue_flags,
2010 };
2011
2012 WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
2013
2014 ublk_batch_dispatch(pdu->ubq, &data, fcmd);
2015 }
2016
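/*
 * Drain the event kfifo through @fcmd until it is empty or dispatch fails.
 * On failure the fetch command is torn down.  Otherwise release the active
 * fetch command and re-check the fifo: tags queued concurrently are either
 * handled here again (bounded to 32 rounds to avoid a lockup) or punted to
 * task work on the newly acquired fetch command.
 */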
2017 static void
2018 ublk_batch_dispatch(struct ublk_queue *ubq,
2019 const struct ublk_batch_io_data *data,
2020 struct ublk_batch_fetch_cmd *fcmd)
2021 {
2022 struct ublk_batch_fetch_cmd *new_fcmd;
2023 unsigned tried = 0;
2024 int ret = 0;
2025
2026 again:
2027 while (!ublk_io_evts_empty(ubq)) {
2028 ret = __ublk_batch_dispatch(ubq, data, fcmd);
2029 if (ret <= 0)
2030 break;
2031 }
2032
2033 if (ret < 0) {
2034 ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
2035 return;
2036 }
2037
2038 __ublk_release_fcmd(ubq);
2039 /*
2040 * Order the clearing of ubq->active_fcmd in __ublk_release_fcmd()
2041 * against the check of ubq->evts_fifo.
2042 *
2043 * The pair is the smp_mb() in __ublk_acquire_fcmd().
2044 */
2045 smp_mb();
2046 if (likely(ublk_io_evts_empty(ubq)))
2047 return;
2048
2049 spin_lock(&ubq->evts_lock);
2050 new_fcmd = __ublk_acquire_fcmd(ubq);
2051 spin_unlock(&ubq->evts_lock);
2052
2053 if (!new_fcmd)
2054 return;
2055
2056 /* Avoid a lockup by handling at most 32 batches here */
2057 if (new_fcmd == fcmd && tried++ < 32)
2058 goto again;
2059
2060 io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
2061 }
2062
2063 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2064 {
2065 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2066 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2067 struct ublk_queue *ubq = pdu->ubq;
2068
2069 ublk_dispatch_req(ubq, pdu->req);
2070 }
2071
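/*
 * Queue a single request tag for batch delivery.  The tag is pushed into
 * the event kfifo; only the last request of a plug (@last) tries to acquire
 * a fetch command and kick dispatch via task work, so back-to-back requests
 * can be delivered to the server together.
 */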
2072 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2073 {
2074 unsigned short tag = rq->tag;
2075 struct ublk_batch_fetch_cmd *fcmd = NULL;
2076
2077 spin_lock(&ubq->evts_lock);
2078 kfifo_put(&ubq->evts_fifo, tag);
2079 if (last)
2080 fcmd = __ublk_acquire_fcmd(ubq);
2081 spin_unlock(&ubq->evts_lock);
2082
2083 if (fcmd)
2084 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2085 }
2086
2087 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2088 {
2089 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2090 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2091
2092 pdu->req = rq;
2093 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2094 }
2095
2096 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2097 {
2098 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2099 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2100 struct request *rq = pdu->req_list;
2101 struct request *next;
2102
2103 do {
2104 next = rq->rq_next;
2105 rq->rq_next = NULL;
2106 ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2107 rq = next;
2108 } while (rq);
2109 }
2110
2111 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2112 {
2113 struct io_uring_cmd *cmd = io->cmd;
2114 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2115
2116 pdu->req_list = rq_list_peek(l);
2117 rq_list_init(l);
2118 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2119 }
2120
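/*
 * Request timeout handler.  For unprivileged devices a stuck ublk server
 * cannot simply be trusted to make progress, so send SIGKILL to the
 * recorded server tgid and return BLK_EH_DONE; privileged devices just get
 * their timer reset.
 */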
2121 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2122 {
2123 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2124 pid_t tgid = ubq->dev->ublksrv_tgid;
2125 struct task_struct *p;
2126 struct pid *pid;
2127
2128 if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2129 return BLK_EH_RESET_TIMER;
2130
2131 if (unlikely(!tgid))
2132 return BLK_EH_RESET_TIMER;
2133
2134 rcu_read_lock();
2135 pid = find_vpid(tgid);
2136 p = pid_task(pid, PIDTYPE_PID);
2137 if (p)
2138 send_sig(SIGKILL, p, 0);
2139 rcu_read_unlock();
2140 return BLK_EH_DONE;
2141 }
2142
2143 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2144 bool check_cancel)
2145 {
2146 blk_status_t res;
2147
2148 if (unlikely(READ_ONCE(ubq->fail_io)))
2149 return BLK_STS_TARGET;
2150
2151 /* With recovery feature enabled, force_abort is set in
2152 * ublk_stop_dev() before calling del_gendisk(). We have to
2153 * abort all requeued and new rqs here to let del_gendisk()
2154 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
2155 * to avoid UAF on io_uring ctx.
2156 *
2157 * Note: force_abort is guaranteed to be seen because it is set
2158 * before the request queue is unquiesced.
2159 */
2160 if (ublk_nosrv_should_queue_io(ubq) &&
2161 unlikely(READ_ONCE(ubq->force_abort)))
2162 return BLK_STS_IOERR;
2163
2164 if (check_cancel && unlikely(ubq->canceling))
2165 return BLK_STS_IOERR;
2166
2167 /* fill iod to slot in io cmd buffer */
2168 res = ublk_setup_iod(ubq, rq);
2169 if (unlikely(res != BLK_STS_OK))
2170 return BLK_STS_IOERR;
2171
2172 blk_mq_start_request(rq);
2173 return BLK_STS_OK;
2174 }
2175
2176 /*
2177 * Common helper for queue_rq that handles request preparation and
2178 * cancellation checks. Returns status and sets should_queue to indicate
2179 * whether the caller should proceed with queuing the request.
2180 */
2181 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2182 struct request *rq,
2183 bool *should_queue)
2184 {
2185 blk_status_t res;
2186
2187 res = ublk_prep_req(ubq, rq, false);
2188 if (res != BLK_STS_OK) {
2189 *should_queue = false;
2190 return res;
2191 }
2192
2193 /*
2194 * ->canceling has to be handled after ->force_abort and ->fail_io
2195 * are dealt with, otherwise this request may not be failed in case
2196 * of recovery, causing a hang when deleting the disk
2197 */
2198 if (unlikely(ubq->canceling)) {
2199 *should_queue = false;
2200 __ublk_abort_rq(ubq, rq);
2201 return BLK_STS_OK;
2202 }
2203
2204 *should_queue = true;
2205 return BLK_STS_OK;
2206 }
2207
2208 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2209 const struct blk_mq_queue_data *bd)
2210 {
2211 struct ublk_queue *ubq = hctx->driver_data;
2212 struct request *rq = bd->rq;
2213 bool should_queue;
2214 blk_status_t res;
2215
2216 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2217 if (!should_queue)
2218 return res;
2219
2220 ublk_queue_cmd(ubq, rq);
2221 return BLK_STS_OK;
2222 }
2223
2224 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2225 const struct blk_mq_queue_data *bd)
2226 {
2227 struct ublk_queue *ubq = hctx->driver_data;
2228 struct request *rq = bd->rq;
2229 bool should_queue;
2230 blk_status_t res;
2231
2232 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2233 if (!should_queue)
2234 return res;
2235
2236 ublk_batch_queue_cmd(ubq, rq, bd->last);
2237 return BLK_STS_OK;
2238 }
2239
2240 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2241 const struct ublk_io *io2)
2242 {
2243 return (io_uring_cmd_ctx_handle(io->cmd) ==
2244 io_uring_cmd_ctx_handle(io2->cmd)) &&
2245 (io->task == io2->task);
2246 }
2247
2248 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2249 {
2250 struct ublk_queue *ubq = hctx->driver_data;
2251 struct ublk_batch_fetch_cmd *fcmd;
2252
2253 spin_lock(&ubq->evts_lock);
2254 fcmd = __ublk_acquire_fcmd(ubq);
2255 spin_unlock(&ubq->evts_lock);
2256
2257 if (fcmd)
2258 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2259 }
2260
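/*
 * ->queue_rqs() for the non-batch path: walk the plugged list, prepare each
 * request, and group consecutive requests served by the same daemon task
 * and io_ring_ctx into one task-work dispatch.  Requests that fail
 * preparation are handed back on the requeue list.
 */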
2261 static void ublk_queue_rqs(struct rq_list *rqlist)
2262 {
2263 struct rq_list requeue_list = { };
2264 struct rq_list submit_list = { };
2265 struct ublk_io *io = NULL;
2266 struct request *req;
2267
2268 while ((req = rq_list_pop(rqlist))) {
2269 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2270 struct ublk_io *this_io = &this_q->ios[req->tag];
2271
2272 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2273 rq_list_add_tail(&requeue_list, req);
2274 continue;
2275 }
2276
2277 if (io && !ublk_belong_to_same_batch(io, this_io) &&
2278 !rq_list_empty(&submit_list))
2279 ublk_queue_cmd_list(io, &submit_list);
2280 io = this_io;
2281 rq_list_add_tail(&submit_list, req);
2282 }
2283
2284 if (!rq_list_empty(&submit_list))
2285 ublk_queue_cmd_list(io, &submit_list);
2286 *rqlist = requeue_list;
2287 }
2288
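/*
 * Queue a whole plugged list for batch delivery: copy the tags into the
 * event kfifo in MAX_NR_TAG-sized chunks under evts_lock, then kick
 * dispatch via task work if a fetch command could be acquired.
 */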
2289 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2290 {
2291 unsigned short tags[MAX_NR_TAG];
2292 struct ublk_batch_fetch_cmd *fcmd;
2293 struct request *rq;
2294 unsigned cnt = 0;
2295
2296 spin_lock(&ubq->evts_lock);
2297 rq_list_for_each(l, rq) {
2298 tags[cnt++] = (unsigned short)rq->tag;
2299 if (cnt >= MAX_NR_TAG) {
2300 kfifo_in(&ubq->evts_fifo, tags, cnt);
2301 cnt = 0;
2302 }
2303 }
2304 if (cnt)
2305 kfifo_in(&ubq->evts_fifo, tags, cnt);
2306 fcmd = __ublk_acquire_fcmd(ubq);
2307 spin_unlock(&ubq->evts_lock);
2308
2309 rq_list_init(l);
2310 if (fcmd)
2311 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2312 }
2313
2314 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2315 {
2316 struct rq_list requeue_list = { };
2317 struct rq_list submit_list = { };
2318 struct ublk_queue *ubq = NULL;
2319 struct request *req;
2320
2321 while ((req = rq_list_pop(rqlist))) {
2322 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2323
2324 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2325 rq_list_add_tail(&requeue_list, req);
2326 continue;
2327 }
2328
2329 if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2330 ublk_batch_queue_cmd_list(ubq, &submit_list);
2331 ubq = this_q;
2332 rq_list_add_tail(&submit_list, req);
2333 }
2334
2335 if (!rq_list_empty(&submit_list))
2336 ublk_batch_queue_cmd_list(ubq, &submit_list);
2337 *rqlist = requeue_list;
2338 }
2339
2340 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2341 unsigned int hctx_idx)
2342 {
2343 struct ublk_device *ub = driver_data;
2344 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2345
2346 hctx->driver_data = ubq;
2347 return 0;
2348 }
2349
2350 static const struct blk_mq_ops ublk_mq_ops = {
2351 .queue_rq = ublk_queue_rq,
2352 .queue_rqs = ublk_queue_rqs,
2353 .init_hctx = ublk_init_hctx,
2354 .timeout = ublk_timeout,
2355 };
2356
2357 static const struct blk_mq_ops ublk_batch_mq_ops = {
2358 .commit_rqs = ublk_commit_rqs,
2359 .queue_rq = ublk_batch_queue_rq,
2360 .queue_rqs = ublk_batch_queue_rqs,
2361 .init_hctx = ublk_init_hctx,
2362 .timeout = ublk_timeout,
2363 };
2364
2365 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2366 {
2367 int i;
2368
2369 ubq->nr_io_ready = 0;
2370
2371 for (i = 0; i < ubq->q_depth; i++) {
2372 struct ublk_io *io = &ubq->ios[i];
2373
2374 /*
2375 * UBLK_IO_FLAG_CANCELED is kept to avoid touching
2376 * io->cmd
2377 */
2378 io->flags &= UBLK_IO_FLAG_CANCELED;
2379 io->cmd = NULL;
2380 io->buf.addr = 0;
2381
2382 /*
2383 * old task is PF_EXITING, put it now
2384 *
2385 * It could be NULL in case of closing one quiesced
2386 * device.
2387 */
2388 if (io->task) {
2389 put_task_struct(io->task);
2390 io->task = NULL;
2391 }
2392
2393 WARN_ON_ONCE(refcount_read(&io->ref));
2394 WARN_ON_ONCE(io->task_registered_buffers);
2395 }
2396 }
2397
2398 static int ublk_ch_open(struct inode *inode, struct file *filp)
2399 {
2400 struct ublk_device *ub = container_of(inode->i_cdev,
2401 struct ublk_device, cdev);
2402
2403 if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2404 return -EBUSY;
2405 filp->private_data = ub;
2406 ub->ublksrv_tgid = current->tgid;
2407 return 0;
2408 }
2409
2410 static void ublk_reset_ch_dev(struct ublk_device *ub)
2411 {
2412 int i;
2413
2414 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2415 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2416
2417 /* Sync with ublk_cancel_cmd() */
2418 spin_lock(&ubq->cancel_lock);
2419 ublk_queue_reinit(ub, ubq);
2420 spin_unlock(&ubq->cancel_lock);
2421 }
2422
2423 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2424 ub->mm = NULL;
2425 ub->nr_queue_ready = 0;
2426 ub->unprivileged_daemons = false;
2427 ub->ublksrv_tgid = -1;
2428 }
2429
2430 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2431 {
2432 struct gendisk *disk;
2433
2434 spin_lock(&ub->lock);
2435 disk = ub->ub_disk;
2436 if (disk)
2437 get_device(disk_to_dev(disk));
2438 spin_unlock(&ub->lock);
2439
2440 return disk;
2441 }
2442
2443 static void ublk_put_disk(struct gendisk *disk)
2444 {
2445 if (disk)
2446 put_device(disk_to_dev(disk));
2447 }
2448
2449 static void ublk_partition_scan_work(struct work_struct *work)
2450 {
2451 struct ublk_device *ub =
2452 container_of(work, struct ublk_device, partition_scan_work);
2453 /* Hold disk reference to prevent UAF during concurrent teardown */
2454 struct gendisk *disk = ublk_get_disk(ub);
2455
2456 if (!disk)
2457 return;
2458
2459 if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2460 &disk->state)))
2461 goto out;
2462
2463 mutex_lock(&disk->open_mutex);
2464 bdev_disk_changed(disk, false);
2465 mutex_unlock(&disk->open_mutex);
2466 out:
2467 ublk_put_disk(disk);
2468 }
2469
2470 /*
2471 * Use this function to ensure that ->canceling is consistently set for
2472 * the device and all queues. Do not set these flags directly.
2473 *
2474 * Caller must ensure that:
2475 * - cancel_mutex is held. This ensures that there is no concurrent
2476 * access to ub->canceling and no concurrent writes to ubq->canceling.
2477 * - there are no concurrent reads of ubq->canceling from the queue_rq
2478 * path. This can be done by quiescing the queue, or through other
2479 * means.
2480 */
2481 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2482 __must_hold(&ub->cancel_mutex)
2483 {
2484 int i;
2485
2486 ub->canceling = canceling;
2487 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2488 ublk_get_queue(ub, i)->canceling = canceling;
2489 }
2490
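/*
 * Return true if any io still holds an active reference (a zero-copy or
 * auto-registered buffer that hasn't been released yet), in which case the
 * release work must be retried later.  Otherwise reset the per-io refcount
 * and registered-buffer count so the device can be cleanly reset.
 */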
2491 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2492 {
2493 int i, j;
2494
2495 if (!ublk_dev_need_req_ref(ub))
2496 return false;
2497
2498 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2499 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2500
2501 for (j = 0; j < ubq->q_depth; j++) {
2502 struct ublk_io *io = &ubq->ios[j];
2503 unsigned int refs = refcount_read(&io->ref) +
2504 io->task_registered_buffers;
2505
2506 /*
2507 * UBLK_REFCOUNT_INIT or zero means no active
2508 * reference
2509 */
2510 if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2511 return true;
2512
2513 /* reset to zero if the io has no active references */
2514 refcount_set(&io->ref, 0);
2515 io->task_registered_buffers = 0;
2516 }
2517 }
2518 return false;
2519 }
2520
2521 static void ublk_ch_release_work_fn(struct work_struct *work)
2522 {
2523 struct ublk_device *ub =
2524 container_of(work, struct ublk_device, exit_work.work);
2525 struct gendisk *disk;
2526 int i;
2527
2528 /*
2529 * For zero-copy and auto buffer register modes, I/O references
2530 * might not be dropped naturally when the daemon is killed, but
2531 * io_uring guarantees that registered bvec kernel buffers are
2532 * unregistered finally when freeing io_uring context, then the
2533 * active references are dropped.
2534 *
2535 * Wait until all active references are dropped, to avoid use-after-free.
2536 *
2537 * A registered buffer may be unregistered in io_uring's release handler,
2538 * so wait by rescheduling this work function to avoid a release
2539 * dependency between the two files.
2540 */
2541 if (ublk_check_and_reset_active_ref(ub)) {
2542 schedule_delayed_work(&ub->exit_work, 1);
2543 return;
2544 }
2545
2546 /*
2547 * disk isn't attached yet: either the device isn't live, or it has
2548 * been removed already, so there is nothing to do
2549 */
2550 disk = ublk_get_disk(ub);
2551 if (!disk)
2552 goto out;
2553
2554 /*
2555 * All uring_cmd are done now, so abort any request outstanding to
2556 * the ublk server
2557 *
2558 * This can be done in a lockless way because the ublk server is
2559 * gone
2560 *
2561 * More importantly, we have to provide forward progress guarantee
2562 * without holding ub->mutex, otherwise control task grabbing
2563 * ub->mutex triggers deadlock
2564 *
2565 * All requests may be inflight, so ->canceling may not be set, set
2566 * it now.
2567 */
2568 mutex_lock(&ub->cancel_mutex);
2569 ublk_set_canceling(ub, true);
2570 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2571 ublk_abort_queue(ub, ublk_get_queue(ub, i));
2572 mutex_unlock(&ub->cancel_mutex);
2573 blk_mq_kick_requeue_list(disk->queue);
2574
2575 /*
2576 * All inflight requests have been completed or requeued, and any new
2577 * request will be failed or requeued via `->canceling`, so it is
2578 * fine to grab ub->mutex now.
2579 */
2580 mutex_lock(&ub->mutex);
2581
2582 /* double check after grabbing lock */
2583 if (!ub->ub_disk)
2584 goto unlock;
2585
2586 /*
2587 * Transition the device to the nosrv state. What exactly this
2588 * means depends on the recovery flags
2589 */
2590 if (ublk_nosrv_should_stop_dev(ub)) {
2591 /*
2592 * Allow any pending/future I/O to pass through quickly
2593 * with an error. This is needed because del_gendisk
2594 * waits for all pending I/O to complete
2595 */
2596 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2597 WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2598
2599 ublk_stop_dev_unlocked(ub);
2600 } else {
2601 if (ublk_nosrv_dev_should_queue_io(ub)) {
2602 /* ->canceling is set and all requests are aborted */
2603 ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2604 } else {
2605 ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2606 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2607 WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2608 }
2609 }
2610 unlock:
2611 mutex_unlock(&ub->mutex);
2612 ublk_put_disk(disk);
2613
2614 /* all uring_cmds are done now, reset device & ubq */
2615 ublk_reset_ch_dev(ub);
2616 out:
2617 clear_bit(UB_STATE_OPEN, &ub->state);
2618
2619 /* put the reference grabbed in ublk_ch_release() */
2620 ublk_put_device(ub);
2621 }
2622
2623 static int ublk_ch_release(struct inode *inode, struct file *filp)
2624 {
2625 struct ublk_device *ub = filp->private_data;
2626
2627 /*
2628 * Grab ublk device reference, so it won't be gone until we are
2629 * really released from work function.
2630 */
2631 ublk_get_device(ub);
2632
2633 INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2634 schedule_delayed_work(&ub->exit_work, 0);
2635 return 0;
2636 }
2637
2638 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2639 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2640 {
2641 struct ublk_device *ub = filp->private_data;
2642 size_t sz = vma->vm_end - vma->vm_start;
2643 unsigned max_sz = ublk_max_cmd_buf_size();
2644 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2645 int q_id, ret = 0;
2646
2647 spin_lock(&ub->lock);
2648 if (!ub->mm)
2649 ub->mm = current->mm;
2650 if (current->mm != ub->mm)
2651 ret = -EINVAL;
2652 spin_unlock(&ub->lock);
2653
2654 if (ret)
2655 return ret;
2656
2657 if (vma->vm_flags & VM_WRITE)
2658 return -EPERM;
2659
2660 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2661 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2662 return -EINVAL;
2663
2664 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2665 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2666 __func__, q_id, current->pid, vma->vm_start,
2667 phys_off, (unsigned long)sz);
2668
2669 if (sz != ublk_queue_cmd_buf_size(ub))
2670 return -EINVAL;
2671
2672 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2673 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2674 }
2675
2676 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2677 struct request *req)
2678 {
2679 WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2680 io->flags & UBLK_IO_FLAG_ACTIVE);
2681
2682 if (ublk_nosrv_should_reissue_outstanding(ub))
2683 blk_mq_requeue_request(req, false);
2684 else {
2685 io->res = -EIO;
2686 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2687 }
2688 }
2689
2690 /*
2691 * Request tags may have just been added to the event kfifo without
2692 * getting a chance to be dispatched; abort these requests too
2693 */
2694 static void ublk_abort_batch_queue(struct ublk_device *ub,
2695 struct ublk_queue *ubq)
2696 {
2697 unsigned short tag;
2698
2699 while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2700 struct request *req = blk_mq_tag_to_rq(
2701 ub->tag_set.tags[ubq->q_id], tag);
2702
2703 if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2704 __ublk_fail_req(ub, &ubq->ios[tag], req);
2705 }
2706 }
2707
2708 /*
2709 * Called from the ublk char device release handler, after all uring_cmds
2710 * are done; meanwhile the request queue is "quiesced" since all inflight
2711 * requests can't be completed because the ublk server is dead.
2712 *
2713 * So no one can hold our request IO reference any more, simply ignore the
2714 * reference, and complete the request immediately
2715 */
2716 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2717 {
2718 int i;
2719
2720 for (i = 0; i < ubq->q_depth; i++) {
2721 struct ublk_io *io = &ubq->ios[i];
2722
2723 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2724 __ublk_fail_req(ub, io, io->req);
2725 }
2726
2727 if (ublk_support_batch_io(ubq))
2728 ublk_abort_batch_queue(ub, ubq);
2729 }
2730
2731 static void ublk_start_cancel(struct ublk_device *ub)
2732 {
2733 struct gendisk *disk = ublk_get_disk(ub);
2734
2735 /* Our disk is already gone */
2736 if (!disk)
2737 return;
2738
2739 mutex_lock(&ub->cancel_mutex);
2740 if (ub->canceling)
2741 goto out;
2742 /*
2743 * Now we are serialized with ublk_queue_rq()
2744 *
2745 * Make sure that ubq->canceling is set when queue is frozen,
2746 * because ublk_queue_rq() has to rely on this flag to avoid
2747 * touching a completed uring_cmd
2748 */
2749 blk_mq_quiesce_queue(disk->queue);
2750 ublk_set_canceling(ub, true);
2751 blk_mq_unquiesce_queue(disk->queue);
2752 out:
2753 mutex_unlock(&ub->cancel_mutex);
2754 ublk_put_disk(disk);
2755 }
2756
2757 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2758 unsigned int issue_flags)
2759 {
2760 struct ublk_io *io = &ubq->ios[tag];
2761 struct ublk_device *ub = ubq->dev;
2762 struct io_uring_cmd *cmd = NULL;
2763 struct request *req;
2764 bool done;
2765
2766 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2767 return;
2768
2769 /*
2770 * Don't try to cancel this command if the request is started, to
2771 * avoid a race between io_uring_cmd_done() and
2772 * io_uring_cmd_complete_in_task().
2773 *
2774 * Either the started request will be aborted via __ublk_abort_rq(),
2775 * then this uring_cmd is canceled next time, or it will be done in
2776 * task work function ublk_dispatch_req() because io_uring guarantees
2777 * that ublk_dispatch_req() is always called
2778 */
2779 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2780 if (req && blk_mq_request_started(req) && req->tag == tag)
2781 return;
2782
2783 spin_lock(&ubq->cancel_lock);
2784 done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2785 if (!done) {
2786 io->flags |= UBLK_IO_FLAG_CANCELED;
2787 cmd = io->cmd;
2788 io->cmd = NULL;
2789 }
2790 spin_unlock(&ubq->cancel_lock);
2791
2792 if (!done && cmd)
2793 io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, issue_flags);
2794 }
2795
2796 /*
2797 * Cancel a batch fetch command if it hasn't been claimed by another path.
2798 *
2799 * An fcmd can only be cancelled if:
2800 * 1. It's not the active_fcmd (which is currently being processed)
2801 * 2. It's still on the list (!list_empty check) - once removed from the list,
2802 * the fcmd is considered claimed and will be freed by whoever removed it
2803 *
2804 * Use list_del_init() so subsequent list_empty() checks work correctly.
2805 */
2806 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2807 struct ublk_batch_fetch_cmd *fcmd,
2808 unsigned int issue_flags)
2809 {
2810 bool done;
2811
2812 spin_lock(&ubq->evts_lock);
2813 done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2814 if (done)
2815 list_del_init(&fcmd->node);
2816 spin_unlock(&ubq->evts_lock);
2817
2818 if (done) {
2819 io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2820 ublk_batch_free_fcmd(fcmd);
2821 }
2822 }
2823
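/*
 * Cancel all pending batch fetch commands on this queue: mark the queue
 * force-aborted, splice the queued fetch commands onto a private list while
 * keeping the currently active one on the queue list (it is still being
 * processed), then complete the rest with UBLK_IO_RES_ABORT outside the
 * lock.
 */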
2824 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2825 {
2826 struct ublk_batch_fetch_cmd *fcmd;
2827 LIST_HEAD(fcmd_list);
2828
2829 spin_lock(&ubq->evts_lock);
2830 ubq->force_abort = true;
2831 list_splice_init(&ubq->fcmd_head, &fcmd_list);
2832 fcmd = READ_ONCE(ubq->active_fcmd);
2833 if (fcmd)
2834 list_move(&fcmd->node, &ubq->fcmd_head);
2835 spin_unlock(&ubq->evts_lock);
2836
2837 while (!list_empty(&fcmd_list)) {
2838 fcmd = list_first_entry(&fcmd_list,
2839 struct ublk_batch_fetch_cmd, node);
2840 ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2841 }
2842 }
2843
2844 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2845 unsigned int issue_flags)
2846 {
2847 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2848 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2849 struct ublk_queue *ubq = pdu->ubq;
2850
2851 ublk_start_cancel(ubq->dev);
2852
2853 ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2854 }
2855
2856 /*
2857 * The ublk char device won't be closed when calling cancel fn, so both
2858 * ublk device and queue are guaranteed to be live
2859 *
2860 * Two-stage cancel:
2861 *
2862 * - make every active uring_cmd done in ->cancel_fn()
2863 *
2864 * - aborting inflight ublk IO requests in ublk char device release handler,
2865 * which depends on the 1st stage because the device can only be closed
2866 * after all uring_cmds are done
2867 *
2868 * Do _not_ try to acquire ub->mutex before all inflight requests are
2869 * aborted, otherwise deadlock may be caused.
2870 */
2871 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2872 unsigned int issue_flags)
2873 {
2874 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2875 struct ublk_queue *ubq = pdu->ubq;
2876 struct task_struct *task;
2877 struct ublk_io *io;
2878
2879 if (WARN_ON_ONCE(!ubq))
2880 return;
2881
2882 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2883 return;
2884
2885 task = io_uring_cmd_get_task(cmd);
2886 io = &ubq->ios[pdu->tag];
2887 if (WARN_ON_ONCE(task && task != io->task))
2888 return;
2889
2890 ublk_start_cancel(ubq->dev);
2891
2892 WARN_ON_ONCE(io->cmd != cmd);
2893 ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2894 }
2895
2896 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2897 {
2898 return ubq->nr_io_ready == ubq->q_depth;
2899 }
2900
2901 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2902 {
2903 return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2904 }
2905
2906 static void ublk_cancel_queue(struct ublk_queue *ubq)
2907 {
2908 int i;
2909
2910 if (ublk_support_batch_io(ubq)) {
2911 ublk_batch_cancel_queue(ubq);
2912 return;
2913 }
2914
2915 for (i = 0; i < ubq->q_depth; i++)
2916 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2917 }
2918
2919 /* Cancel all pending commands, must be called after del_gendisk() returns */
2920 static void ublk_cancel_dev(struct ublk_device *ub)
2921 {
2922 int i;
2923
2924 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2925 ublk_cancel_queue(ublk_get_queue(ub, i));
2926 }
2927
2928 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2929 {
2930 bool *idle = data;
2931
2932 if (blk_mq_request_started(rq)) {
2933 *idle = false;
2934 return false;
2935 }
2936 return true;
2937 }
2938
2939 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2940 {
2941 bool idle;
2942
2943 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2944 while (true) {
2945 idle = true;
2946 blk_mq_tagset_busy_iter(&ub->tag_set,
2947 ublk_check_inflight_rq, &idle);
2948 if (idle)
2949 break;
2950 msleep(UBLK_REQUEUE_DELAY_MS);
2951 }
2952 }
2953
2954 static void ublk_force_abort_dev(struct ublk_device *ub)
2955 {
2956 int i;
2957
2958 pr_devel("%s: force abort ub: dev_id %d state %s\n",
2959 __func__, ub->dev_info.dev_id,
2960 ub->dev_info.state == UBLK_S_DEV_LIVE ?
2961 "LIVE" : "QUIESCED");
2962 blk_mq_quiesce_queue(ub->ub_disk->queue);
2963 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2964 ublk_wait_tagset_rqs_idle(ub);
2965
2966 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2967 ublk_get_queue(ub, i)->force_abort = true;
2968 blk_mq_unquiesce_queue(ub->ub_disk->queue);
2969 /* We may have requeued some rqs in ublk_quiesce_queue() */
2970 blk_mq_kick_requeue_list(ub->ub_disk->queue);
2971 }
2972
2973 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2974 {
2975 struct gendisk *disk;
2976
2977 /* Sync with ublk_abort_queue() by holding the lock */
2978 spin_lock(&ub->lock);
2979 disk = ub->ub_disk;
2980 ub->dev_info.state = UBLK_S_DEV_DEAD;
2981 ub->dev_info.ublksrv_pid = -1;
2982 ub->ub_disk = NULL;
2983 spin_unlock(&ub->lock);
2984
2985 return disk;
2986 }
2987
2988 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2989 __must_hold(&ub->mutex)
2990 {
2991 struct gendisk *disk;
2992
2993 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2994 return;
2995
2996 if (ublk_nosrv_dev_should_queue_io(ub))
2997 ublk_force_abort_dev(ub);
2998 del_gendisk(ub->ub_disk);
2999 disk = ublk_detach_disk(ub);
3000 put_disk(disk);
3001 }
3002
3003 static void ublk_stop_dev(struct ublk_device *ub)
3004 {
3005 mutex_lock(&ub->mutex);
3006 ublk_stop_dev_unlocked(ub);
3007 mutex_unlock(&ub->mutex);
3008 cancel_work_sync(&ub->partition_scan_work);
3009 ublk_cancel_dev(ub);
3010 }
3011
3012 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
3013 {
3014 /* UBLK_IO_FLAG_CANCELED can be cleared now */
3015 spin_lock(&ubq->cancel_lock);
3016 io->flags &= ~UBLK_IO_FLAG_CANCELED;
3017 spin_unlock(&ubq->cancel_lock);
3018 }
3019
3020 /* reset per-queue io flags */
3021 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
3022 {
3023 spin_lock(&ubq->cancel_lock);
3024 ubq->canceling = false;
3025 spin_unlock(&ubq->cancel_lock);
3026 ubq->fail_io = false;
3027 }
3028
3029 /* device can only be started after all IOs are ready */
3030 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
3031 struct ublk_io *io)
3032 __must_hold(&ub->mutex)
3033 {
3034 struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
3035
3036 if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
3037 ub->unprivileged_daemons = true;
3038
3039 ubq->nr_io_ready++;
3040 ublk_reset_io_flags(ubq, io);
3041
3042 /* Check if this specific queue is now fully ready */
3043 if (ublk_queue_ready(ubq)) {
3044 ub->nr_queue_ready++;
3045
3046 /*
3047 * Reset queue flags as soon as this queue is ready.
3048 * This clears the canceling flag, allowing batch FETCH commands
3049 * to succeed during recovery without waiting for all queues.
3050 */
3051 ublk_queue_reset_io_flags(ubq);
3052 }
3053
3054 /* Check if all queues are ready */
3055 if (ublk_dev_ready(ub)) {
3056 /*
3057 * All queues ready - clear device-level canceling flag
3058 * and complete the recovery/initialization.
3059 */
3060 mutex_lock(&ub->cancel_mutex);
3061 ub->canceling = false;
3062 mutex_unlock(&ub->cancel_mutex);
3063 complete_all(&ub->completion);
3064 }
3065 }
3066
3067 static inline int ublk_check_cmd_op(u32 cmd_op)
3068 {
3069 u32 ioc_type = _IOC_TYPE(cmd_op);
3070
3071 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3072 return -EOPNOTSUPP;
3073
3074 if (ioc_type != 'u' && ioc_type != 0)
3075 return -EOPNOTSUPP;
3076
3077 return 0;
3078 }
3079
3080 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3081 {
3082 struct ublk_auto_buf_reg buf;
3083
3084 buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3085
3086 if (buf.reserved0 || buf.reserved1)
3087 return -EINVAL;
3088
3089 if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3090 return -EINVAL;
3091 io->buf.auto_reg = buf;
3092 return 0;
3093 }
3094
3095 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3096 struct io_uring_cmd *cmd,
3097 u16 *buf_idx)
3098 {
3099 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3100 io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3101
3102 /*
3103 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3104 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
3105 * `io_ring_ctx`.
3106 *
3107 * If this uring_cmd's io_ring_ctx isn't the same as the
3108 * one used for registering the buffer, it is the ublk server's
3109 * responsibility to unregister the buffer, otherwise
3110 * this ublk request gets stuck.
3111 */
3112 if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3113 *buf_idx = io->buf.auto_reg.index;
3114 }
3115 }
3116
3117 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3118 struct io_uring_cmd *cmd,
3119 u16 *buf_idx)
3120 {
3121 ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3122 return ublk_set_auto_buf_reg(io, cmd);
3123 }
3124
3125 /* Once we return, `io->req` can't be used any more */
3126 static inline struct request *
3127 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3128 {
3129 struct request *req = io->req;
3130
3131 io->cmd = cmd;
3132 io->flags |= UBLK_IO_FLAG_ACTIVE;
3133 /* now this cmd slot is owned by ublk driver */
3134 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3135
3136 return req;
3137 }
3138
3139 static inline int
3140 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3141 struct io_uring_cmd *cmd, unsigned long buf_addr,
3142 u16 *buf_idx)
3143 {
3144 if (ublk_dev_support_auto_buf_reg(ub))
3145 return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3146
3147 io->buf.addr = buf_addr;
3148 return 0;
3149 }
3150
3151 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3152 unsigned int issue_flags,
3153 struct ublk_queue *ubq, unsigned int tag)
3154 {
3155 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3156
3157 /*
3158 * Safe to refer to @ubq since the ublk_queue won't go away until its
3159 * commands are completed
3160 */
3161 pdu->ubq = ubq;
3162 pdu->tag = tag;
3163 io_uring_cmd_mark_cancelable(cmd, issue_flags);
3164 }
3165
3166 static void ublk_io_release(void *priv)
3167 {
3168 struct request *rq = priv;
3169 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3170 struct ublk_io *io = &ubq->ios[rq->tag];
3171
3172 /*
3173 * task_registered_buffers may be 0 if buffers were registered off task
3174 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3175 */
3176 if (current == io->task && io->task_registered_buffers)
3177 io->task_registered_buffers--;
3178 else
3179 ublk_put_req_ref(io, rq);
3180 }
3181
3182 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3183 struct ublk_device *ub,
3184 u16 q_id, u16 tag,
3185 struct ublk_io *io,
3186 unsigned int index, unsigned int issue_flags)
3187 {
3188 struct request *req;
3189 int ret;
3190
3191 if (!ublk_dev_support_zero_copy(ub))
3192 return -EINVAL;
3193
3194 req = __ublk_check_and_get_req(ub, q_id, tag, io);
3195 if (!req)
3196 return -EINVAL;
3197
3198 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3199 issue_flags);
3200 if (ret) {
3201 ublk_put_req_ref(io, req);
3202 return ret;
3203 }
3204
3205 return 0;
3206 }
3207
3208 static int
3209 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3210 struct ublk_device *ub,
3211 u16 q_id, u16 tag, struct ublk_io *io,
3212 unsigned index, unsigned issue_flags)
3213 {
3214 unsigned new_registered_buffers;
3215 struct request *req = io->req;
3216 int ret;
3217
3218 /*
3219 * Ensure there are still references for ublk_sub_req_ref() to release.
3220 * If not, fall back on the thread-safe buffer registration.
3221 */
3222 new_registered_buffers = io->task_registered_buffers + 1;
3223 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3224 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3225 issue_flags);
3226
3227 if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3228 return -EINVAL;
3229
3230 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3231 issue_flags);
3232 if (ret)
3233 return ret;
3234
3235 io->task_registered_buffers = new_registered_buffers;
3236 return 0;
3237 }
3238
3239 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3240 const struct ublk_device *ub,
3241 unsigned int index, unsigned int issue_flags)
3242 {
3243 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3244 return -EINVAL;
3245
3246 return io_buffer_unregister_bvec(cmd, index, issue_flags);
3247 }
3248
3249 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3250 {
3251 if (ublk_dev_need_map_io(ub)) {
3252 /*
3253 * FETCH_RQ has to provide IO buffer if NEED GET
3254 * DATA is not enabled
3255 */
3256 if (!buf_addr && !ublk_dev_need_get_data(ub))
3257 return -EINVAL;
3258 } else if (buf_addr) {
3259 /* User copy requires addr to be unset */
3260 return -EINVAL;
3261 }
3262 return 0;
3263 }
3264
3265 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3266 struct ublk_io *io, u16 q_id)
3267 {
3268 /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3269 if (ublk_dev_ready(ub))
3270 return -EBUSY;
3271
3272 /* allow each command to be FETCHed at most once */
3273 if (io->flags & UBLK_IO_FLAG_ACTIVE)
3274 return -EINVAL;
3275
3276 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3277
3278 ublk_fill_io_cmd(io, cmd);
3279
3280 if (ublk_dev_support_batch_io(ub))
3281 WRITE_ONCE(io->task, NULL);
3282 else
3283 WRITE_ONCE(io->task, get_task_struct(current));
3284
3285 return 0;
3286 }
3287
3288 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3289 struct ublk_io *io, __u64 buf_addr, u16 q_id)
3290 {
3291 int ret;
3292
3293 /*
3294 * When handling FETCH command for setting up ublk uring queue,
3295 * ub->mutex is the innermost lock, and we won't block for handling
3296 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3297 */
3298 mutex_lock(&ub->mutex);
3299 ret = __ublk_fetch(cmd, ub, io, q_id);
3300 if (!ret)
3301 ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3302 if (!ret)
3303 ublk_mark_io_ready(ub, q_id, io);
3304 mutex_unlock(&ub->mutex);
3305 return ret;
3306 }
3307
3308 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3309 struct ublk_io *io, __u64 buf_addr)
3310 {
3311 struct request *req = io->req;
3312
3313 if (ublk_dev_need_map_io(ub)) {
3314 /*
3315 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3316 * NEED GET DATA is not enabled or it is Read IO.
3317 */
3318 if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3319 req_op(req) == REQ_OP_READ))
3320 return -EINVAL;
3321 } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3322 /*
3323 * User copy requires addr to be unset when command is
3324 * not zone append
3325 */
3326 return -EINVAL;
3327 }
3328
3329 return 0;
3330 }
3331
3332 static bool ublk_need_complete_req(const struct ublk_device *ub,
3333 struct ublk_io *io)
3334 {
3335 if (ublk_dev_need_req_ref(ub))
3336 return ublk_sub_req_ref(io);
3337 return true;
3338 }
3339
3340 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3341 struct request *req)
3342 {
3343 /*
3344 * We have handled UBLK_IO_NEED_GET_DATA command,
3345 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3346 * do the copy work.
3347 */
3348 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3349 /* update iod->addr because ublksrv may have passed a new io buffer */
3350 ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3351 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3352 __func__, ubq->q_id, req->tag, io->flags,
3353 ublk_get_iod(ubq, req->tag)->addr);
3354
3355 return ublk_start_io(ubq, req, io);
3356 }
3357
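/*
 * Core handler for per-io uring_cmds issued by the ublk server.  It
 * validates the command opcode, queue id, tag and issuing task, then
 * handles buffer (un)registration, FETCH, COMMIT_AND_FETCH and
 * NEED_GET_DATA.  Returning -EIOCBQUEUED keeps the uring_cmd alive until
 * the next request is dispatched on it.
 *
 * The typical server flow is roughly (an illustrative sketch, not enforced
 * here): issue one UBLK_IO_FETCH_REQ per tag at startup, wait for
 * UBLK_IO_RES_OK, handle the IO described by the iod, then issue
 * UBLK_IO_COMMIT_AND_FETCH_REQ with the result to complete the request and
 * re-arm the slot for the next one.
 */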
3358 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3359 unsigned int issue_flags)
3360 {
3361 /* May point to userspace-mapped memory */
3362 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3363 struct ublksrv_io_cmd);
3364 u16 buf_idx = UBLK_INVALID_BUF_IDX;
3365 struct ublk_device *ub = cmd->file->private_data;
3366 struct ublk_queue *ubq;
3367 struct ublk_io *io = NULL;
3368 u32 cmd_op = cmd->cmd_op;
3369 u16 q_id = READ_ONCE(ub_src->q_id);
3370 u16 tag = READ_ONCE(ub_src->tag);
3371 s32 result = READ_ONCE(ub_src->result);
3372 u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3373 struct request *req;
3374 int ret;
3375 bool compl;
3376
3377 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3378
3379 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3380 __func__, cmd->cmd_op, q_id, tag, result);
3381
3382 ret = ublk_check_cmd_op(cmd_op);
3383 if (ret)
3384 goto out;
3385
3386 /*
3387 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3388 * so no need to validate the q_id, tag, or task
3389 */
3390 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3391 return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3392
3393 ret = -EINVAL;
3394 if (q_id >= ub->dev_info.nr_hw_queues)
3395 goto out;
3396
3397 ubq = ublk_get_queue(ub, q_id);
3398
3399 if (tag >= ub->dev_info.queue_depth)
3400 goto out;
3401
3402 io = &ubq->ios[tag];
3403 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3404 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3405 ret = ublk_check_fetch_buf(ub, addr);
3406 if (ret)
3407 goto out;
3408 ret = ublk_fetch(cmd, ub, io, addr, q_id);
3409 if (ret)
3410 goto out;
3411
3412 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3413 return -EIOCBQUEUED;
3414 }
3415
3416 if (READ_ONCE(io->task) != current) {
3417 /*
3418 * ublk_register_io_buf() accesses only the io's refcount,
3419 * so can be handled on any task
3420 */
3421 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3422 return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3423 addr, issue_flags);
3424
3425 goto out;
3426 }
3427
3428 /* there is pending io cmd, something must be wrong */
3429 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3430 ret = -EBUSY;
3431 goto out;
3432 }
3433
3434 /*
3435 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3436 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
3437 */
3438 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3439 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3440 goto out;
3441
3442 switch (_IOC_NR(cmd_op)) {
3443 case UBLK_IO_REGISTER_IO_BUF:
3444 return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3445 issue_flags);
3446 case UBLK_IO_COMMIT_AND_FETCH_REQ:
3447 ret = ublk_check_commit_and_fetch(ub, io, addr);
3448 if (ret)
3449 goto out;
3450 io->res = result;
3451 req = ublk_fill_io_cmd(io, cmd);
3452 ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3453 if (buf_idx != UBLK_INVALID_BUF_IDX)
3454 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3455 compl = ublk_need_complete_req(ub, io);
3456
3457 if (req_op(req) == REQ_OP_ZONE_APPEND)
3458 req->__sector = addr;
3459 if (compl)
3460 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3461
3462 if (ret)
3463 goto out;
3464 break;
3465 case UBLK_IO_NEED_GET_DATA:
3466 /*
3467 	 * ublk_get_data() may fail and fall back to requeue, so keep the
3468 	 * uring_cmd active first and prepare for handling the requeued
3469 	 * request
3470 */
3471 req = ublk_fill_io_cmd(io, cmd);
3472 ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3473 WARN_ON_ONCE(ret);
3474 if (likely(ublk_get_data(ubq, io, req))) {
3475 __ublk_prep_compl_io_cmd(io, req);
3476 return UBLK_IO_RES_OK;
3477 }
3478 break;
3479 default:
3480 goto out;
3481 }
3482 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3483 return -EIOCBQUEUED;
3484
3485 out:
3486 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3487 __func__, cmd_op, tag, ret, io ? io->flags : 0);
3488 return ret;
3489 }
3490
3491 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3492 u16 q_id, u16 tag, struct ublk_io *io)
3493 {
3494 struct request *req;
3495
3496 /*
3497 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3498 * which would overwrite it with io->cmd
3499 */
3500 req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3501 if (!req)
3502 return NULL;
3503
3504 if (!ublk_get_req_ref(io))
3505 return NULL;
3506
3507 if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3508 goto fail_put;
3509
3510 if (!ublk_rq_has_data(req))
3511 goto fail_put;
3512
3513 return req;
3514 fail_put:
3515 ublk_put_req_ref(io, req);
3516 return NULL;
3517 }
3518
3519 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3520 {
3521 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3522 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3523 int ret = -ECANCELED;
3524
3525 if (!tw.cancel)
3526 ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3527 if (ret != -EIOCBQUEUED)
3528 io_uring_cmd_done(cmd, ret, issue_flags);
3529 }
3530
3531 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3532 {
3533 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3534 ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3535 return 0;
3536 }
3537
3538 	/* a well-implemented server won't run into the unlocked path */
3539 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3540 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3541 return -EIOCBQUEUED;
3542 }
3543
3544 return ublk_ch_uring_cmd_local(cmd, issue_flags);
3545 }
3546
3547 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3548 const struct ublk_elem_header *elem)
3549 {
3550 const void *buf = elem;
3551
3552 if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3553 return *(const __u64 *)(buf + sizeof(*elem));
3554 return 0;
3555 }
3556
3557 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3558 const struct ublk_elem_header *elem)
3559 {
3560 const void *buf = elem;
3561
3562 if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3563 return *(const __u64 *)(buf + sizeof(*elem) +
3564 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3565 return -1;
3566 }
3567
3568 static struct ublk_auto_buf_reg
3569 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3570 const struct ublk_elem_header *elem)
3571 {
3572 struct ublk_auto_buf_reg reg = {
3573 .index = elem->buf_index,
3574 .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3575 UBLK_AUTO_BUF_REG_FALLBACK : 0,
3576 };
3577
3578 return reg;
3579 }
3580
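/*
 * Layout of one batch element, as implied by the accessors above: the fixed
 * struct ublk_elem_header comes first, optionally followed by an 8-byte
 * buffer address (UBLK_BATCH_F_HAS_BUF_ADDR) and then an 8-byte zone LBA
 * (UBLK_BATCH_F_HAS_ZONE_LBA), giving the 8/16/24-byte element sizes checked
 * in ublk_check_batch_cmd_flags().
 */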
3581 /*
3582  * 48 can hold any type of buffer element (8, 16 and 24 bytes) because
3583  * it is the least common multiple (LCM) of 8, 16 and 24
3584 */
3585 #define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10)
3586 struct ublk_batch_io_iter {
3587 void __user *uaddr;
3588 unsigned done, total;
3589 unsigned char elem_bytes;
3590 /* copy to this buffer from user space */
3591 unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3592 };
3593
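/*
 * Walk the user-supplied command buffer in UBLK_CMD_BATCH_TMP_BUF_SZ chunks:
 * ublk_walk_cmd_buf() copies each chunk into iter->buf and this helper then
 * invokes @cb once per element, advancing iter->done by the bytes consumed so
 * callers can tell how far a partially failed walk got.
 */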
3594 static inline int
3595 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3596 struct ublk_batch_io_iter *iter,
3597 const struct ublk_batch_io_data *data,
3598 unsigned bytes,
3599 int (*cb)(struct ublk_queue *q,
3600 const struct ublk_batch_io_data *data,
3601 const struct ublk_elem_header *elem))
3602 {
3603 unsigned int i;
3604 int ret = 0;
3605
3606 for (i = 0; i < bytes; i += iter->elem_bytes) {
3607 const struct ublk_elem_header *elem =
3608 (const struct ublk_elem_header *)&iter->buf[i];
3609
3610 if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3611 ret = -EINVAL;
3612 break;
3613 }
3614
3615 ret = cb(ubq, data, elem);
3616 if (unlikely(ret))
3617 break;
3618 }
3619
3620 iter->done += i;
3621 return ret;
3622 }
3623
3624 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3625 const struct ublk_batch_io_data *data,
3626 int (*cb)(struct ublk_queue *q,
3627 const struct ublk_batch_io_data *data,
3628 const struct ublk_elem_header *elem))
3629 {
3630 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3631 int ret = 0;
3632
3633 while (iter->done < iter->total) {
3634 unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3635
3636 if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3637 pr_warn("ublk%d: read batch cmd buffer failed\n",
3638 data->ub->dev_info.dev_id);
3639 return -EFAULT;
3640 }
3641
3642 ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3643 if (ret)
3644 return ret;
3645 }
3646 return 0;
3647 }
3648
3649 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3650 const struct ublk_batch_io_data *data,
3651 const struct ublk_elem_header *elem)
3652 {
3653 struct ublk_io *io = &ubq->ios[elem->tag];
3654
3655 /*
3656 * If queue was ready before this decrement, it won't be anymore,
3657 * so we need to decrement the queue ready count and restore the
3658 * canceling flag to prevent new requests from being queued.
3659 */
3660 if (ublk_queue_ready(ubq)) {
3661 data->ub->nr_queue_ready--;
3662 spin_lock(&ubq->cancel_lock);
3663 ubq->canceling = true;
3664 spin_unlock(&ubq->cancel_lock);
3665 }
3666 ubq->nr_io_ready--;
3667
3668 ublk_io_lock(io);
3669 io->flags = 0;
3670 ublk_io_unlock(io);
3671 return 0;
3672 }
3673
3674 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3675 const struct ublk_batch_io_data *data)
3676 {
3677 int ret;
3678
3679 	/* Re-process only what we've already processed, starting from the beginning */
3680 iter->total = iter->done;
3681 iter->done = 0;
3682
3683 ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3684 WARN_ON_ONCE(ret);
3685 }
3686
3687 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3688 const struct ublk_batch_io_data *data,
3689 const struct ublk_elem_header *elem)
3690 {
3691 struct ublk_io *io = &ubq->ios[elem->tag];
3692 const struct ublk_batch_io *uc = &data->header;
3693 union ublk_io_buf buf = { 0 };
3694 int ret;
3695
3696 if (ublk_dev_support_auto_buf_reg(data->ub))
3697 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3698 else if (ublk_dev_need_map_io(data->ub)) {
3699 buf.addr = ublk_batch_buf_addr(uc, elem);
3700
3701 ret = ublk_check_fetch_buf(data->ub, buf.addr);
3702 if (ret)
3703 return ret;
3704 }
3705
3706 ublk_io_lock(io);
3707 ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3708 if (!ret)
3709 io->buf = buf;
3710 ublk_io_unlock(io);
3711
3712 if (!ret)
3713 ublk_mark_io_ready(data->ub, ubq->q_id, io);
3714
3715 return ret;
3716 }
3717
3718 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3719 {
3720 const struct ublk_batch_io *uc = &data->header;
3721 struct io_uring_cmd *cmd = data->cmd;
3722 struct ublk_batch_io_iter iter = {
3723 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3724 .total = uc->nr_elem * uc->elem_bytes,
3725 .elem_bytes = uc->elem_bytes,
3726 };
3727 int ret;
3728
3729 mutex_lock(&data->ub->mutex);
3730 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3731
3732 if (ret && iter.done)
3733 ublk_batch_revert_prep_cmd(&iter, data);
3734 mutex_unlock(&data->ub->mutex);
3735 return ret;
3736 }
3737
3738 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3739 struct ublk_io *io,
3740 union ublk_io_buf *buf)
3741 {
3742 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3743 return -EBUSY;
3744
3745 /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3746 if (ublk_need_map_io(ubq) && !buf->addr)
3747 return -EINVAL;
3748 return 0;
3749 }
3750
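/*
 * Commit one completed io from a UBLK_U_IO_COMMIT_IO_CMDS batch: record the
 * result and buffer info under the io lock, drop any auto-registered buffer,
 * set the zone-append LBA if present, and complete the request, batching
 * completions through data->iob where possible.
 */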
3751 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3752 const struct ublk_batch_io_data *data,
3753 const struct ublk_elem_header *elem)
3754 {
3755 struct ublk_io *io = &ubq->ios[elem->tag];
3756 const struct ublk_batch_io *uc = &data->header;
3757 u16 buf_idx = UBLK_INVALID_BUF_IDX;
3758 union ublk_io_buf buf = { 0 };
3759 struct request *req = NULL;
3760 bool auto_reg = false;
3761 bool compl = false;
3762 int ret;
3763
3764 if (ublk_dev_support_auto_buf_reg(data->ub)) {
3765 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3766 auto_reg = true;
3767 } else if (ublk_dev_need_map_io(data->ub))
3768 buf.addr = ublk_batch_buf_addr(uc, elem);
3769
3770 ublk_io_lock(io);
3771 ret = ublk_batch_commit_io_check(ubq, io, &buf);
3772 if (!ret) {
3773 io->res = elem->result;
3774 io->buf = buf;
3775 req = ublk_fill_io_cmd(io, data->cmd);
3776
3777 if (auto_reg)
3778 ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3779 compl = ublk_need_complete_req(data->ub, io);
3780 }
3781 ublk_io_unlock(io);
3782
3783 if (unlikely(ret)) {
3784 pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3785 __func__, data->ub->dev_info.dev_id, ubq->q_id,
3786 elem->tag, ret);
3787 return ret;
3788 }
3789
3790 if (buf_idx != UBLK_INVALID_BUF_IDX)
3791 io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3792 if (req_op(req) == REQ_OP_ZONE_APPEND)
3793 req->__sector = ublk_batch_zone_lba(uc, elem);
3794 if (compl)
3795 __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3796 return 0;
3797 }
3798
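/*
 * Note the partial-progress semantics: if any elements were consumed, the
 * number of processed bytes (iter.done) is returned even when a later element
 * failed; otherwise the error itself is returned.  The io_comp_batch
 * accumulated by ublk_batch_commit_io() is flushed before returning.
 */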
3799 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3800 {
3801 const struct ublk_batch_io *uc = &data->header;
3802 struct io_uring_cmd *cmd = data->cmd;
3803 struct ublk_batch_io_iter iter = {
3804 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3805 .total = uc->nr_elem * uc->elem_bytes,
3806 .elem_bytes = uc->elem_bytes,
3807 };
3808 DEFINE_IO_COMP_BATCH(iob);
3809 int ret;
3810
3811 data->iob = &iob;
3812 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3813
3814 if (iob.complete)
3815 iob.complete(&iob);
3816
3817 return iter.done == 0 ? ret : iter.done;
3818 }
3819
3820 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3821 {
3822 unsigned elem_bytes = sizeof(struct ublk_elem_header);
3823
3824 if (uc->flags & ~UBLK_BATCH_F_ALL)
3825 return -EINVAL;
3826
3827 /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
3828 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3829 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3830 return -EINVAL;
3831
3832 elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3833 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3834 if (uc->elem_bytes != elem_bytes)
3835 return -EINVAL;
3836 return 0;
3837 }
3838
3839 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3840 {
3841 const struct ublk_batch_io *uc = &data->header;
3842
3843 if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3844 return -EINVAL;
3845
3846 if (uc->nr_elem > data->ub->dev_info.queue_depth)
3847 return -E2BIG;
3848
3849 if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3850 !ublk_dev_is_zoned(data->ub))
3851 return -EINVAL;
3852
3853 if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3854 !ublk_dev_need_map_io(data->ub))
3855 return -EINVAL;
3856
3857 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3858 !ublk_dev_support_auto_buf_reg(data->ub))
3859 return -EINVAL;
3860
3861 return ublk_check_batch_cmd_flags(uc);
3862 }
3863
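/*
 * Queue a new fetch command on ubq->fcmd_head.  If that makes another fetch
 * command current (__ublk_acquire_fcmd()), dispatch pending events right away
 * when both commands share an io_ring_ctx, otherwise defer the dispatch to
 * task work on the owning context.
 */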
3864 static int ublk_batch_attach(struct ublk_queue *ubq,
3865 struct ublk_batch_io_data *data,
3866 struct ublk_batch_fetch_cmd *fcmd)
3867 {
3868 struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3869 bool free = false;
3870 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3871
3872 spin_lock(&ubq->evts_lock);
3873 if (unlikely(ubq->force_abort || ubq->canceling)) {
3874 free = true;
3875 } else {
3876 list_add_tail(&fcmd->node, &ubq->fcmd_head);
3877 new_fcmd = __ublk_acquire_fcmd(ubq);
3878 }
3879 spin_unlock(&ubq->evts_lock);
3880
3881 if (unlikely(free)) {
3882 ublk_batch_free_fcmd(fcmd);
3883 return -ENODEV;
3884 }
3885
3886 pdu->ubq = ubq;
3887 pdu->fcmd = fcmd;
3888 io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3889
3890 if (!new_fcmd)
3891 goto out;
3892
3893 /*
3894 	 * If the two fetch commands originate from the same io_ring_ctx,
3895 	 * run batch dispatch directly. Otherwise, schedule task work to
3896 	 * do it.
3897 */
3898 if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3899 io_uring_cmd_ctx_handle(fcmd->cmd)) {
3900 data->cmd = new_fcmd->cmd;
3901 ublk_batch_dispatch(ubq, data, new_fcmd);
3902 } else {
3903 io_uring_cmd_complete_in_task(new_fcmd->cmd,
3904 ublk_batch_tw_cb);
3905 }
3906 out:
3907 return -EIOCBQUEUED;
3908 }
3909
3910 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3911 {
3912 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3913 struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3914
3915 if (!fcmd)
3916 return -ENOMEM;
3917
3918 return ublk_batch_attach(ubq, data, fcmd);
3919 }
3920
3921 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3922 {
3923 const struct ublk_batch_io *uc = &data->header;
3924
3925 if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3926 return -EINVAL;
3927
3928 if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3929 return -EINVAL;
3930
3931 if (uc->elem_bytes != sizeof(__u16))
3932 return -EINVAL;
3933
3934 if (uc->flags != 0)
3935 return -EINVAL;
3936
3937 return 0;
3938 }
3939
3940 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3941 unsigned int issue_flags)
3942 {
3943 const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3944 struct ublksrv_io_cmd);
3945 struct ublk_device *ub = cmd->file->private_data;
3946 unsigned tag = READ_ONCE(ub_cmd->tag);
3947 unsigned q_id = READ_ONCE(ub_cmd->q_id);
3948 unsigned index = READ_ONCE(ub_cmd->addr);
3949 struct ublk_queue *ubq;
3950 struct ublk_io *io;
3951
3952 if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3953 return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3954
3955 if (q_id >= ub->dev_info.nr_hw_queues)
3956 return -EINVAL;
3957
3958 if (tag >= ub->dev_info.queue_depth)
3959 return -EINVAL;
3960
3961 if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3962 return -EOPNOTSUPP;
3963
3964 ubq = ublk_get_queue(ub, q_id);
3965 io = &ubq->ios[tag];
3966 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3967 issue_flags);
3968 }
3969
3970 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3971 unsigned int issue_flags)
3972 {
3973 const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3974 struct ublk_batch_io);
3975 struct ublk_device *ub = cmd->file->private_data;
3976 struct ublk_batch_io_data data = {
3977 .ub = ub,
3978 .cmd = cmd,
3979 .header = (struct ublk_batch_io) {
3980 .q_id = READ_ONCE(uc->q_id),
3981 .flags = READ_ONCE(uc->flags),
3982 .nr_elem = READ_ONCE(uc->nr_elem),
3983 .elem_bytes = READ_ONCE(uc->elem_bytes),
3984 },
3985 .issue_flags = issue_flags,
3986 };
3987 u32 cmd_op = cmd->cmd_op;
3988 int ret = -EINVAL;
3989
3990 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3991 ublk_batch_cancel_fn(cmd, issue_flags);
3992 return 0;
3993 }
3994
3995 switch (cmd_op) {
3996 case UBLK_U_IO_PREP_IO_CMDS:
3997 ret = ublk_check_batch_cmd(&data);
3998 if (ret)
3999 goto out;
4000 ret = ublk_handle_batch_prep_cmd(&data);
4001 break;
4002 case UBLK_U_IO_COMMIT_IO_CMDS:
4003 ret = ublk_check_batch_cmd(&data);
4004 if (ret)
4005 goto out;
4006 ret = ublk_handle_batch_commit_cmd(&data);
4007 break;
4008 case UBLK_U_IO_FETCH_IO_CMDS:
4009 ret = ublk_validate_batch_fetch_cmd(&data);
4010 if (ret)
4011 goto out;
4012 ret = ublk_handle_batch_fetch_cmd(&data);
4013 break;
4014 default:
4015 ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
4016 break;
4017 }
4018 out:
4019 return ret;
4020 }
4021
4022 static inline bool ublk_check_ubuf_dir(const struct request *req,
4023 int ubuf_dir)
4024 {
4025 /* copy ubuf to request pages */
4026 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
4027 ubuf_dir == ITER_SOURCE)
4028 return true;
4029
4030 /* copy request pages to ubuf */
4031 if ((req_op(req) == REQ_OP_WRITE ||
4032 req_op(req) == REQ_OP_ZONE_APPEND) &&
4033 ubuf_dir == ITER_DEST)
4034 return true;
4035
4036 return false;
4037 }
4038
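/*
 * pread()/pwrite() based data copy between the server and a request.  The
 * file offset (iocb->ki_pos) encodes the queue id, tag and offset within the
 * io buffer via the ublk_pos_to_*() helpers, plus UBLKSRV_IO_INTEGRITY_FLAG
 * to select the integrity buffer instead of the data buffer.
 */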
4039 static ssize_t
4040 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
4041 {
4042 struct ublk_device *ub = iocb->ki_filp->private_data;
4043 struct ublk_queue *ubq;
4044 struct request *req;
4045 struct ublk_io *io;
4046 unsigned data_len;
4047 bool is_integrity;
4048 bool on_daemon;
4049 size_t buf_off;
4050 u16 tag, q_id;
4051 ssize_t ret;
4052
4053 if (!user_backed_iter(iter))
4054 return -EACCES;
4055
4056 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
4057 return -EACCES;
4058
4059 tag = ublk_pos_to_tag(iocb->ki_pos);
4060 q_id = ublk_pos_to_hwq(iocb->ki_pos);
4061 buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
4062 is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
4063
4064 if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
4065 return -EINVAL;
4066
4067 if (q_id >= ub->dev_info.nr_hw_queues)
4068 return -EINVAL;
4069
4070 ubq = ublk_get_queue(ub, q_id);
4071 if (!ublk_dev_support_user_copy(ub))
4072 return -EACCES;
4073
4074 if (tag >= ub->dev_info.queue_depth)
4075 return -EINVAL;
4076
4077 io = &ubq->ios[tag];
4078 on_daemon = current == READ_ONCE(io->task);
4079 if (on_daemon) {
4080 		/* On the daemon task the io can't be completed concurrently, so skip the ref */
4081 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
4082 return -EINVAL;
4083
4084 req = io->req;
4085 if (!ublk_rq_has_data(req))
4086 return -EINVAL;
4087 } else {
4088 req = __ublk_check_and_get_req(ub, q_id, tag, io);
4089 if (!req)
4090 return -EINVAL;
4091 }
4092
4093 if (is_integrity) {
4094 struct blk_integrity *bi = &req->q->limits.integrity;
4095
4096 data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
4097 } else {
4098 data_len = blk_rq_bytes(req);
4099 }
4100 if (buf_off > data_len) {
4101 ret = -EINVAL;
4102 goto out;
4103 }
4104
4105 if (!ublk_check_ubuf_dir(req, dir)) {
4106 ret = -EACCES;
4107 goto out;
4108 }
4109
4110 if (is_integrity)
4111 ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4112 else
4113 ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4114
4115 out:
4116 if (!on_daemon)
4117 ublk_put_req_ref(io, req);
4118 return ret;
4119 }
4120
4121 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4122 {
4123 return ublk_user_copy(iocb, to, ITER_DEST);
4124 }
4125
4126 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4127 {
4128 return ublk_user_copy(iocb, from, ITER_SOURCE);
4129 }
4130
4131 static const struct file_operations ublk_ch_fops = {
4132 .owner = THIS_MODULE,
4133 .open = ublk_ch_open,
4134 .release = ublk_ch_release,
4135 .read_iter = ublk_ch_read_iter,
4136 .write_iter = ublk_ch_write_iter,
4137 .uring_cmd = ublk_ch_uring_cmd,
4138 .mmap = ublk_ch_mmap,
4139 };
4140
4141 static const struct file_operations ublk_ch_batch_io_fops = {
4142 .owner = THIS_MODULE,
4143 .open = ublk_ch_open,
4144 .release = ublk_ch_release,
4145 .read_iter = ublk_ch_read_iter,
4146 .write_iter = ublk_ch_write_iter,
4147 .uring_cmd = ublk_ch_batch_io_uring_cmd,
4148 .mmap = ublk_ch_mmap,
4149 };
4150
4151 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4152 {
4153 int size, i;
4154
4155 size = ublk_queue_cmd_buf_size(ub);
4156
4157 for (i = 0; i < ubq->q_depth; i++) {
4158 struct ublk_io *io = &ubq->ios[i];
4159 if (io->task)
4160 put_task_struct(io->task);
4161 WARN_ON_ONCE(refcount_read(&io->ref));
4162 WARN_ON_ONCE(io->task_registered_buffers);
4163 }
4164
4165 if (ubq->io_cmd_buf)
4166 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4167
4168 if (ublk_dev_support_batch_io(ub))
4169 ublk_io_evts_deinit(ubq);
4170
4171 kvfree(ubq);
4172 }
4173
4174 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4175 {
4176 struct ublk_queue *ubq = ub->queues[q_id];
4177
4178 if (!ubq)
4179 return;
4180
4181 __ublk_deinit_queue(ub, ubq);
4182 ub->queues[q_id] = NULL;
4183 }
4184
4185 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4186 {
4187 unsigned int cpu;
4188
4189 /* Find first CPU mapped to this queue */
4190 for_each_possible_cpu(cpu) {
4191 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4192 return cpu_to_node(cpu);
4193 }
4194
4195 return NUMA_NO_NODE;
4196 }
4197
4198 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4199 {
4200 int depth = ub->dev_info.queue_depth;
4201 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4202 struct ublk_queue *ubq;
4203 struct page *page;
4204 int numa_node;
4205 int size, i, ret;
4206
4207 /* Determine NUMA node based on queue's CPU affinity */
4208 numa_node = ublk_get_queue_numa_node(ub, q_id);
4209
4210 /* Allocate queue structure on local NUMA node */
4211 ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4212 numa_node);
4213 if (!ubq)
4214 return -ENOMEM;
4215
4216 spin_lock_init(&ubq->cancel_lock);
4217 ubq->flags = ub->dev_info.flags;
4218 ubq->q_id = q_id;
4219 ubq->q_depth = depth;
4220 size = ublk_queue_cmd_buf_size(ub);
4221
4222 /* Allocate I/O command buffer on local NUMA node */
4223 page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4224 if (!page) {
4225 kvfree(ubq);
4226 return -ENOMEM;
4227 }
4228 ubq->io_cmd_buf = page_address(page);
4229
4230 for (i = 0; i < ubq->q_depth; i++)
4231 spin_lock_init(&ubq->ios[i].lock);
4232
4233 if (ublk_dev_support_batch_io(ub)) {
4234 ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4235 if (ret)
4236 goto fail;
4237 INIT_LIST_HEAD(&ubq->fcmd_head);
4238 }
4239 ub->queues[q_id] = ubq;
4240 ubq->dev = ub;
4241
4242 return 0;
4243 fail:
4244 __ublk_deinit_queue(ub, ubq);
4245 return ret;
4246 }
4247
4248 static void ublk_deinit_queues(struct ublk_device *ub)
4249 {
4250 int i;
4251
4252 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4253 ublk_deinit_queue(ub, i);
4254 }
4255
4256 static int ublk_init_queues(struct ublk_device *ub)
4257 {
4258 int i, ret;
4259
4260 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4261 ret = ublk_init_queue(ub, i);
4262 if (ret)
4263 goto fail;
4264 }
4265
4266 init_completion(&ub->completion);
4267 return 0;
4268
4269 fail:
4270 ublk_deinit_queues(ub);
4271 return ret;
4272 }
4273
4274 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4275 {
4276 int i = idx;
4277 int err;
4278
4279 spin_lock(&ublk_idr_lock);
4280 	/* allocate id; if idx >= 0, we're requesting that specific id */
4281 if (i >= 0) {
4282 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4283 if (err == -ENOSPC)
4284 err = -EEXIST;
4285 } else {
4286 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4287 GFP_NOWAIT);
4288 }
4289 spin_unlock(&ublk_idr_lock);
4290
4291 if (err >= 0)
4292 ub->ub_number = err;
4293
4294 return err;
4295 }
4296
4297 static void ublk_free_dev_number(struct ublk_device *ub)
4298 {
4299 spin_lock(&ublk_idr_lock);
4300 idr_remove(&ublk_index_idr, ub->ub_number);
4301 wake_up_all(&ublk_idr_wq);
4302 spin_unlock(&ublk_idr_lock);
4303 }
4304
4305 static void ublk_cdev_rel(struct device *dev)
4306 {
4307 struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4308
4309 ublk_buf_cleanup(ub);
4310 blk_mq_free_tag_set(&ub->tag_set);
4311 ublk_deinit_queues(ub);
4312 ublk_free_dev_number(ub);
4313 mutex_destroy(&ub->mutex);
4314 mutex_destroy(&ub->cancel_mutex);
4315 kfree(ub);
4316 }
4317
4318 static int ublk_add_chdev(struct ublk_device *ub)
4319 {
4320 struct device *dev = &ub->cdev_dev;
4321 int minor = ub->ub_number;
4322 int ret;
4323
4324 dev->parent = ublk_misc.this_device;
4325 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4326 dev->class = &ublk_chr_class;
4327 dev->release = ublk_cdev_rel;
4328 device_initialize(dev);
4329
4330 ret = dev_set_name(dev, "ublkc%d", minor);
4331 if (ret)
4332 goto fail;
4333
4334 if (ublk_dev_support_batch_io(ub))
4335 cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4336 else
4337 cdev_init(&ub->cdev, &ublk_ch_fops);
4338 ret = cdev_device_add(&ub->cdev, dev);
4339 if (ret)
4340 goto fail;
4341
4342 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4343 unprivileged_ublks_added++;
4344 return 0;
4345 fail:
4346 put_device(dev);
4347 return ret;
4348 }
4349
4350 /* align max io buffer size with PAGE_SIZE */
4351 static void ublk_align_max_io_size(struct ublk_device *ub)
4352 {
4353 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4354
4355 ub->dev_info.max_io_buf_bytes =
4356 round_down(max_io_bytes, PAGE_SIZE);
4357 }
4358
4359 static int ublk_add_tag_set(struct ublk_device *ub)
4360 {
4361 if (ublk_dev_support_batch_io(ub))
4362 ub->tag_set.ops = &ublk_batch_mq_ops;
4363 else
4364 ub->tag_set.ops = &ublk_mq_ops;
4365 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4366 ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4367 ub->tag_set.numa_node = NUMA_NO_NODE;
4368 ub->tag_set.driver_data = ub;
4369 return blk_mq_alloc_tag_set(&ub->tag_set);
4370 }
4371
4372 static void ublk_remove(struct ublk_device *ub)
4373 {
4374 bool unprivileged;
4375
4376 ublk_stop_dev(ub);
4377 cdev_device_del(&ub->cdev, &ub->cdev_dev);
4378 unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4379 ublk_put_device(ub);
4380
4381 if (unprivileged)
4382 unprivileged_ublks_added--;
4383 }
4384
4385 static struct ublk_device *ublk_get_device_from_id(int idx)
4386 {
4387 struct ublk_device *ub = NULL;
4388
4389 if (idx < 0)
4390 return NULL;
4391
4392 spin_lock(&ublk_idr_lock);
4393 ub = idr_find(&ublk_index_idr, idx);
4394 if (ub)
4395 ub = ublk_get_device(ub);
4396 spin_unlock(&ublk_idr_lock);
4397
4398 return ub;
4399 }
4400
4401 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4402 {
4403 rcu_read_lock();
4404 ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4405 rcu_read_unlock();
4406
4407 return ub->ublksrv_tgid == ublksrv_pid;
4408 }
4409
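/*
 * START_DEV: build queue_limits from the validated parameters, wait for the
 * server to make the device ready (ub->completion), check that data[0] names
 * the registered server task, then allocate and add the gendisk.
 */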
4410 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4411 const struct ublksrv_ctrl_cmd *header)
4412 {
4413 const struct ublk_param_basic *p = &ub->params.basic;
4414 int ublksrv_pid = (int)header->data[0];
4415 struct queue_limits lim = {
4416 .logical_block_size = 1 << p->logical_bs_shift,
4417 .physical_block_size = 1 << p->physical_bs_shift,
4418 .io_min = 1 << p->io_min_shift,
4419 .io_opt = 1 << p->io_opt_shift,
4420 .max_hw_sectors = p->max_sectors,
4421 .chunk_sectors = p->chunk_sectors,
4422 .virt_boundary_mask = p->virt_boundary_mask,
4423 .max_segments = USHRT_MAX,
4424 .max_segment_size = UINT_MAX,
4425 .dma_alignment = 3,
4426 };
4427 struct gendisk *disk;
4428 int ret = -EINVAL;
4429
4430 if (ublksrv_pid <= 0)
4431 return -EINVAL;
4432 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4433 return -EINVAL;
4434
4435 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4436 const struct ublk_param_discard *pd = &ub->params.discard;
4437
4438 lim.discard_alignment = pd->discard_alignment;
4439 lim.discard_granularity = pd->discard_granularity;
4440 lim.max_hw_discard_sectors = pd->max_discard_sectors;
4441 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4442 lim.max_discard_segments = pd->max_discard_segments;
4443 }
4444
4445 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4446 const struct ublk_param_zoned *p = &ub->params.zoned;
4447
4448 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4449 return -EOPNOTSUPP;
4450
4451 lim.features |= BLK_FEAT_ZONED;
4452 lim.max_active_zones = p->max_active_zones;
4453 lim.max_open_zones = p->max_open_zones;
4454 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4455 }
4456
4457 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4458 lim.features |= BLK_FEAT_WRITE_CACHE;
4459 if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4460 lim.features |= BLK_FEAT_FUA;
4461 }
4462
4463 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4464 lim.features |= BLK_FEAT_ROTATIONAL;
4465
4466 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4467 lim.dma_alignment = ub->params.dma.alignment;
4468
4469 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4470 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4471 lim.max_segment_size = ub->params.seg.max_segment_size;
4472 lim.max_segments = ub->params.seg.max_segments;
4473 }
4474
4475 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4476 const struct ublk_param_integrity *p = &ub->params.integrity;
4477 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4478
4479 lim.max_integrity_segments =
4480 p->max_integrity_segments ?: USHRT_MAX;
4481 lim.integrity = (struct blk_integrity) {
4482 .flags = ublk_integrity_flags(p->flags),
4483 .csum_type = ublk_integrity_csum_type(p->csum_type),
4484 .metadata_size = p->metadata_size,
4485 .pi_offset = p->pi_offset,
4486 .interval_exp = p->interval_exp,
4487 .tag_size = p->tag_size,
4488 .pi_tuple_size = pi_tuple_size,
4489 };
4490 }
4491
4492 if (wait_for_completion_interruptible(&ub->completion) != 0)
4493 return -EINTR;
4494
4495 if (!ublk_validate_user_pid(ub, ublksrv_pid))
4496 return -EINVAL;
4497
4498 mutex_lock(&ub->mutex);
4499 	/* the device may no longer be ready in case of F_BATCH */
4500 if (!ublk_dev_ready(ub)) {
4501 ret = -EINVAL;
4502 goto out_unlock;
4503 }
4504 if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4505 test_bit(UB_STATE_USED, &ub->state)) {
4506 ret = -EEXIST;
4507 goto out_unlock;
4508 }
4509
4510 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4511 if (IS_ERR(disk)) {
4512 ret = PTR_ERR(disk);
4513 goto out_unlock;
4514 }
4515 sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4516 disk->fops = &ub_fops;
4517 disk->private_data = ub;
4518
4519 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4520 ub->ub_disk = disk;
4521
4522 ublk_apply_params(ub);
4523
4524 /*
4525 * Suppress partition scan to avoid potential IO hang.
4526 *
4527 	 * If a ublk server error occurs during the partition scan, the IO may
4528 * wait while holding ub->mutex, which can deadlock with other
4529 * operations that need the mutex. Defer partition scan to async
4530 * work.
4531 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4532 * permanently.
4533 */
4534 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4535
4536 ublk_get_device(ub);
4537 ub->dev_info.state = UBLK_S_DEV_LIVE;
4538
4539 if (ublk_dev_is_zoned(ub)) {
4540 ret = ublk_revalidate_disk_zones(ub);
4541 if (ret)
4542 goto out_put_cdev;
4543 }
4544
4545 ret = add_disk(disk);
4546 if (ret)
4547 goto out_put_cdev;
4548
4549 set_bit(UB_STATE_USED, &ub->state);
4550
4551 /* Skip partition scan if disabled by user */
4552 if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4553 		/* Don't clear it for unprivileged daemons; see the comment above */
4554 if (!ub->unprivileged_daemons)
4555 clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4556 } else {
4557 /* Schedule async partition scan for trusted daemons */
4558 if (!ub->unprivileged_daemons)
4559 schedule_work(&ub->partition_scan_work);
4560 }
4561
4562 out_put_cdev:
4563 if (ret) {
4564 ublk_detach_disk(ub);
4565 ublk_put_device(ub);
4566 }
4567 if (ret)
4568 put_disk(disk);
4569 out_unlock:
4570 mutex_unlock(&ub->mutex);
4571 return ret;
4572 }
4573
4574 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4575 const struct ublksrv_ctrl_cmd *header)
4576 {
4577 void __user *argp = (void __user *)(unsigned long)header->addr;
4578 cpumask_var_t cpumask;
4579 unsigned long queue;
4580 unsigned int retlen;
4581 unsigned int i;
4582 int ret;
4583
4584 if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4585 return -EINVAL;
4586 if (header->len & (sizeof(unsigned long)-1))
4587 return -EINVAL;
4588 if (!header->addr)
4589 return -EINVAL;
4590
4591 queue = header->data[0];
4592 if (queue >= ub->dev_info.nr_hw_queues)
4593 return -EINVAL;
4594
4595 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4596 return -ENOMEM;
4597
4598 for_each_possible_cpu(i) {
4599 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4600 cpumask_set_cpu(i, cpumask);
4601 }
4602
4603 ret = -EFAULT;
4604 retlen = min_t(unsigned short, header->len, cpumask_size());
4605 if (copy_to_user(argp, cpumask, retlen))
4606 goto out_free_cpumask;
4607 if (retlen != header->len &&
4608 clear_user(argp + retlen, header->len - retlen))
4609 goto out_free_cpumask;
4610
4611 ret = 0;
4612 out_free_cpumask:
4613 free_cpumask_var(cpumask);
4614 return ret;
4615 }
4616
4617 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4618 {
4619 pr_devel("%s: dev id %d flags %llx\n", __func__,
4620 info->dev_id, info->flags);
4621 pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4622 info->nr_hw_queues, info->queue_depth);
4623 }
4624
4625 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
4626 {
4627 void __user *argp = (void __user *)(unsigned long)header->addr;
4628 struct ublksrv_ctrl_dev_info info;
4629 struct ublk_device *ub;
4630 int ret = -EINVAL;
4631
4632 if (header->len < sizeof(info) || !header->addr)
4633 return -EINVAL;
4634 if (header->queue_id != (u16)-1) {
4635 pr_warn("%s: queue_id is wrong %x\n",
4636 __func__, header->queue_id);
4637 return -EINVAL;
4638 }
4639
4640 if (copy_from_user(&info, argp, sizeof(info)))
4641 return -EFAULT;
4642
4643 if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
4644 info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
4645 return -EINVAL;
4646
4647 if (capable(CAP_SYS_ADMIN))
4648 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
4649 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
4650 return -EPERM;
4651
4652 /* forbid nonsense combinations of recovery flags */
4653 switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
4654 case 0:
4655 case UBLK_F_USER_RECOVERY:
4656 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
4657 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
4658 break;
4659 default:
4660 pr_warn("%s: invalid recovery flags %llx\n", __func__,
4661 info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
4662 return -EINVAL;
4663 }
4664
4665 if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
4666 pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
4667 return -EINVAL;
4668 }
4669
4670 /*
4671 	 * an unprivileged device can't be trusted, but RECOVERY and
4672 	 * RECOVERY_REISSUE may still hang error handling, so recovery
4673 	 * features can't be supported for unprivileged ublk now
4674 *
4675 * TODO: provide forward progress for RECOVERY handler, so that
4676 * unprivileged device can benefit from it
4677 */
4678 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
4679 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
4680 UBLK_F_USER_RECOVERY);
4681
4682 /*
4683 		 * For USER_COPY, we depend on userspace to fill the request
4684 		 * buffer by pwrite() to the ublk char device, which can't be
4685 		 * used for an unprivileged device
4686 *
4687 * Same with zero copy or auto buffer register.
4688 */
4689 if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4690 UBLK_F_AUTO_BUF_REG))
4691 return -EINVAL;
4692 }
4693
4694 /* User copy is required to access integrity buffer */
4695 if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
4696 return -EINVAL;
4697
4698 /* the created device is always owned by current user */
4699 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
4700
4701 if (header->dev_id != info.dev_id) {
4702 pr_warn("%s: dev id not match %u %u\n",
4703 __func__, header->dev_id, info.dev_id);
4704 return -EINVAL;
4705 }
4706
4707 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
4708 pr_warn("%s: dev id is too large. Max supported is %d\n",
4709 __func__, UBLK_MAX_UBLKS - 1);
4710 return -EINVAL;
4711 }
4712
4713 ublk_dump_dev_info(&info);
4714
4715 ret = mutex_lock_killable(&ublk_ctl_mutex);
4716 if (ret)
4717 return ret;
4718
4719 ret = -EACCES;
4720 if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
4721 unprivileged_ublks_added >= unprivileged_ublks_max)
4722 goto out_unlock;
4723
4724 ret = -ENOMEM;
4725 ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
4726 if (!ub)
4727 goto out_unlock;
4728 mutex_init(&ub->mutex);
4729 spin_lock_init(&ub->lock);
4730 mutex_init(&ub->cancel_mutex);
4731 mt_init(&ub->buf_tree);
4732 ida_init(&ub->buf_ida);
4733 INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
4734
4735 ret = ublk_alloc_dev_number(ub, header->dev_id);
4736 if (ret < 0)
4737 goto out_free_ub;
4738
4739 memcpy(&ub->dev_info, &info, sizeof(info));
4740
4741 /* update device id */
4742 ub->dev_info.dev_id = ub->ub_number;
4743
4744 /*
4745 	 * The 64-bit flags will be copied back to userspace as the feature
4746 	 * negotiation result, so clear the flags which the driver doesn't
4747 	 * support yet; then userspace can get the correct flags
4748 	 * (features) to handle.
4749 */
4750 ub->dev_info.flags &= UBLK_F_ALL;
4751
4752 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
4753 UBLK_F_URING_CMD_COMP_IN_TASK |
4754 UBLK_F_PER_IO_DAEMON |
4755 UBLK_F_BUF_REG_OFF_DAEMON |
4756 UBLK_F_SAFE_STOP_DEV;
4757
4758 /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
4759 if (ublk_dev_support_batch_io(ub))
4760 ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
4761
4762 /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
4763 if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4764 UBLK_F_AUTO_BUF_REG))
4765 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4766
4767 /* UBLK_F_BATCH_IO doesn't support GET_DATA */
4768 if (ublk_dev_support_batch_io(ub))
4769 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4770
4771 /*
4772 	 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
4773 * returning write_append_lba, which is only allowed in case of
4774 * user copy or zero copy
4775 */
4776 if (ublk_dev_is_zoned(ub) &&
4777 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
4778 (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
4779 ret = -EINVAL;
4780 goto out_free_dev_number;
4781 }
4782
4783 ub->dev_info.nr_hw_queues = min_t(unsigned int,
4784 ub->dev_info.nr_hw_queues, nr_cpu_ids);
4785 ublk_align_max_io_size(ub);
4786
4787 ret = ublk_add_tag_set(ub);
4788 if (ret)
4789 goto out_free_dev_number;
4790
4791 ret = ublk_init_queues(ub);
4792 if (ret)
4793 goto out_free_tag_set;
4794
4795 ret = -EFAULT;
4796 if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
4797 goto out_deinit_queues;
4798
4799 /*
4800 	 * Add the char dev so that the ublksrv daemon can be set up.
4801 	 * ublk_add_chdev() will clean up everything if it fails.
4802 */
4803 ret = ublk_add_chdev(ub);
4804 goto out_unlock;
4805
4806 out_deinit_queues:
4807 ublk_deinit_queues(ub);
4808 out_free_tag_set:
4809 blk_mq_free_tag_set(&ub->tag_set);
4810 out_free_dev_number:
4811 ublk_free_dev_number(ub);
4812 out_free_ub:
4813 mutex_destroy(&ub->mutex);
4814 mutex_destroy(&ub->cancel_mutex);
4815 kfree(ub);
4816 out_unlock:
4817 mutex_unlock(&ublk_ctl_mutex);
4818 return ret;
4819 }
4820
4821 static inline bool ublk_idr_freed(int id)
4822 {
4823 void *ptr;
4824
4825 spin_lock(&ublk_idr_lock);
4826 ptr = idr_find(&ublk_index_idr, id);
4827 spin_unlock(&ublk_idr_lock);
4828
4829 return ptr == NULL;
4830 }
4831
4832 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4833 {
4834 struct ublk_device *ub = *p_ub;
4835 int idx = ub->ub_number;
4836 int ret;
4837
4838 ret = mutex_lock_killable(&ublk_ctl_mutex);
4839 if (ret)
4840 return ret;
4841
4842 if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4843 ublk_remove(ub);
4844 set_bit(UB_STATE_DELETED, &ub->state);
4845 }
4846
4847 /* Mark the reference as consumed */
4848 *p_ub = NULL;
4849 ublk_put_device(ub);
4850 mutex_unlock(&ublk_ctl_mutex);
4851
4852 /*
4853 	 * Wait until the idr entry is removed, so the index can be reused
4854 	 * after the DEL_DEV command returns.
4855 	 *
4856 	 * If we return because of a user interrupt, a future delete command
4857 	 * may come:
4858 	 *
4859 	 *   - the device number isn't freed: this device won't (and needn't)
4860 	 *   be deleted again, since UB_STATE_DELETED is set, and the device
4861 	 *   will be released after the last reference is dropped
4862 	 *
4863 	 *   - the device number is already freed: we will not find this
4864 	 *   device via ublk_get_device_from_id()
4865 */
4866 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4867 return -EINTR;
4868 return 0;
4869 }
4870
4871 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4872 const struct ublksrv_ctrl_cmd *header)
4873 {
4874 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4875 __func__, cmd_op, header->dev_id, header->queue_id,
4876 header->data[0], header->addr, header->len);
4877 }
4878
4879 static void ublk_ctrl_stop_dev(struct ublk_device *ub)
4880 {
4881 ublk_stop_dev(ub);
4882 }
4883
4884 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4885 {
4886 struct gendisk *disk;
4887 int ret = 0;
4888
4889 disk = ublk_get_disk(ub);
4890 if (!disk)
4891 return -ENODEV;
4892
4893 mutex_lock(&disk->open_mutex);
4894 if (disk_openers(disk) > 0) {
4895 ret = -EBUSY;
4896 goto unlock;
4897 }
4898 ub->block_open = true;
4899 /* release open_mutex as del_gendisk() will reacquire it */
4900 mutex_unlock(&disk->open_mutex);
4901
4902 ublk_ctrl_stop_dev(ub);
4903 goto out;
4904
4905 unlock:
4906 mutex_unlock(&disk->open_mutex);
4907 out:
4908 ublk_put_disk(disk);
4909 return ret;
4910 }
4911
4912 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4913 const struct ublksrv_ctrl_cmd *header)
4914 {
4915 struct task_struct *p;
4916 struct pid *pid;
4917 struct ublksrv_ctrl_dev_info dev_info;
4918 pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4919 void __user *argp = (void __user *)(unsigned long)header->addr;
4920
4921 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4922 return -EINVAL;
4923
4924 memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4925 dev_info.ublksrv_pid = -1;
4926
4927 if (init_ublksrv_tgid > 0) {
4928 rcu_read_lock();
4929 pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4930 p = pid_task(pid, PIDTYPE_TGID);
4931 if (p) {
4932 int vnr = task_tgid_vnr(p);
4933
4934 if (vnr)
4935 dev_info.ublksrv_pid = vnr;
4936 }
4937 rcu_read_unlock();
4938 }
4939
4940 if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4941 return -EFAULT;
4942
4943 return 0;
4944 }
4945
4946 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4947 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4948 {
4949 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4950 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4951
4952 if (ub->ub_disk) {
4953 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4954 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4955 } else {
4956 ub->params.devt.disk_major = 0;
4957 ub->params.devt.disk_minor = 0;
4958 }
4959 ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4960 }
4961
4962 static int ublk_ctrl_get_params(struct ublk_device *ub,
4963 const struct ublksrv_ctrl_cmd *header)
4964 {
4965 void __user *argp = (void __user *)(unsigned long)header->addr;
4966 struct ublk_params_header ph;
4967 int ret;
4968
4969 if (header->len <= sizeof(ph) || !header->addr)
4970 return -EINVAL;
4971
4972 if (copy_from_user(&ph, argp, sizeof(ph)))
4973 return -EFAULT;
4974
4975 if (ph.len > header->len || !ph.len)
4976 return -EINVAL;
4977
4978 if (ph.len > sizeof(struct ublk_params))
4979 ph.len = sizeof(struct ublk_params);
4980
4981 mutex_lock(&ub->mutex);
4982 ublk_ctrl_fill_params_devt(ub);
4983 if (copy_to_user(argp, &ub->params, ph.len))
4984 ret = -EFAULT;
4985 else
4986 ret = 0;
4987 mutex_unlock(&ub->mutex);
4988
4989 return ret;
4990 }
4991
4992 static int ublk_ctrl_set_params(struct ublk_device *ub,
4993 const struct ublksrv_ctrl_cmd *header)
4994 {
4995 void __user *argp = (void __user *)(unsigned long)header->addr;
4996 struct ublk_params_header ph;
4997 int ret = -EFAULT;
4998
4999 if (header->len <= sizeof(ph) || !header->addr)
5000 return -EINVAL;
5001
5002 if (copy_from_user(&ph, argp, sizeof(ph)))
5003 return -EFAULT;
5004
5005 if (ph.len > header->len || !ph.len || !ph.types)
5006 return -EINVAL;
5007
5008 if (ph.len > sizeof(struct ublk_params))
5009 ph.len = sizeof(struct ublk_params);
5010
5011 mutex_lock(&ub->mutex);
5012 if (test_bit(UB_STATE_USED, &ub->state)) {
5013 /*
5014 		 * Parameters can only be changed when the device hasn't
5015 * been started yet
5016 */
5017 ret = -EACCES;
5018 } else if (copy_from_user(&ub->params, argp, ph.len)) {
5019 /* zero out partial copy so no stale params survive */
5020 memset(&ub->params, 0, sizeof(ub->params));
5021 ret = -EFAULT;
5022 } else {
5023 /* clear all we don't support yet */
5024 ub->params.types &= UBLK_PARAM_TYPE_ALL;
5025 ret = ublk_validate_params(ub);
5026 if (ret)
5027 memset(&ub->params, 0, sizeof(ub->params));
5028 }
5029 mutex_unlock(&ub->mutex);
5030
5031 return ret;
5032 }
5033
5034 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
5035 {
5036 int ret = -EINVAL;
5037
5038 mutex_lock(&ub->mutex);
5039 if (ublk_nosrv_should_stop_dev(ub))
5040 goto out_unlock;
5041 /*
5042 	 * START_RECOVERY is only allowed after:
5043 *
5044 	 * (1) UB_STATE_OPEN is not set, which means the dying process has exited
5045 	 * and the related io_uring ctx is freed, so the file struct of /dev/ublkcX is
5046 	 * released.
5047 *
5048 * and one of the following holds
5049 *
5050 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
5051 	 *     (a) has quiesced the request queue
5052 	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
5053 	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
5054 	 *     (d) has completed/canceled all ioucmds owned by the dying process
5055 *
5056 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
5057 * quiesced, but all I/O is being immediately errored
5058 */
5059 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
5060 ret = -EBUSY;
5061 goto out_unlock;
5062 }
5063 pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
5064 init_completion(&ub->completion);
5065 ret = 0;
5066 out_unlock:
5067 mutex_unlock(&ub->mutex);
5068 return ret;
5069 }
5070
5071 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
5072 const struct ublksrv_ctrl_cmd *header)
5073 {
5074 int ublksrv_pid = (int)header->data[0];
5075 int ret = -EINVAL;
5076
5077 pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
5078 header->dev_id);
5079
5080 if (wait_for_completion_interruptible(&ub->completion))
5081 return -EINTR;
5082
5083 pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
5084 header->dev_id);
5085
5086 if (!ublk_validate_user_pid(ub, ublksrv_pid))
5087 return -EINVAL;
5088
5089 mutex_lock(&ub->mutex);
5090 if (ublk_nosrv_should_stop_dev(ub))
5091 goto out_unlock;
5092
5093 if (!ublk_dev_in_recoverable_state(ub)) {
5094 ret = -EBUSY;
5095 goto out_unlock;
5096 }
5097 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
5098 ub->dev_info.state = UBLK_S_DEV_LIVE;
5099 pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
5100 __func__, ublksrv_pid, header->dev_id);
5101 blk_mq_kick_requeue_list(ub->ub_disk->queue);
5102 ret = 0;
5103 out_unlock:
5104 mutex_unlock(&ub->mutex);
5105 return ret;
5106 }
5107
5108 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5109 {
5110 void __user *argp = (void __user *)(unsigned long)header->addr;
5111 u64 features = UBLK_F_ALL;
5112
5113 if (header->len != UBLK_FEATURES_LEN || !header->addr)
5114 return -EINVAL;
5115
5116 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5117 return -EFAULT;
5118
5119 return 0;
5120 }
5121
5122 static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5123 {
5124 struct ublk_param_basic *p = &ub->params.basic;
5125 u64 new_size = header->data[0];
5126 int ret = 0;
5127
5128 mutex_lock(&ub->mutex);
5129 if (!ub->ub_disk) {
5130 ret = -ENODEV;
5131 goto out;
5132 }
5133 p->dev_sectors = new_size;
5134 set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5135 out:
5136 mutex_unlock(&ub->mutex);
5137 return ret;
5138 }
5139
5140 struct count_busy {
5141 const struct ublk_queue *ubq;
5142 unsigned int nr_busy;
5143 };
5144
5145 static bool ublk_count_busy_req(struct request *rq, void *data)
5146 {
5147 struct count_busy *idle = data;
5148
5149 if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5150 idle->nr_busy += 1;
5151 return true;
5152 }
5153
5154 /* uring_cmd is guaranteed to be active if the associated request is idle */
5155 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5156 {
5157 struct count_busy data = {
5158 .ubq = ubq,
5159 };
5160
5161 blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5162 return data.nr_busy < ubq->q_depth;
5163 }
5164
5165 /* Wait until each hw queue has at least one idle IO */
5166 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5167 unsigned int timeout_ms)
5168 {
5169 unsigned int elapsed = 0;
5170 int ret;
5171
5172 /*
5173 	 * For UBLK_F_BATCH_IO the ublk server can be notified via an existing
5174 	 * or new fetch command, so there is no need to wait here
5175 */
5176 if (ublk_dev_support_batch_io(ub))
5177 return 0;
5178
5179 while (elapsed < timeout_ms && !signal_pending(current)) {
5180 unsigned int queues_cancelable = 0;
5181 int i;
5182
5183 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5184 struct ublk_queue *ubq = ublk_get_queue(ub, i);
5185
5186 queues_cancelable += !!ubq_has_idle_io(ubq);
5187 }
5188
5189 /*
5190 * Each queue needs at least one active command for
5191 		 * notifying the ublk server
5192 */
5193 if (queues_cancelable == ub->dev_info.nr_hw_queues)
5194 break;
5195
5196 msleep(UBLK_REQUEUE_DELAY_MS);
5197 elapsed += UBLK_REQUEUE_DELAY_MS;
5198 }
5199
5200 if (signal_pending(current))
5201 ret = -EINTR;
5202 else if (elapsed >= timeout_ms)
5203 ret = -EBUSY;
5204 else
5205 ret = 0;
5206
5207 return ret;
5208 }
5209
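/*
 * QUIESCE_DEV: flip the device into the canceling state while the queue is
 * quiesced, wait (up to data[0] milliseconds, 0 meaning forever) until each
 * hw queue has an idle io to deliver the notification on, then cancel the
 * pending uring_cmds so the server sees the transition.
 */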
5210 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5211 const struct ublksrv_ctrl_cmd *header)
5212 {
5213 /* zero means wait forever */
5214 u64 timeout_ms = header->data[0];
5215 struct gendisk *disk;
5216 int ret = -ENODEV;
5217
5218 if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5219 return -EOPNOTSUPP;
5220
5221 mutex_lock(&ub->mutex);
5222 disk = ublk_get_disk(ub);
5223 if (!disk)
5224 goto unlock;
5225 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5226 goto put_disk;
5227
5228 ret = 0;
5229 /* already in expected state */
5230 if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5231 goto put_disk;
5232
5233 /* Mark the device as canceling */
5234 mutex_lock(&ub->cancel_mutex);
5235 blk_mq_quiesce_queue(disk->queue);
5236 ublk_set_canceling(ub, true);
5237 blk_mq_unquiesce_queue(disk->queue);
5238 mutex_unlock(&ub->cancel_mutex);
5239
5240 if (!timeout_ms)
5241 timeout_ms = UINT_MAX;
5242 ret = ublk_wait_for_idle_io(ub, timeout_ms);
5243
5244 put_disk:
5245 ublk_put_disk(disk);
5246 unlock:
5247 mutex_unlock(&ub->mutex);
5248
5249 /* Cancel pending uring_cmd */
5250 if (!ret)
5251 ublk_cancel_dev(ub);
5252 return ret;
5253 }
5254
5255 /*
5256 * All control commands are sent via /dev/ublk-control, so we have to check
5257 * the destination device's permission
5258 */
5259 static int ublk_char_dev_permission(struct ublk_device *ub,
5260 const char *dev_path, int mask)
5261 {
5262 int err;
5263 struct path path;
5264 struct kstat stat;
5265
5266 err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5267 if (err)
5268 return err;
5269
5270 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5271 if (err)
5272 goto exit;
5273
5274 err = -EPERM;
5275 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5276 goto exit;
5277
5278 err = inode_permission(&nop_mnt_idmap,
5279 d_backing_inode(path.dentry), mask);
5280 exit:
5281 path_put(&path);
5282 return err;
5283 }
5284
5285 /*
5286 * Lock for maple tree modification: acquire ub->mutex, then freeze queue
5287 * if device is started. If device is not yet started, only mutex is
5288 * needed since no I/O path can access the tree.
5289 *
5290 * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5291 * already holds ub->mutex when calling del_gendisk() which freezes the queue.
5292 */
5293 static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
5294 {
5295 unsigned int memflags = 0;
5296
5297 mutex_lock(&ub->mutex);
5298 if (ub->ub_disk)
5299 memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
5300
5301 return memflags;
5302 }
5303
5304 static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
5305 {
5306 if (ub->ub_disk)
5307 blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
5308 mutex_unlock(&ub->mutex);
5309 }
5310
5311 /* Erase all coalesced PFN ranges matching buf_index from the maple tree */
5312 static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
5313 {
5314 MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5315 struct ublk_buf_range *range;
5316
5317 mas_lock(&mas);
5318 mas_for_each(&mas, range, ULONG_MAX) {
5319 if (range->buf_index == buf_index) {
5320 mas_erase(&mas);
5321 kfree(range);
5322 }
5323 }
5324 mas_unlock(&mas);
5325 }
5326
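/*
 * Coalesce the pinned pages into runs of consecutive PFNs and insert one
 * ublk_buf_range per run into the maple tree. On failure, tear down any
 * ranges already inserted for this buffer index.
 */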
5327 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
5328 struct page **pages, unsigned long nr_pages,
5329 int index, unsigned short flags)
5330 {
5331 unsigned long i;
5332 int ret;
5333
5334 for (i = 0; i < nr_pages; i++) {
5335 unsigned long pfn = page_to_pfn(pages[i]);
5336 unsigned long start = i;
5337 struct ublk_buf_range *range;
5338
5339 /* Find run of consecutive PFNs */
5340 while (i + 1 < nr_pages &&
5341 page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
5342 i++;
5343
5344 range = kzalloc(sizeof(*range), GFP_KERNEL);
5345 if (!range) {
5346 ret = -ENOMEM;
5347 goto unwind;
5348 }
5349 range->buf_index = index;
5350 range->flags = flags;
5351 range->base_offset = start << PAGE_SHIFT;
5352
5353 ret = mtree_insert_range(&ub->buf_tree, pfn,
5354 pfn + (i - start),
5355 range, GFP_KERNEL);
5356 if (ret) {
5357 kfree(range);
5358 goto unwind;
5359 }
5360 }
5361 return 0;
5362
5363 unwind:
5364 ublk_buf_erase_ranges(ub, index);
5365 return ret;
5366 }
5367
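/*
 * Illustrative sketch of the userspace side of buffer registration (not
 * part of this driver; assumes liburing with an SQE128 ring, the uapi
 * ublk_cmd.h definitions and an already-opened /dev/ublk-control fd;
 * error handling omitted):
 *
 *	struct ublk_shmem_buf_reg reg = {
 *		.addr	= (__u64)(uintptr_t)shm_base,	// page aligned
 *		.len	= shm_len,			// page aligned
 *		.flags	= 0,
 *	};
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	struct ublksrv_ctrl_cmd *hdr = io_uring_sqe_cmd(sqe);
 *
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd = ctrl_fd;
 *	sqe->cmd_op = UBLK_U_CMD_REG_BUF;
 *	hdr->dev_id = dev_id;
 *	hdr->addr = (__u64)(uintptr_t)&reg;
 *	hdr->len = sizeof(reg);
 *	io_uring_submit(&ring);
 *	// the CQE result, if >= 0, is the registered buffer index
 */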
5368 /*
5369 * Register a shared memory buffer for zero-copy I/O.
5370 * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
5371 * internally. Returns buffer index (>= 0) on success.
5372 */
5373 static int ublk_ctrl_reg_buf(struct ublk_device *ub,
5374 struct ublksrv_ctrl_cmd *header)
5375 {
5376 void __user *argp = (void __user *)(unsigned long)header->addr;
5377 struct ublk_shmem_buf_reg buf_reg;
5378 unsigned long nr_pages;
5379 struct page **pages = NULL;
5380 unsigned int gup_flags;
5381 unsigned int memflags;
5382 long pinned;
5383 int index;
5384 int ret;
5385
5386 if (!ublk_dev_support_shmem_zc(ub))
5387 return -EOPNOTSUPP;
5388
5389 memset(&buf_reg, 0, sizeof(buf_reg));
5390 if (copy_from_user(&buf_reg, argp,
5391 min_t(size_t, header->len, sizeof(buf_reg))))
5392 return -EFAULT;
5393
5394 if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
5395 return -EINVAL;
5396
5397 if (buf_reg.reserved)
5398 return -EINVAL;
5399
5400 if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
5401 !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
5402 return -EINVAL;
5403
5404 nr_pages = buf_reg.len >> PAGE_SHIFT;
5405
5406 /* Pin pages before any locks (may sleep) */
5407 pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
5408 if (!pages)
5409 return -ENOMEM;
5410
5411 gup_flags = FOLL_LONGTERM;
5412 if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
5413 gup_flags |= FOLL_WRITE;
5414
5415 pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
5416 if (pinned < 0) {
5417 ret = pinned;
5418 goto err_free_pages;
5419 }
5420 if (pinned != nr_pages) {
5421 ret = -EFAULT;
5422 goto err_unpin;
5423 }
5424
5425 memflags = ublk_lock_buf_tree(ub);
5426
5427 index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
5428 if (index < 0) {
5429 ret = index;
5430 goto err_unlock;
5431 }
5432
5433 ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
5434 if (ret) {
5435 ida_free(&ub->buf_ida, index);
5436 goto err_unlock;
5437 }
5438
5439 ublk_unlock_buf_tree(ub, memflags);
5440 kvfree(pages);
5441 return index;
5442
5443 err_unlock:
5444 ublk_unlock_buf_tree(ub, memflags);
5445 err_unpin:
5446 unpin_user_pages(pages, pinned);
5447 err_free_pages:
5448 kvfree(pages);
5449 return ret;
5450 }
5451
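/* Unpin a physically contiguous run of pages in small fixed-size batches */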
5452 static void ublk_unpin_range_pages(unsigned long base_pfn,
5453 unsigned long nr_pages)
5454 {
5455 #define UBLK_UNPIN_BATCH 32
5456 struct page *pages[UBLK_UNPIN_BATCH];
5457 unsigned long off;
5458
5459 for (off = 0; off < nr_pages; ) {
5460 unsigned int batch = min_t(unsigned long,
5461 nr_pages - off, UBLK_UNPIN_BATCH);
5462 unsigned int j;
5463
5464 for (j = 0; j < batch; j++)
5465 pages[j] = pfn_to_page(base_pfn + off + j);
5466 unpin_user_pages(pages, batch);
5467 off += batch;
5468 }
5469 }
5470
5471 /*
5472 * Inner loop: erase and free up to UBLK_REMOVE_BATCH matching ranges under
5473 * mas_lock, recording their PFN extents in an xarray. Then drop the lock
5474 * and unpin the collected pages outside spinlock context.
5475 *
5476 * Returns true if the tree walk completed, false if more ranges remain.
5477 * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value().
5478 */
5479 #define UBLK_REMOVE_BATCH 64
5480
5481 static bool __ublk_shmem_remove_ranges(struct ublk_device *ub,
5482 int buf_index, int *ret)
5483 {
5484 MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5485 struct ublk_buf_range *range;
5486 struct xarray to_unpin;
5487 unsigned long idx;
5488 unsigned int count = 0;
5489 bool done = false;
5490 void *entry;
5491
5492 xa_init(&to_unpin);
5493
5494 mas_lock(&mas);
5495 mas_for_each(&mas, range, ULONG_MAX) {
5496 unsigned long nr;
5497
5498 if (buf_index >= 0 && range->buf_index != buf_index)
5499 continue;
5500
5501 *ret = 0;
5502 nr = mas.last - mas.index + 1;
5503 if (xa_err(xa_store(&to_unpin, mas.index,
5504 xa_mk_value(nr), GFP_ATOMIC)))
5505 goto unlock;
5506 mas_erase(&mas);
5507 kfree(range);
5508 if (++count >= UBLK_REMOVE_BATCH)
5509 goto unlock;
5510 }
5511 done = true;
5512 unlock:
5513 mas_unlock(&mas);
5514
5515 xa_for_each(&to_unpin, idx, entry)
5516 ublk_unpin_range_pages(idx, xa_to_value(entry));
5517 xa_destroy(&to_unpin);
5518
5519 return done;
5520 }
5521
5522 /*
5523 * Remove ranges from the maple tree matching buf_index, unpin pages
5524 * and free range structs. If buf_index < 0, remove all ranges.
5525 * Processes ranges in batches to avoid holding the maple tree spinlock
5526 * across potentially expensive page unpinning.
5527 */
5528 static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index)
5529 {
5530 int ret = -ENOENT;
5531
5532 while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret))
5533 cond_resched();
5534 return ret;
5535 }
5536
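/*
 * Unregister one shared memory buffer: remove its PFN ranges from the
 * maple tree, unpin the backing pages and release the buffer index.
 */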
5537 static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
5538 struct ublksrv_ctrl_cmd *header)
5539 {
5540 int index = (int)header->data[0];
5541 unsigned int memflags;
5542 int ret;
5543
5544 if (!ublk_dev_support_shmem_zc(ub))
5545 return -EOPNOTSUPP;
5546
5547 if (index < 0 || index > USHRT_MAX)
5548 return -EINVAL;
5549
5550 memflags = ublk_lock_buf_tree(ub);
5551
5552 ret = ublk_shmem_remove_ranges(ub, index);
5553 if (!ret)
5554 ida_free(&ub->buf_ida, index);
5555
5556 ublk_unlock_buf_tree(ub, memflags);
5557 return ret;
5558 }
5559
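/* Final cleanup: drop every registered buffer, then destroy the tree and IDA */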
5560 static void ublk_buf_cleanup(struct ublk_device *ub)
5561 {
5562 ublk_shmem_remove_ranges(ub, -1);
5563 mtree_destroy(&ub->buf_tree);
5564 ida_destroy(&ub->buf_ida);
5565 }
5566
5567 /* Check whether all request pages map contiguously into a single registered shared memory buffer; return its index and start offset */
5568 static bool ublk_try_buf_match(struct ublk_device *ub,
5569 struct request *rq,
5570 u32 *buf_idx, u32 *buf_off)
5571 {
5572 struct req_iterator iter;
5573 struct bio_vec bv;
5574 int index = -1;
5575 unsigned long expected_offset = 0;
5576 bool first = true;
5577
5578 rq_for_each_bvec(bv, rq, iter) {
5579 unsigned long pfn = page_to_pfn(bv.bv_page);
5580 unsigned long end_pfn = pfn +
5581 ((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
5582 struct ublk_buf_range *range;
5583 unsigned long off;
5584 MA_STATE(mas, &ub->buf_tree, pfn, pfn);
5585
5586 range = mas_walk(&mas);
5587 if (!range)
5588 return false;
5589
5590 /* verify all pages in this bvec fall within the range */
5591 if (end_pfn > mas.last)
5592 return false;
5593
5594 off = range->base_offset +
5595 (pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
5596
5597 if (first) {
5598 /* Read-only buffer can't serve READ (kernel writes) */
5599 if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
5600 req_op(rq) != REQ_OP_WRITE)
5601 return false;
5602 index = range->buf_index;
5603 expected_offset = off;
5604 *buf_off = off;
5605 first = false;
5606 } else {
5607 if (range->buf_index != index)
5608 return false;
5609 if (off != expected_offset)
5610 return false;
5611 }
5612 expected_offset += bv.bv_len;
5613 }
5614
5615 if (first)
5616 return false;
5617
5618 *buf_idx = index;
5619 return true;
5620 }
5621
5622 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5623 u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5624 {
5625 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5626 void __user *argp = (void __user *)(unsigned long)header->addr;
5627 char *dev_path = NULL;
5628 int ret = 0;
5629 int mask;
5630
5631 if (!unprivileged) {
5632 if (!capable(CAP_SYS_ADMIN))
5633 return -EPERM;
5634 /*
5635 		 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
5636 		 * char_dev_path in its payload too, since userspace may not
5637 		 * know whether the specified device was created in
5638 		 * unprivileged mode.
5639 */
5640 if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5641 return 0;
5642 }
5643
5644 /*
5645 	 * The user has to provide the char device path for unprivileged ublk
5646 	 *
5647 	 * header->addr always points to the dev path buffer, and
5648 	 * header->dev_path_len records the length of that buffer.
5649 */
5650 if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5651 return -EINVAL;
5652
5653 if (header->len < header->dev_path_len)
5654 return -EINVAL;
5655
5656 dev_path = memdup_user_nul(argp, header->dev_path_len);
5657 if (IS_ERR(dev_path))
5658 return PTR_ERR(dev_path);
5659
5660 ret = -EINVAL;
5661 switch (_IOC_NR(cmd_op)) {
5662 case UBLK_CMD_GET_DEV_INFO:
5663 case UBLK_CMD_GET_DEV_INFO2:
5664 case UBLK_CMD_GET_QUEUE_AFFINITY:
5665 case UBLK_CMD_GET_PARAMS:
5666 case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5667 mask = MAY_READ;
5668 break;
5669 case UBLK_CMD_START_DEV:
5670 case UBLK_CMD_STOP_DEV:
5671 case UBLK_CMD_ADD_DEV:
5672 case UBLK_CMD_DEL_DEV:
5673 case UBLK_CMD_SET_PARAMS:
5674 case UBLK_CMD_START_USER_RECOVERY:
5675 case UBLK_CMD_END_USER_RECOVERY:
5676 case UBLK_CMD_UPDATE_SIZE:
5677 case UBLK_CMD_QUIESCE_DEV:
5678 case UBLK_CMD_TRY_STOP_DEV:
5679 case UBLK_CMD_REG_BUF:
5680 case UBLK_CMD_UNREG_BUF:
5681 mask = MAY_READ | MAY_WRITE;
5682 break;
5683 default:
5684 goto exit;
5685 }
5686
5687 ret = ublk_char_dev_permission(ub, dev_path, mask);
5688 if (!ret) {
5689 header->len -= header->dev_path_len;
5690 header->addr += header->dev_path_len;
5691 }
5692 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5693 __func__, ub->ub_number, cmd_op,
5694 ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5695 dev_path, ret);
5696 exit:
5697 kfree(dev_path);
5698 return ret;
5699 }
5700
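/*
 * Only quick informational queries (GET_*) are allowed in
 * IO_URING_F_NONBLOCK context; every other control command may sleep and
 * is bounced with -EAGAIN so io_uring reissues it from a sleepable context.
 */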
5701 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5702 {
5703 switch (_IOC_NR(cmd_op)) {
5704 case UBLK_CMD_GET_QUEUE_AFFINITY:
5705 case UBLK_CMD_GET_DEV_INFO:
5706 case UBLK_CMD_GET_DEV_INFO2:
5707 case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5708 return false;
5709 default:
5710 return true;
5711 }
5712 }
5713
5714 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5715 unsigned int issue_flags)
5716 {
5717 /* May point to userspace-mapped memory */
5718 const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5719 struct ublksrv_ctrl_cmd);
5720 struct ublksrv_ctrl_cmd header;
5721 struct ublk_device *ub = NULL;
5722 u32 cmd_op = cmd->cmd_op;
5723 int ret = -EINVAL;
5724
5725 if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5726 issue_flags & IO_URING_F_NONBLOCK)
5727 return -EAGAIN;
5728
5729 if (!(issue_flags & IO_URING_F_SQE128))
5730 return -EINVAL;
5731
5732 header.dev_id = READ_ONCE(ub_src->dev_id);
5733 header.queue_id = READ_ONCE(ub_src->queue_id);
5734 header.len = READ_ONCE(ub_src->len);
5735 header.addr = READ_ONCE(ub_src->addr);
5736 header.data[0] = READ_ONCE(ub_src->data[0]);
5737 header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5738 ublk_ctrl_cmd_dump(cmd_op, &header);
5739
5740 ret = ublk_check_cmd_op(cmd_op);
5741 if (ret)
5742 goto out;
5743
5744 if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5745 ret = ublk_ctrl_get_features(&header);
5746 goto out;
5747 }
5748
5749 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5750 ret = -ENODEV;
5751 ub = ublk_get_device_from_id(header.dev_id);
5752 if (!ub)
5753 goto out;
5754
5755 ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5756 if (ret)
5757 goto put_dev;
5758 }
5759
5760 switch (_IOC_NR(cmd_op)) {
5761 case UBLK_CMD_START_DEV:
5762 ret = ublk_ctrl_start_dev(ub, &header);
5763 break;
5764 case UBLK_CMD_STOP_DEV:
5765 ublk_ctrl_stop_dev(ub);
5766 ret = 0;
5767 break;
5768 case UBLK_CMD_GET_DEV_INFO:
5769 case UBLK_CMD_GET_DEV_INFO2:
5770 ret = ublk_ctrl_get_dev_info(ub, &header);
5771 break;
5772 case UBLK_CMD_ADD_DEV:
5773 ret = ublk_ctrl_add_dev(&header);
5774 break;
5775 case UBLK_CMD_DEL_DEV:
5776 ret = ublk_ctrl_del_dev(&ub, true);
5777 break;
5778 case UBLK_CMD_DEL_DEV_ASYNC:
5779 ret = ublk_ctrl_del_dev(&ub, false);
5780 break;
5781 case UBLK_CMD_GET_QUEUE_AFFINITY:
5782 ret = ublk_ctrl_get_queue_affinity(ub, &header);
5783 break;
5784 case UBLK_CMD_GET_PARAMS:
5785 ret = ublk_ctrl_get_params(ub, &header);
5786 break;
5787 case UBLK_CMD_SET_PARAMS:
5788 ret = ublk_ctrl_set_params(ub, &header);
5789 break;
5790 case UBLK_CMD_START_USER_RECOVERY:
5791 ret = ublk_ctrl_start_recovery(ub);
5792 break;
5793 case UBLK_CMD_END_USER_RECOVERY:
5794 ret = ublk_ctrl_end_recovery(ub, &header);
5795 break;
5796 case UBLK_CMD_UPDATE_SIZE:
5797 ret = ublk_ctrl_set_size(ub, &header);
5798 break;
5799 case UBLK_CMD_QUIESCE_DEV:
5800 ret = ublk_ctrl_quiesce_dev(ub, &header);
5801 break;
5802 case UBLK_CMD_TRY_STOP_DEV:
5803 ret = ublk_ctrl_try_stop_dev(ub);
5804 break;
5805 case UBLK_CMD_REG_BUF:
5806 ret = ublk_ctrl_reg_buf(ub, &header);
5807 break;
5808 case UBLK_CMD_UNREG_BUF:
5809 ret = ublk_ctrl_unreg_buf(ub, &header);
5810 break;
5811 default:
5812 ret = -EOPNOTSUPP;
5813 break;
5814 }
5815
5816 put_dev:
5817 if (ub)
5818 ublk_put_device(ub);
5819 out:
5820 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5821 __func__, ret, cmd_op, header.dev_id, header.queue_id);
5822 return ret;
5823 }
5824
5825 static const struct file_operations ublk_ctl_fops = {
5826 .open = nonseekable_open,
5827 .uring_cmd = ublk_ctrl_uring_cmd,
5828 .owner = THIS_MODULE,
5829 .llseek = noop_llseek,
5830 };
5831
5832 static struct miscdevice ublk_misc = {
5833 .minor = MISC_DYNAMIC_MINOR,
5834 .name = "ublk-control",
5835 .fops = &ublk_ctl_fops,
5836 };
5837
5838 static int __init ublk_init(void)
5839 {
5840 int ret;
5841
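	/*
	 * The char device IO buffer window must not wrap around when
	 * UBLKSRV_IO_BUF_TOTAL_SIZE is added to UBLKSRV_IO_BUF_OFFSET
	 */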
5842 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5843 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5844 /*
5845 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5846 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5847 */
5848 BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5849 UBLKSRV_IO_INTEGRITY_FLAG);
5850 BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5851
5852 init_waitqueue_head(&ublk_idr_wq);
5853
5854 ret = misc_register(&ublk_misc);
5855 if (ret)
5856 return ret;
5857
5858 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5859 if (ret)
5860 goto unregister_mis;
5861
5862 ret = class_register(&ublk_chr_class);
5863 if (ret)
5864 goto free_chrdev_region;
5865
5866 return 0;
5867
5868 free_chrdev_region:
5869 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5870 unregister_mis:
5871 misc_deregister(&ublk_misc);
5872 return ret;
5873 }
5874
5875 static void __exit ublk_exit(void)
5876 {
5877 struct ublk_device *ub;
5878 int id;
5879
5880 idr_for_each_entry(&ublk_index_idr, ub, id)
5881 ublk_remove(ub);
5882
5883 class_unregister(&ublk_chr_class);
5884 misc_deregister(&ublk_misc);
5885
5886 idr_destroy(&ublk_index_idr);
5887 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5888 }
5889
5890 module_init(ublk_init);
5891 module_exit(ublk_exit);
5892
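/* Clamp writes to the ublks_max module parameter to [0, UBLK_MAX_UBLKS] */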
5893 static int ublk_set_max_unprivileged_ublks(const char *buf,
5894 const struct kernel_param *kp)
5895 {
5896 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5897 }
5898
5899 static int ublk_get_max_unprivileged_ublks(char *buf,
5900 const struct kernel_param *kp)
5901 {
5902 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5903 }
5904
5905 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5906 .set = ublk_set_max_unprivileged_ublks,
5907 .get = ublk_get_max_unprivileged_ublks,
5908 };
5909
5910 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5911 &unprivileged_ublks_max, 0644);
5912 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to be added (default: 64)");
5913
5914 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5915 MODULE_DESCRIPTION("Userspace block device");
5916 MODULE_LICENSE("GPL");
5917