// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - block device whose I/O is handled from userspace
 *
 * Takes full advantage of io_uring passthrough commands for communicating
 * with the ublk userspace daemon (ublksrvd) to handle basic I/O requests.
 *
 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
 *
 * (part of code stolen from loop.c)
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
#include <linux/io_uring/cmd.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <linux/task_work.h>
#include <linux/namei.h>
#include <linux/kref.h>
#include <linux/kfifo.h>
#include <linux/blk-integrity.h>
#include <linux/maple_tree.h>
#include <linux/xarray.h>
#include <uapi/linux/fs.h>
#include <uapi/linux/ublk_cmd.h>

#define UBLK_MINORS		(1U << MINORBITS)

#define UBLK_INVALID_BUF_IDX	((u16)-1)

/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
#define UBLK_CMD_REG_BUF	_IOC_NR(UBLK_U_CMD_REG_BUF)
#define UBLK_CMD_UNREG_BUF	_IOC_NR(UBLK_U_CMD_UNREG_BUF)

/* Default max shmem buffer size: 4GB (may be increased in future) */
#define UBLK_SHMEM_BUF_SIZE_MAX	(1ULL << 32)

#define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)

/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
		| UBLK_F_URING_CMD_COMP_IN_TASK \
		| UBLK_F_NEED_GET_DATA \
		| UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED \
		| UBLK_F_USER_RECOVERY_FAIL_IO \
		| UBLK_F_UPDATE_SIZE \
		| UBLK_F_AUTO_BUF_REG \
		| UBLK_F_QUIESCE \
		| UBLK_F_PER_IO_DAEMON \
		| UBLK_F_BUF_REG_OFF_DAEMON \
		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
		| UBLK_F_SAFE_STOP_DEV \
		| UBLK_F_BATCH_IO \
		| UBLK_F_NO_AUTO_PART_SCAN \
		| UBLK_F_SHMEM_ZC)

#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_USER_RECOVERY_FAIL_IO)

/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
	 UBLK_PARAM_TYPE_INTEGRITY)

#define UBLK_BATCH_F_ALL \
	(UBLK_BATCH_F_HAS_ZONE_LBA | \
	 UBLK_BATCH_F_HAS_BUF_ADDR | \
	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)

/* ublk batch fetch uring_cmd */
struct ublk_batch_fetch_cmd {
	struct list_head node;
	struct io_uring_cmd *cmd;
	unsigned short buf_group;
};
116
117 struct ublk_uring_cmd_pdu {
118 /*
119 * Store requests in same batch temporarily for queuing them to
120 * daemon context.
121 *
122 * It should have been stored to request payload, but we do want
123 * to avoid extra pre-allocation, and uring_cmd payload is always
124 * free for us
125 */
126 union {
127 struct request *req;
128 struct request *req_list;
129 };
130
131 /*
132 * The following two are valid in this cmd whole lifetime, and
133 * setup in ublk uring_cmd handler
134 */
135 struct ublk_queue *ubq;
136
137 union {
138 u16 tag;
139 struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
140 };
141 };
142
143 struct ublk_batch_io_data {
144 struct ublk_device *ub;
145 struct io_uring_cmd *cmd;
146 struct ublk_batch_io header;
147 unsigned int issue_flags;
148 struct io_comp_batch *iob;
149 };
150
151 /*
152 * io command is active: sqe cmd is received, and its cqe isn't done
153 *
154 * If the flag is set, the io command is owned by ublk driver, and waited
155 * for incoming blk-mq request from the ublk block device.
156 *
157 * If the flag is cleared, the io command will be completed, and owned by
158 * ublk server.
159 */
160 #define UBLK_IO_FLAG_ACTIVE 0x01
161
162 /*
163 * IO command is completed via cqe, and it is being handled by ublksrv, and
164 * not committed yet
165 *
166 * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
167 * cross verification
168 */
169 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
170
171 /*
172 * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
173 * get data buffer address from ublksrv.
174 *
175 * Then, bio data could be copied into this data buffer for a WRITE request
176 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
177 */
178 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
179
180 /*
181 * request buffer is registered automatically, so we have to unregister it
182 * before completing this request.
183 *
184 * io_uring will unregister buffer automatically for us during exiting.
185 */
186 #define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
187
188 /* atomic RW with ubq->cancel_lock */
189 #define UBLK_IO_FLAG_CANCELED 0x80000000
190
191 /*
192 * Initialize refcount to a large number to include any registered buffers.
193 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
194 * any buffers registered on the io daemon task.
195 */
196 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
197
198 /* used for UBLK_F_BATCH_IO only */
199 #define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1)
200
201 union ublk_io_buf {
202 __u64 addr;
203 struct ublk_auto_buf_reg auto_reg;
204 };
205
206 struct ublk_io {
207 union ublk_io_buf buf;
208 unsigned int flags;
209 int res;
210
211 union {
212 /* valid if UBLK_IO_FLAG_ACTIVE is set */
213 struct io_uring_cmd *cmd;
214 /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
215 struct request *req;
216 };
217
218 struct task_struct *task;
219
220 /*
221 * The number of uses of this I/O by the ublk server
222 * if user copy or zero copy are enabled:
223 * - UBLK_REFCOUNT_INIT from dispatch to the server
224 * until UBLK_IO_COMMIT_AND_FETCH_REQ
225 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
226 * - 1 for each io_uring registered buffer not registered on task
227 * The I/O can only be completed once all references are dropped.
228 * User copy and buffer registration operations are only permitted
229 * if the reference count is nonzero.
230 */
231 refcount_t ref;
232 /* Count of buffers registered on task and not yet unregistered */
233 unsigned task_registered_buffers;
234
235 void *buf_ctx_handle;
236 spinlock_t lock;
237 } ____cacheline_aligned_in_smp;
238
239 struct ublk_queue {
240 int q_id;
241 int q_depth;
242
243 unsigned long flags;
244 struct ublksrv_io_desc *io_cmd_buf;
245
246 bool force_abort;
247 bool canceling;
248 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
249 spinlock_t cancel_lock;
250 struct ublk_device *dev;
251 u32 nr_io_ready;
252
253 /*
254 * For supporting UBLK_F_BATCH_IO only.
255 *
256 * Inflight ublk request tag is saved in this fifo
257 *
258 * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(),
259 * so lock is required for storing request tag to fifo
260 *
261 * Make sure just one reader for fetching request from task work
262 * function to ublk server, so no need to grab the lock in reader
263 * side.
264 *
265 * Batch I/O State Management:
266 *
267 * The batch I/O system uses implicit state management based on the
268 * combination of three key variables below.
269 *
270 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
271 * No fetch commands available, events queue in evts_fifo
272 *
273 * - READY: !list_empty(&fcmd_head) && !active_fcmd
274 * Fetch commands available but none processing events
275 *
276 * - ACTIVE: active_fcmd
277 * One fetch command actively processing events from evts_fifo
278 *
279 * Key Invariants:
280 * - At most one active_fcmd at any time (single reader)
281 * - active_fcmd is always from fcmd_head list when non-NULL
282 * - evts_fifo can be read locklessly by the single active reader
283 * - All state transitions require evts_lock protection
284 * - Multiple writers to evts_fifo require lock protection
285 */
286 struct {
287 DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
288 spinlock_t evts_lock;
289
290 /* List of fetch commands available to process events */
291 struct list_head fcmd_head;
292
293 /* Currently active fetch command (NULL = none active) */
294 struct ublk_batch_fetch_cmd *active_fcmd;
295 }____cacheline_aligned_in_smp;
296
297 struct ublk_io ios[] __counted_by(q_depth);
298 };
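
/*
 * A minimal sketch (hypothetical code, not a helper in this driver) of the
 * READY -> ACTIVE transition described above, i.e. how a dispatcher could
 * claim the single active fetch command under evts_lock:
 *
 *	spin_lock(&ubq->evts_lock);
 *	if (!ubq->active_fcmd && !list_empty(&ubq->fcmd_head)) {
 *		fcmd = list_first_entry(&ubq->fcmd_head,
 *					struct ublk_batch_fetch_cmd, node);
 *		WRITE_ONCE(ubq->active_fcmd, fcmd);
 *	}
 *	spin_unlock(&ubq->evts_lock);
 *
 * Once claimed, evts_fifo can be drained locklessly by that single reader
 * until __ublk_release_fcmd() clears ->active_fcmd again.
 */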

/* Maple tree value: maps a PFN range to buffer location */
struct ublk_buf_range {
	unsigned short buf_index;
	unsigned short flags;
	unsigned int base_offset; /* byte offset within buffer */
};

struct ublk_device {
	struct gendisk *ub_disk;

	struct ublksrv_ctrl_dev_info dev_info;

	struct blk_mq_tag_set tag_set;

	struct cdev cdev;
	struct device cdev_dev;

#define UB_STATE_OPEN		0
#define UB_STATE_USED		1
#define UB_STATE_DELETED	2
	unsigned long state;
	int ub_number;

	struct mutex mutex;

	spinlock_t lock;
	struct mm_struct *mm;

	struct ublk_params params;

	struct completion completion;
	u32 nr_queue_ready;
	bool unprivileged_daemons;
	struct mutex cancel_mutex;
	bool canceling;
	pid_t ublksrv_tgid;
	struct delayed_work exit_work;
	struct work_struct partition_scan_work;

	bool block_open; /* protected by open_mutex */

	/* shared memory zero copy */
	struct maple_tree buf_tree;
	struct ida buf_ida;

	struct ublk_queue *queues[];
};

/* header of ublk_params */
struct ublk_params_header {
	__u32 len;
	__u32 types;
};

static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
			       u32 *buf_idx, u32 *buf_off);
static void ublk_buf_cleanup(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		u16 q_id, u16 tag, struct ublk_io *io);
static inline unsigned int ublk_req_build_flags(struct request *req);
static void ublk_batch_dispatch(struct ublk_queue *ubq,
				const struct ublk_batch_io_data *data,
				struct ublk_batch_fetch_cmd *fcmd);

static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_BATCH_IO;
}

static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_BATCH_IO;
}

static inline void ublk_io_lock(struct ublk_io *io)
{
	spin_lock(&io->lock);
}

static inline void ublk_io_unlock(struct ublk_io *io)
{
	spin_unlock(&io->lock);
}

/* Initialize the event queue */
static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
				    int numa_node)
{
	spin_lock_init(&q->evts_lock);
	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
}

/* Check if the event queue is empty */
static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
{
	return kfifo_is_empty(&q->evts_fifo);
}

static inline void ublk_io_evts_deinit(struct ublk_queue *q)
{
	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
	kfifo_free(&q->evts_fifo);
}

static inline struct ublksrv_io_desc *
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
{
	return &ubq->io_cmd_buf[tag];
}

static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}

static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
}

static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_SHMEM_ZC;
}

static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
					unsigned int tag)
{
	return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
}

static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
}

static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_AUTO_BUF_REG;
}

static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
}

static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_USER_COPY;
}

static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_USER_COPY;
}

static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_ZONED;
}

static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_ZONED;
}

static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_INTEGRITY;
}

#ifdef CONFIG_BLK_DEV_ZONED

struct ublk_zoned_report_desc {
	__u64 sector;
	__u32 operation;
	__u32 nr_zones;
};

static DEFINE_XARRAY(ublk_zoned_report_descs);

static int ublk_zoned_insert_report_desc(const struct request *req,
		struct ublk_zoned_report_desc *desc)
{
	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
			 desc, GFP_KERNEL);
}

static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
		const struct request *req)
{
	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
}

static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
		const struct request *req)
{
	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
}

static int ublk_get_nr_zones(const struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	/* Zone size is a power of 2 */
	return p->dev_sectors >> ilog2(p->chunk_sectors);
}
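
/*
 * Worked example for ublk_get_nr_zones() above (hypothetical numbers):
 * dev_sectors = 1 << 20 (512 MiB) with chunk_sectors = 1 << 8 (128 KiB
 * zones) gives (1 << 20) >> ilog2(1 << 8) = 1 << 12 = 4096 zones.
 */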

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return blk_revalidate_disk_zones(ub->ub_disk);
}

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	const struct ublk_param_zoned *p = &ub->params.zoned;
	int nr_zones;

	if (!ublk_dev_is_zoned(ub))
		return -EINVAL;

	if (!p->max_zone_append_sectors)
		return -EINVAL;

	nr_zones = ublk_get_nr_zones(ub);

	if (p->max_active_zones > nr_zones)
		return -EINVAL;

	if (p->max_open_zones > nr_zones)
		return -EINVAL;

	return 0;
}

static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}

/* Based on virtblk_alloc_report_buffer */
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
				      unsigned int nr_zones, size_t *buflen)
{
	struct request_queue *q = ublk->ub_disk->queue;
	size_t bufsize;
	void *buf;

	nr_zones = min_t(unsigned int, nr_zones,
			 ublk->ub_disk->nr_zones);

	bufsize = nr_zones * sizeof(struct blk_zone);
	bufsize =
		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);

	while (bufsize >= sizeof(struct blk_zone)) {
		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
		if (buf) {
			*buflen = bufsize;
			return buf;
		}
		bufsize >>= 1;
	}

	*buflen = 0;
	return NULL;
}

static int ublk_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct ublk_device *ub = disk->private_data;
	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
	unsigned int done_zones = 0;
	unsigned int max_zones_per_request;
	int ret;
	struct blk_zone *buffer;
	size_t buffer_length;

	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
			 nr_zones);

	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
	if (!buffer)
		return -ENOMEM;

	max_zones_per_request = buffer_length / sizeof(struct blk_zone);

	while (done_zones < nr_zones) {
		unsigned int remaining_zones = nr_zones - done_zones;
		unsigned int zones_in_request =
			min_t(unsigned int, remaining_zones, max_zones_per_request);
		struct request *req;
		struct ublk_zoned_report_desc desc;
		blk_status_t status;

		memset(buffer, 0, buffer_length);

		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			goto out;
		}

		desc.operation = UBLK_IO_OP_REPORT_ZONES;
		desc.sector = sector;
		desc.nr_zones = zones_in_request;
		ret = ublk_zoned_insert_report_desc(req, &desc);
		if (ret)
			goto free_req;

		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
		if (ret)
			goto erase_desc;

		status = blk_execute_rq(req, 0);
		ret = blk_status_to_errno(status);
erase_desc:
		ublk_zoned_erase_report_desc(req);
free_req:
		blk_mq_free_request(req);
		if (ret)
			goto out;

		for (unsigned int i = 0; i < zones_in_request; i++) {
			struct blk_zone *zone = buffer + i;

			/* A zero length zone means no more zones in this response */
			if (!zone->len)
				break;

			ret = disk_report_zone(disk, zone, i, args);
			if (ret)
				goto out;

			done_zones++;
			sector += zone_size_sectors;
		}
	}

	ret = done_zones;

out:
	kvfree(buffer);
	return ret;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	struct ublk_zoned_report_desc *desc;
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_ZONE_OPEN:
		ublk_op = UBLK_IO_OP_ZONE_OPEN;
		break;
	case REQ_OP_ZONE_CLOSE:
		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
		break;
	case REQ_OP_ZONE_FINISH:
		ublk_op = UBLK_IO_OP_ZONE_FINISH;
		break;
	case REQ_OP_ZONE_RESET:
		ublk_op = UBLK_IO_OP_ZONE_RESET;
		break;
	case REQ_OP_ZONE_APPEND:
		ublk_op = UBLK_IO_OP_ZONE_APPEND;
		break;
	case REQ_OP_ZONE_RESET_ALL:
		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
		break;
	case REQ_OP_DRV_IN:
		desc = ublk_zoned_get_report_desc(req);
		if (!desc)
			return BLK_STS_IOERR;
		ublk_op = desc->operation;
		switch (ublk_op) {
		case UBLK_IO_OP_REPORT_ZONES:
			iod->op_flags = ublk_op | ublk_req_build_flags(req);
			iod->nr_zones = desc->nr_zones;
			iod->start_sector = desc->sector;
			return BLK_STS_OK;
		default:
			return BLK_STS_IOERR;
		}
	case REQ_OP_DRV_OUT:
		/* We do not support drv_out */
		return BLK_STS_NOTSUPP;
	default:
		return BLK_STS_IOERR;
	}

	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->buf.addr;

	return BLK_STS_OK;
}

#else

#define ublk_report_zones (NULL)

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	return BLK_STS_NOTSUPP;
}

#endif

static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
				      bool need_map, struct io_comp_batch *iob);

static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
	.name = "ublk-char",
};

static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

static DEFINE_MUTEX(ublk_ctl_mutex);
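
/*
 * Allocate a batch fetch command bound to @cmd. The buffer group is read
 * from the SQE's buf_index field; GFP_NOIO is used so the allocation
 * cannot recurse into I/O under memory pressure.
 */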
static struct ublk_batch_fetch_cmd *
ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
{
	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);

	if (fcmd) {
		fcmd->cmd = cmd;
		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
	}
	return fcmd;
}

static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
{
	kfree(fcmd);
}

static void __ublk_release_fcmd(struct ublk_queue *ubq)
{
	WRITE_ONCE(ubq->active_fcmd, NULL);
}

/*
 * Nothing can make progress, so clear ->active_fcmd; the caller should
 * stop dispatching.
 */
static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
					const struct ublk_batch_io_data *data,
					struct ublk_batch_fetch_cmd *fcmd,
					int res)
{
	spin_lock(&ubq->evts_lock);
	list_del_init(&fcmd->node);
	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
	__ublk_release_fcmd(ubq);
	spin_unlock(&ubq->evts_lock);

	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
	ublk_batch_free_fcmd(fcmd);
}

static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
				     struct io_br_sel *sel,
				     unsigned int issue_flags)
{
	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
		return -ENOBUFS;
	return 0;
}

static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
				       void __user *buf, const u16 *tag_buf,
				       unsigned int len)
{
	if (copy_to_user(buf, tag_buf, len))
		return -EFAULT;
	return len;
}

#define UBLK_MAX_UBLKS	UBLK_MINORS

/*
 * Max number of unprivileged ublk devices allowed to be added
 *
 * It can be extended to a per-user limit in the future, or even
 * controlled by cgroup.
 */
static unsigned int unprivileged_ublks_max = 64;
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */

static struct miscdevice ublk_misc;

static inline unsigned ublk_pos_to_hwq(loff_t pos)
{
	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
		UBLK_QID_BITS_MASK;
}

static inline unsigned ublk_pos_to_buf_off(loff_t pos)
{
	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
}

static inline unsigned ublk_pos_to_tag(loff_t pos)
{
	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
		UBLK_TAG_BITS_MASK;
}
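
/*
 * The three helpers above decode a ublk char-device offset into hardware
 * queue id, buffer offset and tag. A sketch of the inverse mapping,
 * assuming the UBLK_QID_OFF/UBLK_TAG_OFF bit layout from the uapi header
 * (the fields occupy disjoint bit ranges, so addition and OR are
 * equivalent here):
 *
 *	pos = UBLKSRV_IO_BUF_OFFSET +
 *	      ((__u64)q_id << UBLK_QID_OFF) +
 *	      ((__u64)tag << UBLK_TAG_OFF) + buf_off;
 */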

static void ublk_dev_param_basic_apply(struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	if (p->attrs & UBLK_ATTR_READ_ONLY)
		set_disk_ro(ub->ub_disk, true);

	set_capacity(ub->ub_disk, p->dev_sectors);
}

static int ublk_integrity_flags(u32 flags)
{
	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;

	if (flags & LBMD_PI_CAP_INTEGRITY) {
		flags &= ~LBMD_PI_CAP_INTEGRITY;
		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
	}
	if (flags & LBMD_PI_CAP_REFTAG) {
		flags &= ~LBMD_PI_CAP_REFTAG;
		ret_flags |= BLK_INTEGRITY_REF_TAG;
	}
	return flags ? -EINVAL : ret_flags;
}

static int ublk_integrity_pi_tuple_size(u8 csum_type)
{
	switch (csum_type) {
	case LBMD_PI_CSUM_NONE:
		return 0;
	case LBMD_PI_CSUM_IP:
	case LBMD_PI_CSUM_CRC16_T10DIF:
		return 8;
	case LBMD_PI_CSUM_CRC64_NVME:
		return 16;
	default:
		return -EINVAL;
	}
}

static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
{
	switch (csum_type) {
	case LBMD_PI_CSUM_NONE:
		return BLK_INTEGRITY_CSUM_NONE;
	case LBMD_PI_CSUM_IP:
		return BLK_INTEGRITY_CSUM_IP;
	case LBMD_PI_CSUM_CRC16_T10DIF:
		return BLK_INTEGRITY_CSUM_CRC;
	case LBMD_PI_CSUM_CRC64_NVME:
		return BLK_INTEGRITY_CSUM_CRC64;
	default:
		WARN_ON_ONCE(1);
		return BLK_INTEGRITY_CSUM_NONE;
	}
}

static int ublk_validate_params(const struct ublk_device *ub)
{
	/* basic param is the only one which must be set */
	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
		const struct ublk_param_basic *p = &ub->params.basic;

		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
			return -EINVAL;

		/*
		 * 256M is a reasonable upper bound for physical block size,
		 * io_min and io_opt; it aligns with the maximum physical
		 * block size possible in NVMe.
		 */
		if (p->physical_bs_shift > ilog2(SZ_256M))
			return -EINVAL;

		if (p->io_min_shift > ilog2(SZ_256M))
			return -EINVAL;

		if (p->io_opt_shift > ilog2(SZ_256M))
			return -EINVAL;

		if (p->logical_bs_shift > p->physical_bs_shift)
			return -EINVAL;

		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
			return -EINVAL;

		if (p->max_sectors < PAGE_SECTORS)
			return -EINVAL;

		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
			return -EINVAL;
	} else
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *p = &ub->params.discard;

		/* So far, only single segment discard is supported */
		if (p->max_discard_sectors && p->max_discard_segments != 1)
			return -EINVAL;

		if (!p->discard_granularity)
			return -EINVAL;
	}

	/* dev_t is read-only */
	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		return ublk_dev_param_zoned_validate(ub);
	else if (ublk_dev_is_zoned(ub))
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
		const struct ublk_param_dma_align *p = &ub->params.dma;

		if (p->alignment >= PAGE_SIZE)
			return -EINVAL;

		if (!is_power_of_2(p->alignment + 1))
			return -EINVAL;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
		const struct ublk_param_segment *p = &ub->params.seg;

		if (!is_power_of_2(p->seg_boundary_mask + 1))
			return -EINVAL;

		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
			return -EINVAL;
		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
			return -EINVAL;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
		const struct ublk_param_integrity *p = &ub->params.integrity;
		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
		int flags = ublk_integrity_flags(p->flags);

		if (!ublk_dev_support_integrity(ub))
			return -EINVAL;
		if (flags < 0)
			return flags;
		if (pi_tuple_size < 0)
			return pi_tuple_size;
		if (!p->metadata_size)
			return -EINVAL;
		if (p->csum_type == LBMD_PI_CSUM_NONE &&
		    p->flags & LBMD_PI_CAP_REFTAG)
			return -EINVAL;
		if (p->pi_offset + pi_tuple_size > p->metadata_size)
			return -EINVAL;
		if (p->interval_exp < SECTOR_SHIFT ||
		    p->interval_exp > ub->params.basic.logical_bs_shift)
			return -EINVAL;
	}

	return 0;
}

static void ublk_apply_params(struct ublk_device *ub)
{
	ublk_dev_param_basic_apply(ub);

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		ublk_dev_param_zoned_apply(ub);
}

static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
	       !ublk_support_auto_buf_reg(ubq);
}

static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
{
	return !ublk_dev_support_user_copy(ub) &&
	       !ublk_dev_support_zero_copy(ub) &&
	       !ublk_dev_support_auto_buf_reg(ub);
}

static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
	/*
	 * read()/write() is involved in user copy, so a request reference
	 * has to be grabbed.
	 *
	 * For zero copy, the request buffer needs to be registered to the
	 * io_uring buffer table, so a reference is needed.
	 *
	 * For auto buffer register, the ublk server may still issue
	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is
	 * used up, so a reference is required too.
	 */
	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
	       ublk_support_auto_buf_reg(ubq);
}

static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
{
	return ublk_dev_support_user_copy(ub) ||
	       ublk_dev_support_zero_copy(ub) ||
	       ublk_dev_support_auto_buf_reg(ub);
}

/*
 * ublk IO Reference Counting Design
 * ==================================
 *
 * For user-copy and zero-copy modes, ublk uses a split reference model with
 * two counters that together track IO lifetime:
 *
 * - io->ref: refcount for off-task buffer registrations and user-copy ops
 * - io->task_registered_buffers: count of buffers registered on the IO task
 *
 * Key Invariant:
 * --------------
 * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
 * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
 * when no active references exist. After IO completion, both counters become
 * zero. For I/Os not currently dispatched to the ublk server, both ref and
 * task_registered_buffers are 0.
 *
 * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
 * exit to determine if all references have been released.
 *
 * Why Split Counters:
 * -------------------
 * Buffers registered on the IO daemon task can use the lightweight
 * task_registered_buffers counter (simple increment/decrement) instead of
 * atomic refcount operations. The ublk_io_release() callback checks if
 * current == io->task to decide which counter to update.
 *
 * This optimization only applies before IO completion. At completion,
 * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
 * After that, all subsequent buffer unregistrations must use the atomic ref
 * since they may be releasing the last reference.
 *
 * Reference Lifecycle:
 * --------------------
 * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
 *
 * 2. During IO processing:
 *    - On-task buffer reg: task_registered_buffers++ (no ref change)
 *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
 *    - Buffer unregister callback (ublk_io_release):
 *      * If on-task: task_registered_buffers--
 *      * If off-task: ref-- via ublk_put_req_ref()
 *
 * 3. ublk_sub_req_ref() at IO completion:
 *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
 *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
 *    - This effectively collapses task_registered_buffers into the atomic
 *      ref, accounting for the initial UBLK_REFCOUNT_INIT minus any
 *      on-task buffers that were already counted
 *
 * Example (zero-copy, register on-task, unregister off-task):
 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
 * - Register buffer on-task: task_registered_buffers = 1
 * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
 * - Completion via ublk_sub_req_ref():
 *   sub_refs = UBLK_REFCOUNT_INIT - 1,
 *   ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
 *
 * Example (auto buffer registration):
 * Auto buffer registration sets task_registered_buffers = 1 at dispatch.
 *
 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
 * - Buffer unregister: task_registered_buffers-- (becomes 0)
 * - Completion via ublk_sub_req_ref():
 *   sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
 *
 * Example (zero-copy, ublk server killed):
 * When the daemon is killed, io_uring cleanup unregisters buffers off-task.
 * ublk_check_and_reset_active_ref() waits for the invariant to hold.
 *
 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
 * - Register buffer on-task: task_registered_buffers = 1
 * - Daemon killed, io_uring cleanup unregisters the buffer (off-task):
 *   ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
 * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
 * - The sum equals UBLK_REFCOUNT_INIT, so both counters are zeroed by
 *   ublk_check_and_reset_active_ref(), and ublk_abort_queue() can proceed
 *   to abort pending requests
 *
 * Batch IO Special Case:
 * ----------------------
 * In batch IO mode, io->task is NULL. This means ublk_io_release() always
 * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
 * task_registered_buffers counter still tracks registered buffers for the
 * invariant check, even though the callback doesn't decrement it.
 *
 * Note: updating task_registered_buffers is protected by io->lock.
 */
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
		struct ublk_io *io)
{
	if (ublk_need_req_ref(ubq))
		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
}

static inline bool ublk_get_req_ref(struct ublk_io *io)
{
	return refcount_inc_not_zero(&io->ref);
}

static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
{
	if (!refcount_dec_and_test(&io->ref))
		return;

	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
	__ublk_complete_rq(req, io, false, NULL);
}

static inline bool ublk_sub_req_ref(struct ublk_io *io)
{
	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;

	io->task_registered_buffers = 0;
	return refcount_sub_and_test(sub_refs, &io->ref);
}

static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_NEED_GET_DATA;
}

static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
}

/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
		return ub;
	return NULL;
}

/* Called in slow path only, keep it noinline for trace purpose */
static noinline void ublk_put_device(struct ublk_device *ub)
{
	put_device(&ub->cdev_dev);
}

static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
		int qid)
{
	return dev->queues[qid];
}

static inline bool ublk_rq_has_data(const struct request *rq)
{
	return bio_has_data(rq->bio);
}

static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
	return ublk_get_queue(ub, q_id)->io_cmd_buf;
}

static inline int __ublk_queue_cmd_buf_size(int depth)
{
	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
}

static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
{
	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
}

static int ublk_max_cmd_buf_size(void)
{
	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
}
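
/*
 * Worked example for the command buffer sizing above, assuming the
 * 24-byte struct ublksrv_io_desc from the uapi header and 4 KiB pages:
 * a queue depth of 128 needs 128 * 24 = 3072 bytes, which rounds up to
 * one 4096-byte page.
 */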

/*
 * Should I/O outstanding to the ublk server be reissued when it exits?
 * If not, outstanding I/O will get errors.
 */
static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
{
	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
}

/*
 * Should I/O issued while there is no ublk server be queued? If not, I/O
 * issued while there is no ublk server will get errors.
 */
static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
{
	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
}

/*
 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
 * of the device flags for a smaller cache footprint - better for fast
 * paths.
 */
static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
{
	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
}

/*
 * Should ublk devices be stopped (i.e. no recovery possible) when the
 * ublk server exits? If not, devices can be used again by a future
 * incarnation of a ublk server via the start_recovery/end_recovery
 * commands.
 */
static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
{
	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
}

static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
{
	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
}
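
/*
 * Summary of the recovery-flag helpers above (behavior when the ublk
 * server exits):
 *
 *	(no UBLK_F_USER_RECOVERY)		stop the device
 *	UBLK_F_USER_RECOVERY			queue I/O, wait for recovery
 *	 + UBLK_F_USER_RECOVERY_REISSUE		also reissue outstanding I/O
 *	 + UBLK_F_USER_RECOVERY_FAIL_IO		fail I/O instead of queueing
 */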

static void ublk_free_disk(struct gendisk *disk)
{
	struct ublk_device *ub = disk->private_data;

	clear_bit(UB_STATE_USED, &ub->state);
	ublk_put_device(ub);
}

static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
		unsigned int *owner_gid)
{
	kuid_t uid;
	kgid_t gid;

	current_uid_gid(&uid, &gid);

	*owner_uid = from_kuid(&init_user_ns, uid);
	*owner_gid = from_kgid(&init_user_ns, gid);
}

static int ublk_open(struct gendisk *disk, blk_mode_t mode)
{
	struct ublk_device *ub = disk->private_data;

	if (capable(CAP_SYS_ADMIN))
		return 0;

	/*
	 * If it is an unprivileged device, only the owner can open
	 * the disk. Otherwise it could be a trap made by an evil user
	 * who deliberately grants this disk's privileges to other
	 * users.
	 *
	 * This is also reasonable given that anyone can create an
	 * unprivileged device without needing anyone else's grant.
	 */
	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		unsigned int curr_uid, curr_gid;

		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);

		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
		    ub->dev_info.owner_gid)
			return -EPERM;
	}

	if (ub->block_open)
		return -ENXIO;

	return 0;
}

static const struct block_device_operations ub_fops = {
	.owner =	THIS_MODULE,
	.open =		ublk_open,
	.free_disk =	ublk_free_disk,
	.report_zones =	ublk_report_zones,
};

static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
		struct iov_iter *uiter, int dir, size_t *done)
{
	unsigned len;
	void *bv_buf;
	size_t copied;

	if (*offset >= bv->bv_len) {
		*offset -= bv->bv_len;
		return true;
	}

	len = bv->bv_len - *offset;
	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
	/*
	 * Bio pages may originate from slab caches without a usercopy region
	 * (e.g. jbd2 frozen metadata buffers). This is the same data that
	 * the loop driver writes to its backing file - no exposure risk.
	 * The bvec length is always trusted, so the size check in
	 * check_copy_size() is not needed either. Use the unchecked
	 * helpers to avoid false positives on slab pages.
	 */
	if (dir == ITER_DEST)
		copied = _copy_to_iter(bv_buf, len, uiter);
	else
		copied = _copy_from_iter(bv_buf, len, uiter);

	kunmap_local(bv_buf);

	*done += copied;
	if (copied < len)
		return false;

	*offset = 0;
	return true;
}

/*
 * Copy data between request pages and the iov_iter; 'offset' is the
 * starting linear offset within the request.
 */
static size_t ublk_copy_user_pages(const struct request *req,
		unsigned offset, struct iov_iter *uiter, int dir)
{
	struct req_iterator iter;
	struct bio_vec bv;
	size_t done = 0;

	rq_for_each_segment(bv, req, iter) {
		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
			break;
	}
	return done;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static size_t ublk_copy_user_integrity(const struct request *req,
		unsigned offset, struct iov_iter *uiter, int dir)
{
	size_t done = 0;
	struct bio *bio = req->bio;
	struct bvec_iter iter;
	struct bio_vec iv;

	if (!blk_integrity_rq(req))
		return 0;

	bio_for_each_integrity_vec(iv, bio, iter) {
		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
			break;
	}

	return done;
}
#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
static size_t ublk_copy_user_integrity(const struct request *req,
		unsigned offset, struct iov_iter *uiter, int dir)
{
	return 0;
}
#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */

static inline bool ublk_need_map_req(const struct request *req)
{
	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
}

static inline bool ublk_need_unmap_req(const struct request *req)
{
	return ublk_rq_has_data(req) &&
	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}

static unsigned int ublk_map_io(const struct ublk_queue *ubq,
		const struct request *req,
		const struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (!ublk_need_map_io(ubq))
		return rq_bytes;

	/*
	 * Without zero copy, we delay copying WRITE request data into the
	 * ublksrv context; the big benefit is that pinning pages in the
	 * current context is pretty fast, see ublk_pin_user_pages.
	 */
	if (ublk_need_map_req(req)) {
		struct iov_iter iter;
		const int dir = ITER_DEST;

		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}

static unsigned int ublk_unmap_io(bool need_map,
		const struct request *req,
		const struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (!need_map)
		return rq_bytes;

	if (ublk_need_unmap_req(req)) {
		struct iov_iter iter;
		const int dir = ITER_SOURCE;

		WARN_ON_ONCE(io->res > rq_bytes);

		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}

static inline unsigned int ublk_req_build_flags(struct request *req)
{
	unsigned flags = 0;

	if (req->cmd_flags & REQ_FAILFAST_DEV)
		flags |= UBLK_IO_F_FAILFAST_DEV;

	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;

	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
		flags |= UBLK_IO_F_FAILFAST_DRIVER;

	if (req->cmd_flags & REQ_META)
		flags |= UBLK_IO_F_META;

	if (req->cmd_flags & REQ_FUA)
		flags |= UBLK_IO_F_FUA;

	if (req->cmd_flags & REQ_NOUNMAP)
		flags |= UBLK_IO_F_NOUNMAP;

	if (req->cmd_flags & REQ_SWAP)
		flags |= UBLK_IO_F_SWAP;

	if (blk_integrity_rq(req))
		flags |= UBLK_IO_F_INTEGRITY;

	return flags;
}

static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_READ:
		ublk_op = UBLK_IO_OP_READ;
		break;
	case REQ_OP_WRITE:
		ublk_op = UBLK_IO_OP_WRITE;
		break;
	case REQ_OP_FLUSH:
		ublk_op = UBLK_IO_OP_FLUSH;
		break;
	case REQ_OP_DISCARD:
		ublk_op = UBLK_IO_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
		break;
	default:
		if (ublk_queue_is_zoned(ubq))
			return ublk_setup_iod_zoned(ubq, req);
		return BLK_STS_IOERR;
	}

	/* need to translate since kernel op codes may change */
	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);

	/* Try a shmem zero-copy match before setting addr */
	if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
		u32 buf_idx, buf_off;

		if (ublk_try_buf_match(ubq->dev, req,
				       &buf_idx, &buf_off)) {
			iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
			iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
			return BLK_STS_OK;
		}
	}

	iod->addr = io->buf.addr;

	return BLK_STS_OK;
}

static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
}
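
/*
 * End a request with softirqs disabled; see the comment in
 * __ublk_complete_rq() about running bio->bi_end_io() with softirqs
 * disabled to avoid a deadlock on disk->open_mutex via blkdev_release().
 */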
static void ublk_end_request(struct request *req, blk_status_t error)
{
	local_bh_disable();
	blk_mq_end_request(req, error);
	local_bh_enable();
}

/* todo: handle partial completion */
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
				      bool need_map, struct io_comp_batch *iob)
{
	unsigned int unmapped_bytes;
	blk_status_t res = BLK_STS_OK;
	bool requeue;

	/* fail a READ IO if nothing has been read */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		res = errno_to_blk_status(io->res);
		goto exit;
	}

	/*
	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return any bytes,
	 * so end them directly.
	 *
	 * None of them needs unmapping.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
	    req_op(req) != REQ_OP_DRV_IN)
		goto exit;

	/* shmem zero copy: no data to unmap, pages already shared */
	if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
		goto exit;

	/* for a READ request, write the data in iod->addr to rq buffers */
	unmapped_bytes = ublk_unmap_io(need_map, req, io);

	/*
	 * Extremely unlikely since the data was filled in just before;
	 * simply re-read for this case.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	/*
	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
	 * happens off this path, then that will prevent ublk's blkdev_release()
	 * from being called on current's task work, see the fput() implementation.
	 *
	 * Otherwise, the ublk server may not provide forward progress when
	 * reading the partition table from bdev_open() with disk->open_mutex
	 * held, and cause a deadlock, as we could already be holding
	 * disk->open_mutex here.
	 *
	 * Preferably we would not be doing IO with a mutex held that is also
	 * used for release, but this work-around will suffice for now.
	 */
	local_bh_disable();
	requeue = blk_update_request(req, BLK_STS_OK, io->res);
	local_bh_enable();
	if (requeue)
		blk_mq_requeue_request(req, true);
	else if (likely(!blk_should_fake_timeout(req->q))) {
		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
			return;
		__blk_mq_end_request(req, BLK_STS_OK);
	}

	return;
exit:
	ublk_end_request(req, res);
}
1627
__ublk_prep_compl_io_cmd(struct ublk_io * io,struct request * req)1628 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1629 struct request *req)
1630 {
1631 /* read cmd first because req will overwrite it */
1632 struct io_uring_cmd *cmd = io->cmd;
1633
1634 /* mark this cmd owned by ublksrv */
1635 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1636
1637 /*
1638 * clear ACTIVE since we are done with this sqe/cmd slot
1639 * We can only accept io cmd in case of being not active.
1640 */
1641 io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1642
1643 io->req = req;
1644 return cmd;
1645 }
1646
ublk_complete_io_cmd(struct ublk_io * io,struct request * req,int res,unsigned issue_flags)1647 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1648 int res, unsigned issue_flags)
1649 {
1650 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1651
1652 /* tell ublksrv one io request is coming */
1653 io_uring_cmd_done(cmd, res, issue_flags);
1654 }
1655
1656 #define UBLK_REQUEUE_DELAY_MS 3
1657
__ublk_abort_rq(struct ublk_queue * ubq,struct request * rq)1658 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1659 struct request *rq)
1660 {
1661 /* We cannot process this rq so just requeue it. */
1662 if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1663 blk_mq_requeue_request(rq, false);
1664 else
1665 ublk_end_request(rq, BLK_STS_IOERR);
1666 }
1667
1668 static void
ublk_auto_buf_reg_fallback(const struct ublk_queue * ubq,unsigned tag)1669 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1670 {
1671 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1672
1673 iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1674 }
1675
1676 enum auto_buf_reg_res {
1677 AUTO_BUF_REG_FAIL,
1678 AUTO_BUF_REG_FALLBACK,
1679 AUTO_BUF_REG_OK,
1680 };
1681
1682 /*
1683 * Setup io state after auto buffer registration.
1684 *
1685 * Must be called after ublk_auto_buf_register() is done.
1686 * Caller must hold io->lock in batch context.
1687 */
ublk_auto_buf_io_setup(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io,struct io_uring_cmd * cmd,enum auto_buf_reg_res res)1688 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1689 struct request *req, struct ublk_io *io,
1690 struct io_uring_cmd *cmd,
1691 enum auto_buf_reg_res res)
1692 {
1693 if (res == AUTO_BUF_REG_OK) {
1694 io->task_registered_buffers = 1;
1695 io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1696 io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1697 }
1698 ublk_init_req_ref(ubq, io);
1699 __ublk_prep_compl_io_cmd(io, req);
1700 }
1701
1702 /* Register request bvec to io_uring for auto buffer registration. */
1703 static enum auto_buf_reg_res
ublk_auto_buf_register(const struct ublk_queue * ubq,struct request * req,struct ublk_io * io,struct io_uring_cmd * cmd,unsigned int issue_flags)1704 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1705 struct ublk_io *io, struct io_uring_cmd *cmd,
1706 unsigned int issue_flags)
1707 {
1708 int ret;
1709
1710 ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1711 io->buf.auto_reg.index, issue_flags);
1712 if (ret) {
1713 if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1714 ublk_auto_buf_reg_fallback(ubq, req->tag);
1715 return AUTO_BUF_REG_FALLBACK;
1716 }
1717 ublk_end_request(req, BLK_STS_IOERR);
1718 return AUTO_BUF_REG_FAIL;
1719 }
1720
1721 return AUTO_BUF_REG_OK;
1722 }
1723
1724 /*
1725 * Dispatch IO to userspace with auto buffer registration.
1726 *
1727 * Only called in non-batch context from task work, io->lock not held.
1728 */
1729 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1730 struct request *req, struct ublk_io *io,
1731 struct io_uring_cmd *cmd,
1732 unsigned int issue_flags)
1733 {
1734 enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1735 issue_flags);
1736
1737 if (res != AUTO_BUF_REG_FAIL) {
1738 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1739 io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1740 }
1741 }
1742
1743 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1744 struct ublk_io *io)
1745 {
1746 unsigned mapped_bytes;
1747
1748 /* shmem zero copy: skip data copy, pages already shared */
1749 if (ublk_iod_is_shmem_zc(ubq, req->tag))
1750 return true;
1751
1752 mapped_bytes = ublk_map_io(ubq, req, io);
1753
1754 /* partially mapped, update io descriptor */
1755 if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1756 /*
1757 * Nothing was mapped; retry until we succeed.
1758 *
1759 * We may never succeed in mapping any bytes here because
1760 * of OOM. TODO: reserve one buffer with a single pinned page
1761 * to provide a forward-progress guarantee.
1762 */
1763 if (unlikely(!mapped_bytes)) {
1764 blk_mq_requeue_request(req, false);
1765 blk_mq_delay_kick_requeue_list(req->q,
1766 UBLK_REQUEUE_DELAY_MS);
1767 return false;
1768 }
1769
1770 ublk_get_iod(ubq, req->tag)->nr_sectors =
1771 mapped_bytes >> 9;
1772 }
1773
1774 return true;
1775 }
1776
1777 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1778 {
1779 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1780 int tag = req->tag;
1781 struct ublk_io *io = &ubq->ios[tag];
1782
1783 pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1784 __func__, ubq->q_id, req->tag, io->flags,
1785 ublk_get_iod(ubq, req->tag)->addr);
1786
1787 /*
1788 * Task is exiting if either:
1789 *
1790 * (1) current != io->task.
1791 * io_uring_cmd_complete_in_task() tries to run task_work
1792 * in a workqueue if cmd's task is PF_EXITING.
1793 *
1794 * (2) current->flags & PF_EXITING.
1795 */
1796 if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1797 __ublk_abort_rq(ubq, req);
1798 return;
1799 }
1800
1801 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1802 /*
1803 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1804 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1805 * and notify it.
1806 */
1807 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1808 pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1809 __func__, ubq->q_id, req->tag, io->flags);
1810 ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1811 issue_flags);
1812 return;
1813 }
1814
1815 if (!ublk_start_io(ubq, req, io))
1816 return;
1817
1818 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1819 ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1820 } else {
1821 ublk_init_req_ref(ubq, io);
1822 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1823 }
1824 }
1825
1826 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1827 const struct ublk_batch_io_data *data,
1828 unsigned short tag)
1829 {
1830 struct ublk_device *ub = data->ub;
1831 struct ublk_io *io = &ubq->ios[tag];
1832 struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1833 enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1834 struct io_uring_cmd *cmd = data->cmd;
1835
1836 if (!ublk_start_io(ubq, req, io))
1837 return false;
1838
1839 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1840 res = ublk_auto_buf_register(ubq, req, io, cmd,
1841 data->issue_flags);
1842
1843 if (res == AUTO_BUF_REG_FAIL)
1844 return false;
1845 }
1846
1847 ublk_io_lock(io);
1848 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1849 ublk_io_unlock(io);
1850
1851 return true;
1852 }
1853
1854 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1855 const struct ublk_batch_io_data *data,
1856 unsigned short *tag_buf,
1857 unsigned int len)
1858 {
1859 bool has_unused = false;
1860 unsigned int i;
1861
1862 for (i = 0; i < len; i++) {
1863 unsigned short tag = tag_buf[i];
1864
1865 if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1866 tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1867 has_unused = true;
1868 }
1869 }
1870
1871 return has_unused;
1872 }
1873
1874 /*
1875 * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1876 * Returns the new length after filtering.
1877 */
1878 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1879 unsigned int len)
1880 {
1881 unsigned int i, j;
1882
1883 for (i = 0, j = 0; i < len; i++) {
1884 if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1885 if (i != j)
1886 tag_buf[j] = tag_buf[i];
1887 j++;
1888 }
1889 }
1890
1891 return j;
1892 }
1893
1894 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1895 const struct ublk_batch_io_data *data,
1896 unsigned short *tag_buf, size_t len, int ret)
1897 {
1898 int i, res;
1899
1900 /*
1901 * Undo prep state for all IOs since userspace never received them.
1902 * This restores IOs to pre-prepared state so they can be cleanly
1903 * re-prepared when tags are pulled from FIFO again.
1904 */
1905 for (i = 0; i < len; i++) {
1906 struct ublk_io *io = &ubq->ios[tag_buf[i]];
1907 int index = -1;
1908
1909 ublk_io_lock(io);
1910 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1911 index = io->buf.auto_reg.index;
1912 io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1913 io->flags |= UBLK_IO_FLAG_ACTIVE;
1914 ublk_io_unlock(io);
1915
1916 if (index != -1)
1917 io_buffer_unregister_bvec(data->cmd, index,
1918 data->issue_flags);
1919 }
1920
1921 res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1922 tag_buf, len, &ubq->evts_lock);
1923
1924 pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1925 "tags(%d %zu) ret %d\n", __func__, res, len,
1926 ret);
1927 }
1928
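/* Max tags pulled from the event kfifo per batch; bounds the on-stack tag buffer */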
1929 #define MAX_NR_TAG 128
1930 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1931 const struct ublk_batch_io_data *data,
1932 struct ublk_batch_fetch_cmd *fcmd)
1933 {
1934 const unsigned int tag_sz = sizeof(unsigned short);
1935 unsigned short tag_buf[MAX_NR_TAG];
1936 struct io_br_sel sel;
1937 size_t len = 0;
1938 bool needs_filter;
1939 int ret;
1940
1941 WARN_ON_ONCE(data->cmd != fcmd->cmd);
1942
1943 sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1944 data->issue_flags);
1945 if (sel.val < 0)
1946 return sel.val;
1947 if (!sel.addr)
1948 return -ENOBUFS;
1949
1950 /* a single reader needs no lock; each kfifo element is 2 bytes */
1951 len = min(len, sizeof(tag_buf)) / tag_sz;
1952 len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1953
1954 needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1955 /* Filter out unused tags before posting to userspace */
1956 if (unlikely(needs_filter)) {
1957 int new_len = ublk_filter_unused_tags(tag_buf, len);
1958
1959 /* return the consumed length if all tags failed or were requeued */
1960 if (!new_len) {
1961 /* release the selected buffer */
1962 sel.val = 0;
1963 WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1964 &sel, data->issue_flags));
1965 return len;
1966 }
1967 len = new_len;
1968 }
1969
1970 sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1971 ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1972 if (unlikely(ret < 0))
1973 ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1974 return ret;
1975 }
1976
1977 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1978 struct ublk_queue *ubq)
1979 {
1980 struct ublk_batch_fetch_cmd *fcmd;
1981
1982 lockdep_assert_held(&ubq->evts_lock);
1983
1984 /*
1985 * Order updating ubq->evts_fifo against checking ubq->active_fcmd.
1986 *
1987 * This pairs with the smp_mb() in ublk_batch_dispatch().
1988 *
1989 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
1990 * are visible in ublk_batch_dispatch() thanks to the barrier pairing.
1991 */
1992 smp_mb();
1993 if (READ_ONCE(ubq->active_fcmd)) {
1994 fcmd = NULL;
1995 } else {
1996 fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1997 struct ublk_batch_fetch_cmd, node);
1998 WRITE_ONCE(ubq->active_fcmd, fcmd);
1999 }
2000 return fcmd;
2001 }
2002
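/* Task-work callback for batch mode: dispatch pending event tags via the fetch command */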
2003 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2004 {
2005 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
2006 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2007 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2008 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2009 struct ublk_batch_io_data data = {
2010 .ub = pdu->ubq->dev,
2011 .cmd = fcmd->cmd,
2012 .issue_flags = issue_flags,
2013 };
2014
2015 WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
2016
2017 ublk_batch_dispatch(pdu->ubq, &data, fcmd);
2018 }
2019
2020 static void
2021 ublk_batch_dispatch(struct ublk_queue *ubq,
2022 const struct ublk_batch_io_data *data,
2023 struct ublk_batch_fetch_cmd *fcmd)
2024 {
2025 struct ublk_batch_fetch_cmd *new_fcmd;
2026 unsigned tried = 0;
2027 int ret = 0;
2028
2029 again:
2030 while (!ublk_io_evts_empty(ubq)) {
2031 ret = __ublk_batch_dispatch(ubq, data, fcmd);
2032 if (ret <= 0)
2033 break;
2034 }
2035
2036 if (ret < 0) {
2037 ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
2038 return;
2039 }
2040
2041 __ublk_release_fcmd(ubq);
2042 /*
2043 * Order clearing ubq->active_fcmd in __ublk_release_fcmd() against
2044 * checking ubq->evts_fifo.
2045 *
2046 * This pairs with the smp_mb() in __ublk_acquire_fcmd().
2047 */
2048 smp_mb();
2049 if (likely(ublk_io_evts_empty(ubq)))
2050 return;
2051
2052 spin_lock(&ubq->evts_lock);
2053 new_fcmd = __ublk_acquire_fcmd(ubq);
2054 spin_unlock(&ubq->evts_lock);
2055
2056 if (!new_fcmd)
2057 return;
2058
2059 /* Avoid lockup by handling at most 32 batches in one invocation */
2060 if (new_fcmd == fcmd && tried++ < 32)
2061 goto again;
2062
2063 io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
2064 }
2065
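/* Task-work callback for per-io mode: dispatch the request stashed in the cmd pdu */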
2066 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2067 {
2068 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2069 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2070 struct ublk_queue *ubq = pdu->ubq;
2071
2072 ublk_dispatch_req(ubq, pdu->req);
2073 }
2074
2075 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2076 {
2077 unsigned short tag = rq->tag;
2078 struct ublk_batch_fetch_cmd *fcmd = NULL;
2079
2080 spin_lock(&ubq->evts_lock);
2081 kfifo_put(&ubq->evts_fifo, tag);
2082 if (last)
2083 fcmd = __ublk_acquire_fcmd(ubq);
2084 spin_unlock(&ubq->evts_lock);
2085
2086 if (fcmd)
2087 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2088 }
2089
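/* Hand one request to the daemon by scheduling task work on its fetch uring_cmd */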
2090 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2091 {
2092 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2093 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2094
2095 pdu->req = rq;
2096 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2097 }
2098
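/* Task-work callback: walk the singly linked pdu->req_list and dispatch each request */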
2099 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2100 {
2101 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2102 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2103 struct request *rq = pdu->req_list;
2104 struct request *next;
2105
2106 do {
2107 next = rq->rq_next;
2108 rq->rq_next = NULL;
2109 ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2110 rq = next;
2111 } while (rq);
2112 }
2113
2114 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2115 {
2116 struct io_uring_cmd *cmd = io->cmd;
2117 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2118
2119 pdu->req_list = rq_list_peek(l);
2120 rq_list_init(l);
2121 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2122 }
2123
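/* Timeout handler: for unprivileged devices, SIGKILL the stuck daemon instead of resetting the timer forever */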
2124 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2125 {
2126 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2127 pid_t tgid = ubq->dev->ublksrv_tgid;
2128 struct task_struct *p;
2129 struct pid *pid;
2130
2131 if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2132 return BLK_EH_RESET_TIMER;
2133
2134 if (unlikely(!tgid))
2135 return BLK_EH_RESET_TIMER;
2136
2137 rcu_read_lock();
2138 pid = find_vpid(tgid);
2139 p = pid_task(pid, PIDTYPE_PID);
2140 if (p)
2141 send_sig(SIGKILL, p, 0);
2142 rcu_read_unlock();
2143 return BLK_EH_DONE;
2144 }
2145
2146 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2147 bool check_cancel)
2148 {
2149 blk_status_t res;
2150
2151 if (unlikely(READ_ONCE(ubq->fail_io)))
2152 return BLK_STS_TARGET;
2153
2154 /* With the recovery feature enabled, force_abort is set in
2155 * ublk_stop_dev() before calling del_gendisk(). We have to
2156 * abort all requeued and new rqs here to let del_gendisk()
2157 * move on. Besides, we must not call io_uring_cmd_complete_in_task(),
2158 * to avoid UAF on the io_uring ctx.
2159 *
2160 * Note: force_abort is guaranteed to be seen because it is set
2161 * before the request queue is unquiesced.
2162 */
2163 if (ublk_nosrv_should_queue_io(ubq) &&
2164 unlikely(READ_ONCE(ubq->force_abort)))
2165 return BLK_STS_IOERR;
2166
2167 if (check_cancel && unlikely(ubq->canceling))
2168 return BLK_STS_IOERR;
2169
2170 /* fill the iod into its slot in the io cmd buffer */
2171 res = ublk_setup_iod(ubq, rq);
2172 if (unlikely(res != BLK_STS_OK))
2173 return BLK_STS_IOERR;
2174
2175 blk_mq_start_request(rq);
2176 return BLK_STS_OK;
2177 }
2178
2179 /*
2180 * Common helper for queue_rq that handles request preparation and
2181 * cancellation checks. Returns status and sets should_queue to indicate
2182 * whether the caller should proceed with queuing the request.
2183 */
2184 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2185 struct request *rq,
2186 bool *should_queue)
2187 {
2188 blk_status_t res;
2189
2190 res = ublk_prep_req(ubq, rq, false);
2191 if (res != BLK_STS_OK) {
2192 *should_queue = false;
2193 return res;
2194 }
2195
2196 /*
2197 * ->canceling has to be handled after ->force_abort and ->fail_io
2198 * are dealt with, otherwise this request may not be failed during
2199 * recovery, causing a hang when deleting the disk
2200 */
2201 if (unlikely(ubq->canceling)) {
2202 *should_queue = false;
2203 __ublk_abort_rq(ubq, rq);
2204 return BLK_STS_OK;
2205 }
2206
2207 *should_queue = true;
2208 return BLK_STS_OK;
2209 }
2210
2211 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2212 const struct blk_mq_queue_data *bd)
2213 {
2214 struct ublk_queue *ubq = hctx->driver_data;
2215 struct request *rq = bd->rq;
2216 bool should_queue;
2217 blk_status_t res;
2218
2219 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2220 if (!should_queue)
2221 return res;
2222
2223 ublk_queue_cmd(ubq, rq);
2224 return BLK_STS_OK;
2225 }
2226
2227 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2228 const struct blk_mq_queue_data *bd)
2229 {
2230 struct ublk_queue *ubq = hctx->driver_data;
2231 struct request *rq = bd->rq;
2232 bool should_queue;
2233 blk_status_t res;
2234
2235 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2236 if (!should_queue)
2237 return res;
2238
2239 ublk_batch_queue_cmd(ubq, rq, bd->last);
2240 return BLK_STS_OK;
2241 }
2242
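/*
 * Two requests may share one task-work batch only if their cmds target the
 * same io_uring context and the same daemon task.
 */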
2243 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2244 const struct ublk_io *io2)
2245 {
2246 return (io_uring_cmd_ctx_handle(io->cmd) ==
2247 io_uring_cmd_ctx_handle(io2->cmd)) &&
2248 (io->task == io2->task);
2249 }
2250
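/* ->commit_rqs(): tags may have been queued without @last set, so kick the fetch cmd now */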
2251 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2252 {
2253 struct ublk_queue *ubq = hctx->driver_data;
2254 struct ublk_batch_fetch_cmd *fcmd;
2255
2256 spin_lock(&ubq->evts_lock);
2257 fcmd = __ublk_acquire_fcmd(ubq);
2258 spin_unlock(&ubq->evts_lock);
2259
2260 if (fcmd)
2261 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2262 }
2263
2264 static void ublk_queue_rqs(struct rq_list *rqlist)
2265 {
2266 struct rq_list requeue_list = { };
2267 struct rq_list submit_list = { };
2268 struct ublk_io *io = NULL;
2269 struct request *req;
2270
2271 while ((req = rq_list_pop(rqlist))) {
2272 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2273 struct ublk_io *this_io = &this_q->ios[req->tag];
2274
2275 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2276 rq_list_add_tail(&requeue_list, req);
2277 continue;
2278 }
2279
2280 if (io && !ublk_belong_to_same_batch(io, this_io) &&
2281 !rq_list_empty(&submit_list))
2282 ublk_queue_cmd_list(io, &submit_list);
2283 io = this_io;
2284 rq_list_add_tail(&submit_list, req);
2285 }
2286
2287 if (!rq_list_empty(&submit_list))
2288 ublk_queue_cmd_list(io, &submit_list);
2289 *rqlist = requeue_list;
2290 }
2291
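/* Push all tags of the list into the event kfifo in MAX_NR_TAG chunks, then kick the fetch cmd */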
2292 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2293 {
2294 unsigned short tags[MAX_NR_TAG];
2295 struct ublk_batch_fetch_cmd *fcmd;
2296 struct request *rq;
2297 unsigned cnt = 0;
2298
2299 spin_lock(&ubq->evts_lock);
2300 rq_list_for_each(l, rq) {
2301 tags[cnt++] = (unsigned short)rq->tag;
2302 if (cnt >= MAX_NR_TAG) {
2303 kfifo_in(&ubq->evts_fifo, tags, cnt);
2304 cnt = 0;
2305 }
2306 }
2307 if (cnt)
2308 kfifo_in(&ubq->evts_fifo, tags, cnt);
2309 fcmd = __ublk_acquire_fcmd(ubq);
2310 spin_unlock(&ubq->evts_lock);
2311
2312 rq_list_init(l);
2313 if (fcmd)
2314 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2315 }
2316
2317 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2318 {
2319 struct rq_list requeue_list = { };
2320 struct rq_list submit_list = { };
2321 struct ublk_queue *ubq = NULL;
2322 struct request *req;
2323
2324 while ((req = rq_list_pop(rqlist))) {
2325 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2326
2327 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2328 rq_list_add_tail(&requeue_list, req);
2329 continue;
2330 }
2331
2332 if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2333 ublk_batch_queue_cmd_list(ubq, &submit_list);
2334 ubq = this_q;
2335 rq_list_add_tail(&submit_list, req);
2336 }
2337
2338 if (!rq_list_empty(&submit_list))
2339 ublk_batch_queue_cmd_list(ubq, &submit_list);
2340 *rqlist = requeue_list;
2341 }
2342
2343 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2344 unsigned int hctx_idx)
2345 {
2346 struct ublk_device *ub = driver_data;
2347 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2348
2349 hctx->driver_data = ubq;
2350 return 0;
2351 }
2352
2353 static const struct blk_mq_ops ublk_mq_ops = {
2354 .queue_rq = ublk_queue_rq,
2355 .queue_rqs = ublk_queue_rqs,
2356 .init_hctx = ublk_init_hctx,
2357 .timeout = ublk_timeout,
2358 };
2359
2360 static const struct blk_mq_ops ublk_batch_mq_ops = {
2361 .commit_rqs = ublk_commit_rqs,
2362 .queue_rq = ublk_batch_queue_rq,
2363 .queue_rqs = ublk_batch_queue_rqs,
2364 .init_hctx = ublk_init_hctx,
2365 .timeout = ublk_timeout,
2366 };
2367
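/* Reset per-io state so a new daemon can attach; called with ubq->cancel_lock held */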
2368 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2369 {
2370 int i;
2371
2372 ubq->nr_io_ready = 0;
2373
2374 for (i = 0; i < ubq->q_depth; i++) {
2375 struct ublk_io *io = &ubq->ios[i];
2376
2377 /*
2378 * UBLK_IO_FLAG_CANCELED is kept to avoid touching
2379 * io->cmd
2380 */
2381 io->flags &= UBLK_IO_FLAG_CANCELED;
2382 io->cmd = NULL;
2383 io->buf.addr = 0;
2384
2385 /*
2386 * The old task is PF_EXITING; put it now.
2387 *
2388 * It may be NULL when closing a quiesced
2389 * device.
2390 */
2391 if (io->task) {
2392 put_task_struct(io->task);
2393 io->task = NULL;
2394 }
2395
2396 WARN_ON_ONCE(refcount_read(&io->ref));
2397 WARN_ON_ONCE(io->task_registered_buffers);
2398 }
2399 }
2400
2401 static int ublk_ch_open(struct inode *inode, struct file *filp)
2402 {
2403 struct ublk_device *ub = container_of(inode->i_cdev,
2404 struct ublk_device, cdev);
2405
2406 if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2407 return -EBUSY;
2408 filp->private_data = ub;
2409 ub->ublksrv_tgid = current->tgid;
2410 return 0;
2411 }
2412
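/* Reset device-wide daemon state after the ublk char device is closed */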
2413 static void ublk_reset_ch_dev(struct ublk_device *ub)
2414 {
2415 int i;
2416
2417 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2418 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2419
2420 /* Sync with ublk_cancel_cmd() */
2421 spin_lock(&ubq->cancel_lock);
2422 ublk_queue_reinit(ub, ubq);
2423 spin_unlock(&ubq->cancel_lock);
2424 }
2425
2426 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2427 ub->mm = NULL;
2428 ub->nr_queue_ready = 0;
2429 ub->unprivileged_daemons = false;
2430 ub->ublksrv_tgid = -1;
2431 }
2432
2433 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2434 {
2435 struct gendisk *disk;
2436
2437 spin_lock(&ub->lock);
2438 disk = ub->ub_disk;
2439 if (disk)
2440 get_device(disk_to_dev(disk));
2441 spin_unlock(&ub->lock);
2442
2443 return disk;
2444 }
2445
2446 static void ublk_put_disk(struct gendisk *disk)
2447 {
2448 if (disk)
2449 put_device(disk_to_dev(disk));
2450 }
2451
2452 static void ublk_partition_scan_work(struct work_struct *work)
2453 {
2454 struct ublk_device *ub =
2455 container_of(work, struct ublk_device, partition_scan_work);
2456 /* Hold disk reference to prevent UAF during concurrent teardown */
2457 struct gendisk *disk = ublk_get_disk(ub);
2458
2459 if (!disk)
2460 return;
2461
2462 if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2463 &disk->state)))
2464 goto out;
2465
2466 mutex_lock(&disk->open_mutex);
2467 bdev_disk_changed(disk, false);
2468 mutex_unlock(&disk->open_mutex);
2469 out:
2470 ublk_put_disk(disk);
2471 }
2472
2473 /*
2474 * Use this function to ensure that ->canceling is consistently set for
2475 * the device and all queues. Do not set these flags directly.
2476 *
2477 * Caller must ensure that:
2478 * - cancel_mutex is held. This ensures that there is no concurrent
2479 * access to ub->canceling and no concurrent writes to ubq->canceling.
2480 * - there are no concurrent reads of ubq->canceling from the queue_rq
2481 * path. This can be done by quiescing the queue, or through other
2482 * means.
2483 */
2484 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2485 __must_hold(&ub->cancel_mutex)
2486 {
2487 int i;
2488
2489 ub->canceling = canceling;
2490 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2491 ublk_get_queue(ub, i)->canceling = canceling;
2492 }
2493
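/*
 * Return true if any io still holds extra (buffer) references; otherwise
 * reset all per-io reference state to zero.
 */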
2494 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2495 {
2496 int i, j;
2497
2498 if (!ublk_dev_need_req_ref(ub))
2499 return false;
2500
2501 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2502 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2503
2504 for (j = 0; j < ubq->q_depth; j++) {
2505 struct ublk_io *io = &ubq->ios[j];
2506 unsigned int refs = refcount_read(&io->ref) +
2507 io->task_registered_buffers;
2508
2509 /*
2510 * UBLK_REFCOUNT_INIT or zero means no active
2511 * reference
2512 */
2513 if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2514 return true;
2515
2516 /* reset to zero if the io has no active references */
2517 refcount_set(&io->ref, 0);
2518 io->task_registered_buffers = 0;
2519 }
2520 }
2521 return false;
2522 }
2523
2524 static void ublk_ch_release_work_fn(struct work_struct *work)
2525 {
2526 struct ublk_device *ub =
2527 container_of(work, struct ublk_device, exit_work.work);
2528 struct gendisk *disk;
2529 int i;
2530
2531 /*
2532 * For zero-copy and auto buffer register modes, I/O references
2533 * might not be dropped naturally when the daemon is killed, but
2534 * io_uring guarantees that registered bvec kernel buffers are
2535 * eventually unregistered when the io_uring context is freed, at
2536 * which point the active references are dropped.
2537 *
2538 * Wait until the active references are dropped to avoid use-after-free.
2539 *
2540 * A registered buffer may be unregistered in io_uring's release
2541 * handler, so wait by scheduling this work function to avoid a
2542 * release dependency between the two files.
2543 */
2544 if (ublk_check_and_reset_active_ref(ub)) {
2545 schedule_delayed_work(&ub->exit_work, 1);
2546 return;
2547 }
2548
2549 /*
2550 * The disk isn't attached yet: either the device isn't live, or it
2551 * has already been removed, so there is nothing to do.
2552 */
2553 disk = ublk_get_disk(ub);
2554 if (!disk)
2555 goto out;
2556
2557 /*
2558 * All uring_cmds are done now, so abort any request outstanding to
2559 * the ublk server.
2560 *
2561 * This can be done in a lockless way because the ublk server is
2562 * gone.
2563 *
2564 * More importantly, we have to provide a forward-progress guarantee
2565 * without holding ub->mutex, otherwise the control task grabbing
2566 * ub->mutex triggers deadlock.
2567 *
2568 * All requests may be inflight, so ->canceling may not be set yet;
2569 * set it now.
2570 */
2571 mutex_lock(&ub->cancel_mutex);
2572 ublk_set_canceling(ub, true);
2573 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2574 ublk_abort_queue(ub, ublk_get_queue(ub, i));
2575 mutex_unlock(&ub->cancel_mutex);
2576 blk_mq_kick_requeue_list(disk->queue);
2577
2578 /*
2579 * All inflight requests have been completed or requeued, and any new
2580 * request will be failed or requeued via `->canceling` now, so it is
2581 * fine to grab ub->mutex.
2582 */
2583 mutex_lock(&ub->mutex);
2584
2585 /* double check after grabbing lock */
2586 if (!ub->ub_disk)
2587 goto unlock;
2588
2589 /*
2590 * Transition the device to the nosrv state. What exactly this
2591 * means depends on the recovery flags
2592 */
2593 if (ublk_nosrv_should_stop_dev(ub)) {
2594 /*
2595 * Allow any pending/future I/O to pass through quickly
2596 * with an error. This is needed because del_gendisk
2597 * waits for all pending I/O to complete
2598 */
2599 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2600 WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2601
2602 ublk_stop_dev_unlocked(ub);
2603 } else {
2604 if (ublk_nosrv_dev_should_queue_io(ub)) {
2605 /* ->canceling is set and all requests are aborted */
2606 ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2607 } else {
2608 ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2609 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2610 WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2611 }
2612 }
2613 unlock:
2614 mutex_unlock(&ub->mutex);
2615 ublk_put_disk(disk);
2616
2617 /* all uring_cmds are done now, reset device & ubq */
2618 ublk_reset_ch_dev(ub);
2619 out:
2620 clear_bit(UB_STATE_OPEN, &ub->state);
2621
2622 /* put the reference grabbed in ublk_ch_release() */
2623 ublk_put_device(ub);
2624 }
2625
2626 static int ublk_ch_release(struct inode *inode, struct file *filp)
2627 {
2628 struct ublk_device *ub = filp->private_data;
2629
2630 /*
2631 * Grab a ublk device reference so it won't go away until we are
2632 * really released from the work function.
2633 */
2634 ublk_get_device(ub);
2635
2636 INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2637 schedule_delayed_work(&ub->exit_work, 0);
2638 return 0;
2639 }
2640
2641 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2642 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2643 {
2644 struct ublk_device *ub = filp->private_data;
2645 size_t sz = vma->vm_end - vma->vm_start;
2646 unsigned max_sz = ublk_max_cmd_buf_size();
2647 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2648 int q_id, ret = 0;
2649
2650 spin_lock(&ub->lock);
2651 if (!ub->mm)
2652 ub->mm = current->mm;
2653 if (current->mm != ub->mm)
2654 ret = -EINVAL;
2655 spin_unlock(&ub->lock);
2656
2657 if (ret)
2658 return ret;
2659
2660 if (vma->vm_flags & VM_WRITE)
2661 return -EPERM;
2662
2663 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2664 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2665 return -EINVAL;
2666
2667 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2668 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2669 __func__, q_id, current->pid, vma->vm_start,
2670 phys_off, (unsigned long)sz);
2671
2672 if (sz != ublk_queue_cmd_buf_size(ub))
2673 return -EINVAL;
2674
2675 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2676 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2677 }
2678
2679 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2680 struct request *req)
2681 {
2682 WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2683 io->flags & UBLK_IO_FLAG_ACTIVE);
2684
2685 if (ublk_nosrv_should_reissue_outstanding(ub))
2686 blk_mq_requeue_request(req, false);
2687 else {
2688 io->res = -EIO;
2689 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2690 }
2691 }
2692
2693 /*
2694 * Request tags may have just been filled into the event kfifo without
2695 * getting a chance to be dispatched; abort these requests too.
2696 */
2697 static void ublk_abort_batch_queue(struct ublk_device *ub,
2698 struct ublk_queue *ubq)
2699 {
2700 unsigned short tag;
2701
2702 while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2703 struct request *req = blk_mq_tag_to_rq(
2704 ub->tag_set.tags[ubq->q_id], tag);
2705
2706 if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2707 __ublk_fail_req(ub, &ubq->ios[tag], req);
2708 }
2709 }
2710
2711 /*
2712 * Called from the ublk char device release handler once every uring_cmd
2713 * is done; meanwhile the request queue is "quiesced" since all inflight
2714 * requests can't be completed because the ublk server is dead.
2715 *
2716 * So no one can hold our request IO reference any more; simply ignore
2717 * the reference and complete the request immediately.
2718 */
2719 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2720 {
2721 int i;
2722
2723 for (i = 0; i < ubq->q_depth; i++) {
2724 struct ublk_io *io = &ubq->ios[i];
2725
2726 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2727 __ublk_fail_req(ub, io, io->req);
2728 }
2729
2730 if (ublk_support_batch_io(ubq))
2731 ublk_abort_batch_queue(ub, ubq);
2732 }
2733
2734 static void ublk_start_cancel(struct ublk_device *ub)
2735 {
2736 struct gendisk *disk = ublk_get_disk(ub);
2737
2738 /* The disk is already dead */
2739 if (!disk)
2740 return;
2741
2742 mutex_lock(&ub->cancel_mutex);
2743 if (ub->canceling)
2744 goto out;
2745 /*
2746 * Now we are serialized with ublk_queue_rq()
2747 *
2748 * Make sure that ubq->canceling is set while the queue is quiesced,
2749 * because ublk_queue_rq() relies on this flag to avoid touching a
2750 * completed uring_cmd
2751 */
2752 blk_mq_quiesce_queue(disk->queue);
2753 ublk_set_canceling(ub, true);
2754 blk_mq_unquiesce_queue(disk->queue);
2755 out:
2756 mutex_unlock(&ub->cancel_mutex);
2757 ublk_put_disk(disk);
2758 }
2759
2760 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2761 unsigned int issue_flags)
2762 {
2763 struct ublk_io *io = &ubq->ios[tag];
2764 struct ublk_device *ub = ubq->dev;
2765 struct io_uring_cmd *cmd = NULL;
2766 struct request *req;
2767 bool done;
2768
2769 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2770 return;
2771
2772 /*
2773 * Don't try to cancel this command if the request is started, to
2774 * avoid a race between io_uring_cmd_done() and
2775 * io_uring_cmd_complete_in_task().
2776 *
2777 * Either the started request will be aborted via __ublk_abort_rq(),
2778 * then this uring_cmd is canceled next time, or it will be done in
2779 * the task work function ublk_dispatch_req(), because io_uring
2780 * guarantees that ublk_dispatch_req() is always called.
2781 */
2782 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2783 if (req && blk_mq_request_started(req) && req->tag == tag)
2784 return;
2785
2786 spin_lock(&ubq->cancel_lock);
2787 done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2788 if (!done) {
2789 io->flags |= UBLK_IO_FLAG_CANCELED;
2790 cmd = io->cmd;
2791 io->cmd = NULL;
2792 }
2793 spin_unlock(&ubq->cancel_lock);
2794
2795 if (!done && cmd)
2796 io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, issue_flags);
2797 }
2798
2799 /*
2800 * Cancel a batch fetch command if it hasn't been claimed by another path.
2801 *
2802 * An fcmd can only be cancelled if:
2803 * 1. It's not the active_fcmd (which is currently being processed)
2804 * 2. It's still on the list (!list_empty check) - once removed from the list,
2805 * the fcmd is considered claimed and will be freed by whoever removed it
2806 *
2807 * Use list_del_init() so subsequent list_empty() checks work correctly.
2808 */
2809 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2810 struct ublk_batch_fetch_cmd *fcmd,
2811 unsigned int issue_flags)
2812 {
2813 bool done;
2814
2815 spin_lock(&ubq->evts_lock);
2816 done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2817 if (done)
2818 list_del_init(&fcmd->node);
2819 spin_unlock(&ubq->evts_lock);
2820
2821 if (done) {
2822 io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2823 ublk_batch_free_fcmd(fcmd);
2824 }
2825 }
2826
2827 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2828 {
2829 struct ublk_batch_fetch_cmd *fcmd;
2830 LIST_HEAD(fcmd_list);
2831
2832 spin_lock(&ubq->evts_lock);
2833 ubq->force_abort = true;
2834 list_splice_init(&ubq->fcmd_head, &fcmd_list);
2835 fcmd = READ_ONCE(ubq->active_fcmd);
2836 if (fcmd)
2837 list_move(&fcmd->node, &ubq->fcmd_head);
2838 spin_unlock(&ubq->evts_lock);
2839
2840 while (!list_empty(&fcmd_list)) {
2841 fcmd = list_first_entry(&fcmd_list,
2842 struct ublk_batch_fetch_cmd, node);
2843 ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2844 }
2845 }
2846
2847 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2848 unsigned int issue_flags)
2849 {
2850 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2851 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2852 struct ublk_queue *ubq = pdu->ubq;
2853
2854 ublk_start_cancel(ubq->dev);
2855
2856 ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2857 }
2858
2859 /*
2860 * The ublk char device won't be closed when calling cancel fn, so both
2861 * ublk device and queue are guaranteed to be live
2862 *
2863 * Two-stage cancel:
2864 *
2865 * - make every active uring_cmd done in ->cancel_fn()
2866 *
2867 * - aborting inflight ublk IO requests in the ublk char device release
2868 * handler, which depends on the 1st stage because the device can only
2869 * be closed after all uring_cmds are done
2870 *
2871 * Do _not_ try to acquire ub->mutex before all inflight requests are
2872 * aborted, otherwise deadlock may be caused.
2873 */
2874 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2875 unsigned int issue_flags)
2876 {
2877 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2878 struct ublk_queue *ubq = pdu->ubq;
2879 struct task_struct *task;
2880 struct ublk_io *io;
2881
2882 if (WARN_ON_ONCE(!ubq))
2883 return;
2884
2885 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2886 return;
2887
2888 task = io_uring_cmd_get_task(cmd);
2889 io = &ubq->ios[pdu->tag];
2890 if (WARN_ON_ONCE(task && task != io->task))
2891 return;
2892
2893 ublk_start_cancel(ubq->dev);
2894
2895 WARN_ON_ONCE(io->cmd != cmd);
2896 ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2897 }
2898
2899 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2900 {
2901 return ubq->nr_io_ready == ubq->q_depth;
2902 }
2903
2904 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2905 {
2906 return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2907 }
2908
2909 static void ublk_cancel_queue(struct ublk_queue *ubq)
2910 {
2911 int i;
2912
2913 if (ublk_support_batch_io(ubq)) {
2914 ublk_batch_cancel_queue(ubq);
2915 return;
2916 }
2917
2918 for (i = 0; i < ubq->q_depth; i++)
2919 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2920 }
2921
2922 /* Cancel all pending commands, must be called after del_gendisk() returns */
2923 static void ublk_cancel_dev(struct ublk_device *ub)
2924 {
2925 int i;
2926
2927 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2928 ublk_cancel_queue(ublk_get_queue(ub, i));
2929 }
2930
2931 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2932 {
2933 bool *idle = data;
2934
2935 if (blk_mq_request_started(rq)) {
2936 *idle = false;
2937 return false;
2938 }
2939 return true;
2940 }
2941
2942 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2943 {
2944 bool idle;
2945
2946 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2947 while (true) {
2948 idle = true;
2949 blk_mq_tagset_busy_iter(&ub->tag_set,
2950 ublk_check_inflight_rq, &idle);
2951 if (idle)
2952 break;
2953 msleep(UBLK_REQUEUE_DELAY_MS);
2954 }
2955 }
2956
2957 static void ublk_force_abort_dev(struct ublk_device *ub)
2958 {
2959 int i;
2960
2961 pr_devel("%s: force abort ub: dev_id %d state %s\n",
2962 __func__, ub->dev_info.dev_id,
2963 ub->dev_info.state == UBLK_S_DEV_LIVE ?
2964 "LIVE" : "QUIESCED");
2965 blk_mq_quiesce_queue(ub->ub_disk->queue);
2966 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2967 ublk_wait_tagset_rqs_idle(ub);
2968
2969 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2970 ublk_get_queue(ub, i)->force_abort = true;
2971 blk_mq_unquiesce_queue(ub->ub_disk->queue);
2972 /* We may have requeued some rqs in ublk_quiesce_queue() */
2973 blk_mq_kick_requeue_list(ub->ub_disk->queue);
2974 }
2975
2976 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2977 {
2978 struct gendisk *disk;
2979
2980 /* Sync with ublk_abort_queue() by holding the lock */
2981 spin_lock(&ub->lock);
2982 disk = ub->ub_disk;
2983 ub->dev_info.state = UBLK_S_DEV_DEAD;
2984 ub->dev_info.ublksrv_pid = -1;
2985 ub->ub_disk = NULL;
2986 spin_unlock(&ub->lock);
2987
2988 return disk;
2989 }
2990
2991 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2992 __must_hold(&ub->mutex)
2993 {
2994 struct gendisk *disk;
2995
2996 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2997 return;
2998
2999 if (ublk_nosrv_dev_should_queue_io(ub))
3000 ublk_force_abort_dev(ub);
3001 del_gendisk(ub->ub_disk);
3002 disk = ublk_detach_disk(ub);
3003 put_disk(disk);
3004 }
3005
3006 static void ublk_stop_dev(struct ublk_device *ub)
3007 {
3008 mutex_lock(&ub->mutex);
3009 ublk_stop_dev_unlocked(ub);
3010 mutex_unlock(&ub->mutex);
3011 cancel_work_sync(&ub->partition_scan_work);
3012 ublk_cancel_dev(ub);
3013 }
3014
3015 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
3016 {
3017 /* UBLK_IO_FLAG_CANCELED can be cleared now */
3018 spin_lock(&ubq->cancel_lock);
3019 io->flags &= ~UBLK_IO_FLAG_CANCELED;
3020 spin_unlock(&ubq->cancel_lock);
3021 }
3022
3023 /* reset per-queue io flags */
3024 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
3025 {
3026 spin_lock(&ubq->cancel_lock);
3027 ubq->canceling = false;
3028 spin_unlock(&ubq->cancel_lock);
3029 ubq->fail_io = false;
3030 }
3031
3032 /* device can only be started after all IOs are ready */
3033 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
3034 struct ublk_io *io)
3035 __must_hold(&ub->mutex)
3036 {
3037 struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
3038
3039 if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
3040 ub->unprivileged_daemons = true;
3041
3042 ubq->nr_io_ready++;
3043 ublk_reset_io_flags(ubq, io);
3044
3045 /* Check if this specific queue is now fully ready */
3046 if (ublk_queue_ready(ubq)) {
3047 ub->nr_queue_ready++;
3048
3049 /*
3050 * Reset queue flags as soon as this queue is ready.
3051 * This clears the canceling flag, allowing batch FETCH commands
3052 * to succeed during recovery without waiting for all queues.
3053 */
3054 ublk_queue_reset_io_flags(ubq);
3055 }
3056
3057 /* Check if all queues are ready */
3058 if (ublk_dev_ready(ub)) {
3059 /*
3060 * All queues ready - clear device-level canceling flag
3061 * and complete the recovery/initialization.
3062 */
3063 mutex_lock(&ub->cancel_mutex);
3064 ub->canceling = false;
3065 mutex_unlock(&ub->cancel_mutex);
3066 complete_all(&ub->completion);
3067 }
3068 }
3069
3070 static inline int ublk_check_cmd_op(u32 cmd_op)
3071 {
3072 u32 ioc_type = _IOC_TYPE(cmd_op);
3073
3074 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3075 return -EOPNOTSUPP;
3076
3077 if (ioc_type != 'u' && ioc_type != 0)
3078 return -EOPNOTSUPP;
3079
3080 return 0;
3081 }
3082
3083 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3084 {
3085 struct ublk_auto_buf_reg buf;
3086
3087 buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3088
3089 if (buf.reserved0 || buf.reserved1)
3090 return -EINVAL;
3091
3092 if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3093 return -EINVAL;
3094 io->buf.auto_reg = buf;
3095 return 0;
3096 }
3097
3098 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3099 struct io_uring_cmd *cmd,
3100 u16 *buf_idx)
3101 {
3102 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3103 io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3104
3105 /*
3106 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3107 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
3108 * `io_ring_ctx`.
3109 *
3110 * If this uring_cmd's io_ring_ctx isn't the same as the
3111 * one used for registering the buffer, it is the ublk
3112 * server's responsibility to unregister the buffer;
3113 * otherwise this ublk request gets stuck.
3114 */
3115 if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3116 *buf_idx = io->buf.auto_reg.index;
3117 }
3118 }
3119
3120 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3121 struct io_uring_cmd *cmd,
3122 u16 *buf_idx)
3123 {
3124 ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3125 return ublk_set_auto_buf_reg(io, cmd);
3126 }
3127
3128 /* Once we return, `io->req` can't be used any more */
3129 static inline struct request *
3130 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3131 {
3132 struct request *req = io->req;
3133
3134 io->cmd = cmd;
3135 io->flags |= UBLK_IO_FLAG_ACTIVE;
3136 /* now this cmd slot is owned by ublk driver */
3137 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3138
3139 return req;
3140 }
3141
3142 static inline int
3143 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3144 struct io_uring_cmd *cmd, unsigned long buf_addr,
3145 u16 *buf_idx)
3146 {
3147 if (ublk_dev_support_auto_buf_reg(ub))
3148 return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3149
3150 io->buf.addr = buf_addr;
3151 return 0;
3152 }
3153
3154 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3155 unsigned int issue_flags,
3156 struct ublk_queue *ubq, unsigned int tag)
3157 {
3158 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3159
3160 /*
3161 * Safe to refer to @ubq since the ublk_queue won't be freed until
3162 * all its commands are completed
3163 */
3164 pdu->ubq = ubq;
3165 pdu->tag = tag;
3166 io_uring_cmd_mark_cancelable(cmd, issue_flags);
3167 }
3168
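/* io_uring release callback for registered bvec buffers: drop the ref taken at registration */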
3169 static void ublk_io_release(void *priv)
3170 {
3171 struct request *rq = priv;
3172 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3173 struct ublk_io *io = &ubq->ios[rq->tag];
3174
3175 /*
3176 * task_registered_buffers may be 0 if buffers were registered off-task
3177 * but unregistered on-task, or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3178 */
3179 if (current == io->task && io->task_registered_buffers)
3180 io->task_registered_buffers--;
3181 else
3182 ublk_put_req_ref(io, rq);
3183 }
3184
3185 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3186 struct ublk_device *ub,
3187 u16 q_id, u16 tag,
3188 struct ublk_io *io,
3189 unsigned int index, unsigned int issue_flags)
3190 {
3191 struct request *req;
3192 int ret;
3193
3194 if (!ublk_dev_support_zero_copy(ub))
3195 return -EINVAL;
3196
3197 req = __ublk_check_and_get_req(ub, q_id, tag, io);
3198 if (!req)
3199 return -EINVAL;
3200
3201 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3202 issue_flags);
3203 if (ret) {
3204 ublk_put_req_ref(io, req);
3205 return ret;
3206 }
3207
3208 return 0;
3209 }
3210
3211 static int
3212 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3213 struct ublk_device *ub,
3214 u16 q_id, u16 tag, struct ublk_io *io,
3215 unsigned index, unsigned issue_flags)
3216 {
3217 unsigned new_registered_buffers;
3218 struct request *req = io->req;
3219 int ret;
3220
3221 /*
3222 * Ensure there are still references for ublk_sub_req_ref() to release.
3223 * If not, fall back on the thread-safe buffer registration.
3224 */
3225 new_registered_buffers = io->task_registered_buffers + 1;
3226 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3227 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3228 issue_flags);
3229
3230 if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3231 return -EINVAL;
3232
3233 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3234 issue_flags);
3235 if (ret)
3236 return ret;
3237
3238 io->task_registered_buffers = new_registered_buffers;
3239 return 0;
3240 }
3241
3242 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3243 const struct ublk_device *ub,
3244 unsigned int index, unsigned int issue_flags)
3245 {
3246 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3247 return -EINVAL;
3248
3249 return io_buffer_unregister_bvec(cmd, index, issue_flags);
3250 }
3251
3252 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3253 {
3254 if (ublk_dev_need_map_io(ub)) {
3255 /*
3256 * FETCH_RQ has to provide an IO buffer if NEED_GET_DATA
3257 * is not enabled
3258 */
3259 if (!buf_addr && !ublk_dev_need_get_data(ub))
3260 return -EINVAL;
3261 } else if (buf_addr) {
3262 /* User copy requires addr to be unset */
3263 return -EINVAL;
3264 }
3265 return 0;
3266 }
3267
3268 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3269 struct ublk_io *io, u16 q_id)
3270 {
3271 /* UBLK_IO_FETCH_REQ is only allowed before the dev is set up */
3272 if (ublk_dev_ready(ub))
3273 return -EBUSY;
3274
3275 /* allow each command to be FETCHed at most once */
3276 if (io->flags & UBLK_IO_FLAG_ACTIVE)
3277 return -EINVAL;
3278
3279 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3280
3281 ublk_fill_io_cmd(io, cmd);
3282
3283 if (ublk_dev_support_batch_io(ub))
3284 WRITE_ONCE(io->task, NULL);
3285 else
3286 WRITE_ONCE(io->task, get_task_struct(current));
3287
3288 return 0;
3289 }
3290
3291 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3292 struct ublk_io *io, __u64 buf_addr, u16 q_id)
3293 {
3294 int ret;
3295
3296 /*
3297 * When handling FETCH command for setting up ublk uring queue,
3298 * ub->mutex is the innermost lock, and we won't block for handling
3299 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3300 */
3301 mutex_lock(&ub->mutex);
3302 ret = __ublk_fetch(cmd, ub, io, q_id);
3303 if (!ret)
3304 ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3305 if (!ret)
3306 ublk_mark_io_ready(ub, q_id, io);
3307 mutex_unlock(&ub->mutex);
3308 return ret;
3309 }
3310
3311 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3312 struct ublk_io *io, __u64 buf_addr)
3313 {
3314 struct request *req = io->req;
3315
3316 if (ublk_dev_need_map_io(ub)) {
3317 /*
3318 * COMMIT_AND_FETCH_REQ has to provide an IO buffer if
3319 * NEED_GET_DATA is not enabled or it is a Read IO.
3320 */
3321 if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3322 req_op(req) == REQ_OP_READ))
3323 return -EINVAL;
3324 } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3325 /*
3326 * User copy requires addr to be unset when command is
3327 * not zone append
3328 */
3329 return -EINVAL;
3330 }
3331
3332 return 0;
3333 }
3334
3335 static bool ublk_need_complete_req(const struct ublk_device *ub,
3336 struct ublk_io *io)
3337 {
3338 if (ublk_dev_need_req_ref(ub))
3339 return ublk_sub_req_ref(io);
3340 return true;
3341 }
3342
3343 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3344 struct request *req)
3345 {
3346 /*
3347 * We have handled UBLK_IO_NEED_GET_DATA command,
3348 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3349 * do the copy work.
3350 */
3351 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3352 /* update iod->addr because ublksrv may have passed a new io buffer */
3353 ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3354 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3355 __func__, ubq->q_id, req->tag, io->flags,
3356 ublk_get_iod(ubq, req->tag)->addr);
3357
3358 return ublk_start_io(ubq, req, io);
3359 }
3360
3361 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3362 unsigned int issue_flags)
3363 {
3364 /* May point to userspace-mapped memory */
3365 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3366 struct ublksrv_io_cmd);
3367 u16 buf_idx = UBLK_INVALID_BUF_IDX;
3368 struct ublk_device *ub = cmd->file->private_data;
3369 struct ublk_queue *ubq;
3370 struct ublk_io *io = NULL;
3371 u32 cmd_op = cmd->cmd_op;
3372 u16 q_id = READ_ONCE(ub_src->q_id);
3373 u16 tag = READ_ONCE(ub_src->tag);
3374 s32 result = READ_ONCE(ub_src->result);
3375 u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3376 struct request *req;
3377 int ret;
3378 bool compl;
3379
3380 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3381
3382 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3383 __func__, cmd->cmd_op, q_id, tag, result);
3384
3385 ret = ublk_check_cmd_op(cmd_op);
3386 if (ret)
3387 goto out;
3388
3389 /*
3390 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3391 * so no need to validate the q_id, tag, or task
3392 */
3393 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3394 return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3395
3396 ret = -EINVAL;
3397 if (q_id >= ub->dev_info.nr_hw_queues)
3398 goto out;
3399
3400 ubq = ublk_get_queue(ub, q_id);
3401
3402 if (tag >= ub->dev_info.queue_depth)
3403 goto out;
3404
3405 io = &ubq->ios[tag];
3406 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3407 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3408 ret = ublk_check_fetch_buf(ub, addr);
3409 if (ret)
3410 goto out;
3411 ret = ublk_fetch(cmd, ub, io, addr, q_id);
3412 if (ret)
3413 goto out;
3414
3415 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3416 return -EIOCBQUEUED;
3417 }
3418
3419 if (READ_ONCE(io->task) != current) {
3420 /*
3421 * ublk_register_io_buf() accesses only the io's refcount,
3422 * so can be handled on any task
3423 */
3424 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3425 return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3426 addr, issue_flags);
3427
3428 goto out;
3429 }
3430
3431 /* there is pending io cmd, something must be wrong */
3432 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3433 ret = -EBUSY;
3434 goto out;
3435 }
3436
3437 /*
3438 * Ensure that the user issues UBLK_IO_NEED_GET_DATA
3439 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
3440 */
3441 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3442 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3443 goto out;
3444
3445 switch (_IOC_NR(cmd_op)) {
3446 case UBLK_IO_REGISTER_IO_BUF:
3447 return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3448 issue_flags);
3449 case UBLK_IO_COMMIT_AND_FETCH_REQ:
3450 ret = ublk_check_commit_and_fetch(ub, io, addr);
3451 if (ret)
3452 goto out;
3453 io->res = result;
3454 req = ublk_fill_io_cmd(io, cmd);
3455 ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3456 if (buf_idx != UBLK_INVALID_BUF_IDX)
3457 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3458 compl = ublk_need_complete_req(ub, io);
3459
3460 if (req_op(req) == REQ_OP_ZONE_APPEND)
3461 req->__sector = addr;
3462 if (compl)
3463 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3464
3465 if (ret)
3466 goto out;
3467 break;
3468 case UBLK_IO_NEED_GET_DATA:
3469 /*
3470 * ublk_get_data() may fail and fallback to requeue, so keep
3471 * uring_cmd active first and prepare for handling new requeued
3472 * request
3473 */
3474 req = ublk_fill_io_cmd(io, cmd);
3475 ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3476 WARN_ON_ONCE(ret);
3477 if (likely(ublk_get_data(ubq, io, req))) {
3478 __ublk_prep_compl_io_cmd(io, req);
3479 return UBLK_IO_RES_OK;
3480 }
3481 break;
3482 default:
3483 goto out;
3484 }
3485 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3486 return -EIOCBQUEUED;
3487
3488 out:
3489 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3490 __func__, cmd_op, tag, ret, io ? io->flags : 0);
3491 return ret;
3492 }
3493
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		u16 q_id, u16 tag, struct ublk_io *io)
{
	struct request *req;

	/*
	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
	 * which would overwrite it with io->cmd
	 */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
	if (!req)
		return NULL;

	if (!ublk_get_req_ref(io))
		return NULL;

	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
		goto fail_put;

	if (!ublk_rq_has_data(req))
		goto fail_put;

	return req;
fail_put:
	ublk_put_req_ref(io, req);
	return NULL;
}

static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	int ret = -ECANCELED;

	if (!tw.cancel)
		ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
	if (ret != -EIOCBQUEUED)
		io_uring_cmd_done(cmd, ret, issue_flags);
}

static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
		return 0;
	}

	/* a well-implemented server won't run into the unlocked path */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
		return -EIOCBQUEUED;
	}

	return ublk_ch_uring_cmd_local(cmd, issue_flags);
}

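/*
 * Sketch of how a server completes one IO and re-arms the fetch in a
 * single SQE, matching the fields read by ublk_ch_uring_cmd_local().
 * This is illustrative only; 'sqe', 'q_id', 'tag' and 'res' are
 * hypothetical server-side variables:
 *
 *	struct ublksrv_io_cmd *uc = (struct ublksrv_io_cmd *)sqe->cmd;
 *
 *	sqe->cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
 *	uc->q_id = q_id;
 *	uc->tag = tag;
 *	uc->result = res;	// bytes handled or -errno
 *	uc->addr = addr;	// io buffer, or zone append LBA result
 *
 * The command stays queued (-EIOCBQUEUED) until the next request for
 * this tag arrives, at which point it completes carrying the new IO.
 */
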
static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
		const struct ublk_elem_header *elem)
{
	const void *buf = elem;

	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
		return *(const __u64 *)(buf + sizeof(*elem));
	return 0;
}

static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
		const struct ublk_elem_header *elem)
{
	const void *buf = elem;

	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
		return *(const __u64 *)(buf + sizeof(*elem) +
				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
	return -1;
}

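/*
 * Layout of one batch element implied by the two helpers above (sketch,
 * assuming both optional fields are enabled):
 *
 *	offset 0:                  struct ublk_elem_header
 *	offset sizeof(header):     __u64 buf addr  (UBLK_BATCH_F_HAS_BUF_ADDR)
 *	offset sizeof(header) + 8: __u64 zone LBA  (UBLK_BATCH_F_HAS_ZONE_LBA)
 *
 * When UBLK_BATCH_F_HAS_BUF_ADDR is absent, the zone LBA moves up by 8
 * bytes, which is what the '8 * !!(...)' term accounts for.
 */
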
static struct ublk_auto_buf_reg
ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
		const struct ublk_elem_header *elem)
{
	struct ublk_auto_buf_reg reg = {
		.index = elem->buf_index,
		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
			UBLK_AUTO_BUF_REG_FALLBACK : 0,
	};

	return reg;
}

/*
 * 48 can hold any type of buffer element (8, 16 and 24 bytes) because
 * it is the least common multiple (LCM) of 8, 16 and 24
 */
#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10)
struct ublk_batch_io_iter {
	void __user *uaddr;
	unsigned done, total;
	unsigned char elem_bytes;
	/* copy to this buffer from user space */
	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
};

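/*
 * Worked example of the sizing above: the temporary buffer is 480 bytes,
 * and 480 is a multiple of every element size, so each copy_from_user()
 * chunk holds a whole number of elements: 60 x 8-byte, 30 x 16-byte or
 * 20 x 24-byte elements. __ublk_walk_cmd_buf() therefore never sees a
 * truncated element at a chunk boundary.
 */
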
static inline int
__ublk_walk_cmd_buf(struct ublk_queue *ubq,
		struct ublk_batch_io_iter *iter,
		const struct ublk_batch_io_data *data,
		unsigned bytes,
		int (*cb)(struct ublk_queue *q,
			const struct ublk_batch_io_data *data,
			const struct ublk_elem_header *elem))
{
	unsigned int i;
	int ret = 0;

	for (i = 0; i < bytes; i += iter->elem_bytes) {
		const struct ublk_elem_header *elem =
			(const struct ublk_elem_header *)&iter->buf[i];

		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
			ret = -EINVAL;
			break;
		}

		ret = cb(ubq, data, elem);
		if (unlikely(ret))
			break;
	}

	iter->done += i;
	return ret;
}

static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
		const struct ublk_batch_io_data *data,
		int (*cb)(struct ublk_queue *q,
			const struct ublk_batch_io_data *data,
			const struct ublk_elem_header *elem))
{
	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
	int ret = 0;

	while (iter->done < iter->total) {
		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);

		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
			pr_warn("ublk%d: read batch cmd buffer failed\n",
					data->ub->dev_info.dev_id);
			return -EFAULT;
		}

		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
		if (ret)
			return ret;
	}
	return 0;
}

static int ublk_batch_unprep_io(struct ublk_queue *ubq,
		const struct ublk_batch_io_data *data,
		const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];

	/*
	 * If the queue was ready before this decrement, it won't be any
	 * more, so decrement the device's queue-ready count and restore
	 * the canceling flag to prevent new requests from being queued.
	 */
	if (ublk_queue_ready(ubq)) {
		data->ub->nr_queue_ready--;
		spin_lock(&ubq->cancel_lock);
		ubq->canceling = true;
		spin_unlock(&ubq->cancel_lock);
	}
	ubq->nr_io_ready--;

	ublk_io_lock(io);
	io->flags = 0;
	ublk_io_unlock(io);
	return 0;
}

static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
		const struct ublk_batch_io_data *data)
{
	int ret;

	/* Re-walk only what we've already processed, starting from the beginning */
	iter->total = iter->done;
	iter->done = 0;

	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
	WARN_ON_ONCE(ret);
}

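/*
 * Example of the revert above: if a PREP_IO_CMDS covers 8 elements and
 * ublk_batch_prep_io() fails on the 6th, iter->done holds the bytes of
 * the 5 elements already prepared. Rewinding with total = done and
 * done = 0 replays exactly those 5 elements through
 * ublk_batch_unprep_io(), so a failed prep leaves no io marked ready.
 */
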
static int ublk_batch_prep_io(struct ublk_queue *ubq,
		const struct ublk_batch_io_data *data,
		const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];
	const struct ublk_batch_io *uc = &data->header;
	union ublk_io_buf buf = { 0 };
	int ret;

	if (ublk_dev_support_auto_buf_reg(data->ub)) {
		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
	} else if (ublk_dev_need_map_io(data->ub)) {
		buf.addr = ublk_batch_buf_addr(uc, elem);

		ret = ublk_check_fetch_buf(data->ub, buf.addr);
		if (ret)
			return ret;
	}

	ublk_io_lock(io);
	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
	if (!ret)
		io->buf = buf;
	ublk_io_unlock(io);

	if (!ret)
		ublk_mark_io_ready(data->ub, ubq->q_id, io);

	return ret;
}

static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
{
	const struct ublk_batch_io *uc = &data->header;
	struct io_uring_cmd *cmd = data->cmd;
	struct ublk_batch_io_iter iter = {
		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
		.total = uc->nr_elem * uc->elem_bytes,
		.elem_bytes = uc->elem_bytes,
	};
	int ret;

	mutex_lock(&data->ub->mutex);
	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);

	if (ret && iter.done)
		ublk_batch_revert_prep_cmd(&iter, data);
	mutex_unlock(&data->ub->mutex);
	return ret;
}

static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
		struct ublk_io *io,
		union ublk_io_buf *buf)
{
	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
		return -EBUSY;

	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
	if (ublk_need_map_io(ubq) && !buf->addr)
		return -EINVAL;
	return 0;
}

static int ublk_batch_commit_io(struct ublk_queue *ubq,
		const struct ublk_batch_io_data *data,
		const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];
	const struct ublk_batch_io *uc = &data->header;
	u16 buf_idx = UBLK_INVALID_BUF_IDX;
	union ublk_io_buf buf = { 0 };
	struct request *req = NULL;
	bool auto_reg = false;
	bool compl = false;
	int ret;

	if (ublk_dev_support_auto_buf_reg(data->ub)) {
		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
		auto_reg = true;
	} else if (ublk_dev_need_map_io(data->ub)) {
		buf.addr = ublk_batch_buf_addr(uc, elem);
	}

	ublk_io_lock(io);
	ret = ublk_batch_commit_io_check(ubq, io, &buf);
	if (!ret) {
		io->res = elem->result;
		io->buf = buf;
		req = ublk_fill_io_cmd(io, data->cmd);

		if (auto_reg)
			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
		compl = ublk_need_complete_req(data->ub, io);
	}
	ublk_io_unlock(io);

	if (unlikely(ret)) {
		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
				__func__, data->ub->dev_info.dev_id, ubq->q_id,
				elem->tag, ret);
		return ret;
	}

	if (buf_idx != UBLK_INVALID_BUF_IDX)
		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
	if (req_op(req) == REQ_OP_ZONE_APPEND)
		req->__sector = ublk_batch_zone_lba(uc, elem);
	if (compl)
		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
	return 0;
}

static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
{
	const struct ublk_batch_io *uc = &data->header;
	struct io_uring_cmd *cmd = data->cmd;
	struct ublk_batch_io_iter iter = {
		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
		.total = uc->nr_elem * uc->elem_bytes,
		.elem_bytes = uc->elem_bytes,
	};
	DEFINE_IO_COMP_BATCH(iob);
	int ret;

	data->iob = &iob;
	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);

	if (iob.complete)
		iob.complete(&iob);

	/* behave like a short write: once anything commits, report bytes done */
	return iter.done == 0 ? ret : iter.done;
}

static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
{
	unsigned elem_bytes = sizeof(struct ublk_elem_header);

	if (uc->flags & ~UBLK_BATCH_F_ALL)
		return -EINVAL;

	/*
	 * UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK implies auto buffer register,
	 * which takes a buffer index, so it conflicts with a caller-provided
	 * buffer address
	 */
	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
	    (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
		return -EINVAL;

	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
	if (uc->elem_bytes != elem_bytes)
		return -EINVAL;
	return 0;
}

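/*
 * Expected elem_bytes for each flag combination checked above, assuming
 * sizeof(struct ublk_elem_header) is 8 per the LCM comment earlier:
 *
 *	flags					elem_bytes
 *	(none)					 8  (header only)
 *	UBLK_BATCH_F_HAS_BUF_ADDR		16  (header + addr)
 *	UBLK_BATCH_F_HAS_ZONE_LBA		16  (header + LBA)
 *	both					24  (header + addr + LBA)
 */
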
static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
{
	const struct ublk_batch_io *uc = &data->header;

	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (uc->nr_elem > data->ub->dev_info.queue_depth)
		return -E2BIG;

	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
	    !ublk_dev_is_zoned(data->ub))
		return -EINVAL;

	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
	    !ublk_dev_need_map_io(data->ub))
		return -EINVAL;

	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
	    !ublk_dev_support_auto_buf_reg(data->ub))
		return -EINVAL;

	return ublk_check_batch_cmd_flags(uc);
}

static int ublk_batch_attach(struct ublk_queue *ubq,
		struct ublk_batch_io_data *data,
		struct ublk_batch_fetch_cmd *fcmd)
{
	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
	bool free = false;
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);

	spin_lock(&ubq->evts_lock);
	if (unlikely(ubq->force_abort || ubq->canceling)) {
		free = true;
	} else {
		list_add_tail(&fcmd->node, &ubq->fcmd_head);
		new_fcmd = __ublk_acquire_fcmd(ubq);
	}
	spin_unlock(&ubq->evts_lock);

	if (unlikely(free)) {
		ublk_batch_free_fcmd(fcmd);
		return -ENODEV;
	}

	pdu->ubq = ubq;
	pdu->fcmd = fcmd;
	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);

	if (!new_fcmd)
		goto out;

	/*
	 * If the two fetch commands originate from the same io_ring_ctx,
	 * run batch dispatch directly. Otherwise, schedule task work to
	 * do it.
	 */
	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
	    io_uring_cmd_ctx_handle(fcmd->cmd)) {
		data->cmd = new_fcmd->cmd;
		ublk_batch_dispatch(ubq, data, new_fcmd);
	} else {
		io_uring_cmd_complete_in_task(new_fcmd->cmd,
				ublk_batch_tw_cb);
	}
out:
	return -EIOCBQUEUED;
}

static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
{
	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);

	if (!fcmd)
		return -ENOMEM;

	return ublk_batch_attach(ubq, data, fcmd);
}

static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
{
	const struct ublk_batch_io *uc = &data->header;

	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
		return -EINVAL;

	if (uc->elem_bytes != sizeof(__u16))
		return -EINVAL;

	if (uc->flags != 0)
		return -EINVAL;

	return 0;
}

static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
			struct ublksrv_io_cmd);
	struct ublk_device *ub = cmd->file->private_data;
	unsigned tag = READ_ONCE(ub_cmd->tag);
	unsigned q_id = READ_ONCE(ub_cmd->q_id);
	unsigned index = READ_ONCE(ub_cmd->addr);
	struct ublk_queue *ubq;
	struct ublk_io *io;

	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);

	if (q_id >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (tag >= ub->dev_info.queue_depth)
		return -EINVAL;

	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
		return -EOPNOTSUPP;

	ubq = ublk_get_queue(ub, q_id);
	io = &ubq->ios[tag];
	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
			issue_flags);
}

static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
			struct ublk_batch_io);
	struct ublk_device *ub = cmd->file->private_data;
	struct ublk_batch_io_data data = {
		.ub = ub,
		.cmd = cmd,
		.header = (struct ublk_batch_io) {
			.q_id = READ_ONCE(uc->q_id),
			.flags = READ_ONCE(uc->flags),
			.nr_elem = READ_ONCE(uc->nr_elem),
			.elem_bytes = READ_ONCE(uc->elem_bytes),
		},
		.issue_flags = issue_flags,
	};
	u32 cmd_op = cmd->cmd_op;
	int ret = -EINVAL;

	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
		ublk_batch_cancel_fn(cmd, issue_flags);
		return 0;
	}

	switch (cmd_op) {
	case UBLK_U_IO_PREP_IO_CMDS:
		ret = ublk_check_batch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_prep_cmd(&data);
		break;
	case UBLK_U_IO_COMMIT_IO_CMDS:
		ret = ublk_check_batch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_commit_cmd(&data);
		break;
	case UBLK_U_IO_FETCH_IO_CMDS:
		ret = ublk_validate_batch_fetch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_fetch_cmd(&data);
		break;
	default:
		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
		break;
	}
out:
	return ret;
}

static inline bool ublk_check_ubuf_dir(const struct request *req,
		int ubuf_dir)
{
	/* copy ubuf to request pages */
	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
	    ubuf_dir == ITER_SOURCE)
		return true;

	/* copy request pages to ubuf */
	if ((req_op(req) == REQ_OP_WRITE ||
	     req_op(req) == REQ_OP_ZONE_APPEND) &&
	    ubuf_dir == ITER_DEST)
		return true;

	return false;
}

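/*
 * The direction rules enforced above, spelled out: for a READ (or
 * DRV_IN) request the server supplies the data, so its buffer must be
 * an iov_iter source; for a WRITE (or ZONE_APPEND) request the server
 * consumes the data, so its buffer must be a destination. Any other
 * pairing means the server used the wrong channel direction and is
 * rejected.
 */
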
static ssize_t
ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
{
	struct ublk_device *ub = iocb->ki_filp->private_data;
	struct ublk_queue *ubq;
	struct request *req;
	struct ublk_io *io;
	unsigned data_len;
	bool is_integrity;
	bool on_daemon;
	size_t buf_off;
	u16 tag, q_id;
	ssize_t ret;

	if (!user_backed_iter(iter))
		return -EACCES;

	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		return -EACCES;

	tag = ublk_pos_to_tag(iocb->ki_pos);
	q_id = ublk_pos_to_hwq(iocb->ki_pos);
	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);

	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
		return -EINVAL;

	if (q_id >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	ubq = ublk_get_queue(ub, q_id);
	if (!ublk_dev_support_user_copy(ub))
		return -EACCES;

	if (tag >= ub->dev_info.queue_depth)
		return -EINVAL;

	io = &ubq->ios[tag];
	on_daemon = current == READ_ONCE(io->task);
	if (on_daemon) {
		/*
		 * On the daemon task the io can't be completed concurrently,
		 * so taking a request reference can be skipped
		 */
		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			return -EINVAL;

		req = io->req;
		if (!ublk_rq_has_data(req))
			return -EINVAL;
	} else {
		req = __ublk_check_and_get_req(ub, q_id, tag, io);
		if (!req)
			return -EINVAL;
	}

	if (is_integrity) {
		struct blk_integrity *bi = &req->q->limits.integrity;

		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
	} else {
		data_len = blk_rq_bytes(req);
	}
	if (buf_off > data_len) {
		ret = -EINVAL;
		goto out;
	}

	if (!ublk_check_ubuf_dir(req, dir)) {
		ret = -EACCES;
		goto out;
	}

	if (is_integrity)
		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
	else
		ret = ublk_copy_user_pages(req, buf_off, iter, dir);

out:
	if (!on_daemon)
		ublk_put_req_ref(io, req);
	return ret;
}

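/*
 * Sketch of the server side of user copy, illustrative only. The pos
 * encoding mirrors the kernel-side ublk_pos_to_{hwq,tag,buf_off}()
 * helpers; 'ublk_user_copy_pos()' is a hypothetical helper built from
 * the offset macros in <uapi/linux/ublk_cmd.h>, and 'cdev_fd' is the
 * server's /dev/ublkcN file descriptor:
 *
 *	// fill the data pages of a READ request with 'len' bytes
 *	off_t pos = ublk_user_copy_pos(q_id, tag, 0);
 *	ssize_t done = pwrite(cdev_fd, buf, len, pos);
 *
 *	// fetch the payload of a WRITE request instead
 *	done = pread(cdev_fd, buf, len, pos);
 *
 * With UBLK_F_INTEGRITY the same calls target the integrity buffer by
 * setting UBLKSRV_IO_INTEGRITY_FLAG in pos, as decoded above.
 */
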
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	return ublk_user_copy(iocb, to, ITER_DEST);
}

static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return ublk_user_copy(iocb, from, ITER_SOURCE);
}

static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static const struct file_operations ublk_ch_batch_io_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_batch_io_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int size, i;

	size = ublk_queue_cmd_buf_size(ub);

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		if (io->task)
			put_task_struct(io->task);
		WARN_ON_ONCE(refcount_read(&io->ref));
		WARN_ON_ONCE(io->task_registered_buffers);
	}

	if (ubq->io_cmd_buf)
		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));

	if (ublk_dev_support_batch_io(ub))
		ublk_io_evts_deinit(ubq);

	kvfree(ubq);
}

static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ub->queues[q_id];

	if (!ubq)
		return;

	__ublk_deinit_queue(ub, ubq);
	ub->queues[q_id] = NULL;
}

static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
{
	unsigned int cpu;

	/* Find first CPU mapped to this queue */
	for_each_possible_cpu(cpu) {
		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
			return cpu_to_node(cpu);
	}

	return NUMA_NO_NODE;
}

static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
	int depth = ub->dev_info.queue_depth;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
	struct ublk_queue *ubq;
	struct page *page;
	int numa_node;
	int size, i, ret;

	/* Determine NUMA node based on queue's CPU affinity */
	numa_node = ublk_get_queue_numa_node(ub, q_id);

	/* Allocate queue structure on local NUMA node */
	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
			numa_node);
	if (!ubq)
		return -ENOMEM;

	spin_lock_init(&ubq->cancel_lock);
	ubq->flags = ub->dev_info.flags;
	ubq->q_id = q_id;
	ubq->q_depth = depth;
	size = ublk_queue_cmd_buf_size(ub);

	/* Allocate I/O command buffer on local NUMA node */
	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
	if (!page) {
		kvfree(ubq);
		return -ENOMEM;
	}
	ubq->io_cmd_buf = page_address(page);

	for (i = 0; i < ubq->q_depth; i++)
		spin_lock_init(&ubq->ios[i].lock);

	if (ublk_dev_support_batch_io(ub)) {
		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
		if (ret)
			goto fail;
		INIT_LIST_HEAD(&ubq->fcmd_head);
	}
	ub->queues[q_id] = ubq;
	ubq->dev = ub;

	return 0;
fail:
	__ublk_deinit_queue(ub, ubq);
	return ret;
}

static void ublk_deinit_queues(struct ublk_device *ub)
{
	int i;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_deinit_queue(ub, i);
}

static int ublk_init_queues(struct ublk_device *ub)
{
	int i, ret;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
		ret = ublk_init_queue(ub, i);
		if (ret)
			goto fail;
	}

	init_completion(&ub->completion);
	return 0;

fail:
	ublk_deinit_queues(ub);
	return ret;
}

static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
{
	int i = idx;
	int err;

	spin_lock(&ublk_idr_lock);
	/* allocate id; if @idx >= 0, we're requesting that specific id */
	if (i >= 0) {
		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
				GFP_NOWAIT);
	}
	spin_unlock(&ublk_idr_lock);

	if (err >= 0)
		ub->ub_number = err;

	return err;
}

static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}

static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	ublk_buf_cleanup(ub);
	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
}

static int ublk_add_chdev(struct ublk_device *ub)
{
	struct device *dev = &ub->cdev_dev;
	int minor = ub->ub_number;
	int ret;

	dev->parent = ublk_misc.this_device;
	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
	dev->class = &ublk_chr_class;
	dev->release = ublk_cdev_rel;
	device_initialize(dev);

	ret = dev_set_name(dev, "ublkc%d", minor);
	if (ret)
		goto fail;

	if (ublk_dev_support_batch_io(ub))
		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
	else
		cdev_init(&ub->cdev, &ublk_ch_fops);
	ret = cdev_device_add(&ub->cdev, dev);
	if (ret)
		goto fail;

	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
		unprivileged_ublks_added++;
	return 0;
fail:
	put_device(dev);
	return ret;
}

/*
 * Align the max io buffer size with PAGE_SIZE: e.g. with 4 KiB pages a
 * requested 1049600 bytes is rounded down to 1048576
 */
static void ublk_align_max_io_size(struct ublk_device *ub)
{
	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;

	ub->dev_info.max_io_buf_bytes =
		round_down(max_io_bytes, PAGE_SIZE);
}

static int ublk_add_tag_set(struct ublk_device *ub)
{
	if (ublk_dev_support_batch_io(ub))
		ub->tag_set.ops = &ublk_batch_mq_ops;
	else
		ub->tag_set.ops = &ublk_mq_ops;
	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
	ub->tag_set.numa_node = NUMA_NO_NODE;
	ub->tag_set.driver_data = ub;
	return blk_mq_alloc_tag_set(&ub->tag_set);
}

static void ublk_remove(struct ublk_device *ub)
{
	bool unprivileged;

	ublk_stop_dev(ub);
	cdev_device_del(&ub->cdev, &ub->cdev_dev);
	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
	ublk_put_device(ub);

	if (unprivileged)
		unprivileged_ublks_added--;
}

static struct ublk_device *ublk_get_device_from_id(int idx)
{
	struct ublk_device *ub = NULL;

	if (idx < 0)
		return NULL;

	spin_lock(&ublk_idr_lock);
	ub = idr_find(&ublk_index_idr, idx);
	if (ub)
		ub = ublk_get_device(ub);
	spin_unlock(&ublk_idr_lock);

	return ub;
}

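/*
 * Note on the translation below: the ublk server passes its pid as seen
 * in its own pid namespace, so find_vpid() resolves that value in the
 * caller's namespace and pid_nr() converts it to the global pid before
 * comparing with the recorded ub->ublksrv_tgid.
 */
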
static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
{
	rcu_read_lock();
	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
	rcu_read_unlock();

	return ub->ublksrv_tgid == ublksrv_pid;
}

static int ublk_ctrl_start_dev(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	const struct ublk_param_basic *p = &ub->params.basic;
	int ublksrv_pid = (int)header->data[0];
	struct queue_limits lim = {
		.logical_block_size = 1 << p->logical_bs_shift,
		.physical_block_size = 1 << p->physical_bs_shift,
		.io_min = 1 << p->io_min_shift,
		.io_opt = 1 << p->io_opt_shift,
		.max_hw_sectors = p->max_sectors,
		.chunk_sectors = p->chunk_sectors,
		.virt_boundary_mask = p->virt_boundary_mask,
		.max_segments = USHRT_MAX,
		.max_segment_size = UINT_MAX,
		.dma_alignment = 3,
	};
	struct gendisk *disk;
	int ret = -EINVAL;

	if (ublksrv_pid <= 0)
		return -EINVAL;
	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *pd = &ub->params.discard;

		lim.discard_alignment = pd->discard_alignment;
		lim.discard_granularity = pd->discard_granularity;
		lim.max_hw_discard_sectors = pd->max_discard_sectors;
		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
		lim.max_discard_segments = pd->max_discard_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
		const struct ublk_param_zoned *p = &ub->params.zoned;

		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
			return -EOPNOTSUPP;

		lim.features |= BLK_FEAT_ZONED;
		lim.max_active_zones = p->max_active_zones;
		lim.max_open_zones = p->max_open_zones;
		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
	}

	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
		lim.features |= BLK_FEAT_WRITE_CACHE;
		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
			lim.features |= BLK_FEAT_FUA;
	}

	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
		lim.features |= BLK_FEAT_ROTATIONAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
		lim.dma_alignment = ub->params.dma.alignment;

	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
		lim.max_segment_size = ub->params.seg.max_segment_size;
		lim.max_segments = ub->params.seg.max_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
		const struct ublk_param_integrity *p = &ub->params.integrity;
		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);

		lim.max_integrity_segments =
			p->max_integrity_segments ?: USHRT_MAX;
		lim.integrity = (struct blk_integrity) {
			.flags = ublk_integrity_flags(p->flags),
			.csum_type = ublk_integrity_csum_type(p->csum_type),
			.metadata_size = p->metadata_size,
			.pi_offset = p->pi_offset,
			.interval_exp = p->interval_exp,
			.tag_size = p->tag_size,
			.pi_tuple_size = pi_tuple_size,
		};
	}

	if (wait_for_completion_interruptible(&ub->completion) != 0)
		return -EINTR;

	if (!ublk_validate_user_pid(ub, ublksrv_pid))
		return -EINVAL;

	mutex_lock(&ub->mutex);
	/* the device may become not ready again in case of UBLK_F_BATCH_IO */
	if (!ublk_dev_ready(ub)) {
		ret = -EINVAL;
		goto out_unlock;
	}
	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
	    test_bit(UB_STATE_USED, &ub->state)) {
		ret = -EEXIST;
		goto out_unlock;
	}

	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_unlock;
	}
	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
	disk->fops = &ub_fops;
	disk->private_data = ub;

	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
	ub->ub_disk = disk;

	ublk_apply_params(ub);

	/*
	 * Suppress the partition scan to avoid a potential IO hang.
	 *
	 * If a ublk server error occurs during the partition scan, the IO
	 * may wait while holding ub->mutex, which can deadlock with other
	 * operations that need the mutex. So defer the partition scan to
	 * async work.
	 *
	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
	 * permanently.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);

	ublk_get_device(ub);
	ub->dev_info.state = UBLK_S_DEV_LIVE;

	if (ublk_dev_is_zoned(ub)) {
		ret = ublk_revalidate_disk_zones(ub);
		if (ret)
			goto out_put_cdev;
	}

	ret = add_disk(disk);
	if (ret)
		goto out_put_cdev;

	set_bit(UB_STATE_USED, &ub->state);

	/* Skip the partition scan if disabled by the user */
	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
		/* Don't clear it for unprivileged daemons, see the comment above */
		if (!ub->unprivileged_daemons)
			clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
	} else {
		/* Schedule an async partition scan for trusted daemons */
		if (!ub->unprivileged_daemons)
			schedule_work(&ub->partition_scan_work);
	}

out_put_cdev:
	if (ret) {
		ublk_detach_disk(ub);
		ublk_put_device(ub);
		put_disk(disk);
	}
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	cpumask_var_t cpumask;
	unsigned long queue;
	unsigned int retlen;
	unsigned int i;
	int ret;

	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
		return -EINVAL;
	if (header->len & (sizeof(unsigned long)-1))
		return -EINVAL;
	if (!header->addr)
		return -EINVAL;

	queue = header->data[0];
	if (queue >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
		return -ENOMEM;

	for_each_possible_cpu(i) {
		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
			cpumask_set_cpu(i, cpumask);
	}

	ret = -EFAULT;
	retlen = min_t(unsigned short, header->len, cpumask_size());
	if (copy_to_user(argp, cpumask, retlen))
		goto out_free_cpumask;
	if (retlen != header->len &&
	    clear_user(argp + retlen, header->len - retlen))
		goto out_free_cpumask;

	ret = 0;
out_free_cpumask:
	free_cpumask_var(cpumask);
	return ret;
}

static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags);
	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
			info->nr_hw_queues, info->queue_depth);
}

static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
				__func__, header->queue_id);
		return -EINVAL;
	}

	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
		return -EINVAL;

	if (capable(CAP_SYS_ADMIN))
		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
		return -EPERM;

	/* forbid nonsense combinations of recovery flags */
	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
	case 0:
	case UBLK_F_USER_RECOVERY:
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
		break;
	default:
		pr_warn("%s: invalid recovery flags %llx\n", __func__,
				info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
		return -EINVAL;
	}

	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
		return -EINVAL;
	}

	/*
	 * An unprivileged device can't be trusted, but RECOVERY and
	 * RECOVERY_REISSUE may still hang error handling, so the recovery
	 * features can't be supported for unprivileged ublk yet.
	 *
	 * TODO: provide forward progress for the RECOVERY handler, so that
	 * unprivileged devices can benefit from it
	 */
	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
				UBLK_F_USER_RECOVERY);

		/*
		 * For USER_COPY, we depend on userspace to fill the request
		 * buffer by pwrite() to the ublk char device, which can't be
		 * used for unprivileged devices.
		 *
		 * Same with zero copy or auto buffer register.
		 */
		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
					UBLK_F_AUTO_BUF_REG))
			return -EINVAL;
	}

	/* User copy is required to access the integrity buffer */
	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
		return -EINVAL;

	/* the created device is always owned by the current user */
	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);

	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
				__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
		pr_warn("%s: dev id is too large. Max supported is %d\n",
				__func__, UBLK_MAX_UBLKS - 1);
		return -EINVAL;
	}

	ublk_dump_dev_info(&info);

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ret = -EACCES;
	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
	    unprivileged_ublks_added >= unprivileged_ublks_max)
		goto out_unlock;

	ret = -ENOMEM;
	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->lock);
	mutex_init(&ub->cancel_mutex);
	mt_init(&ub->buf_tree);
	ida_init(&ub->buf_ida);
	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);

	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * The 64-bit flags will be copied back to userspace as the feature
	 * negotiation result, so clear flags which the driver doesn't
	 * support yet; then userspace can get the correct flags (features)
	 * to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
		UBLK_F_URING_CMD_COMP_IN_TASK |
		UBLK_F_PER_IO_DAEMON |
		UBLK_F_BUF_REG_OFF_DAEMON |
		UBLK_F_SAFE_STOP_DEV;

	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;

	/* GET_DATA isn't needed anymore with USER_COPY or ZERO_COPY */
	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
				UBLK_F_AUTO_BUF_REG))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/*
	 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
	 * returning write_append_lba, which is only allowed in case of
	 * user copy or zero copy
	 */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_tag_set;

	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_deinit_queues;

	/*
	 * Add the char dev so that the ublksrv daemon can be set up.
	 * ublk_add_chdev() will clean up everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}

static inline bool ublk_idr_freed(int id)
{
	void *ptr;

	spin_lock(&ublk_idr_lock);
	ptr = idr_find(&ublk_index_idr, id);
	spin_unlock(&ublk_idr_lock);

	return ptr == NULL;
}

static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
{
	struct ublk_device *ub = *p_ub;
	int idx = ub->ub_number;
	int ret;

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
		ublk_remove(ub);
		set_bit(UB_STATE_DELETED, &ub->state);
	}

	/* Mark the reference as consumed */
	*p_ub = NULL;
	ublk_put_device(ub);
	mutex_unlock(&ublk_ctl_mutex);

	/*
	 * Wait until the idr entry is removed, so the device number can be
	 * reused once the DEL_DEV command returns.
	 *
	 * If we return early because of a user interrupt, a future delete
	 * command may still come:
	 *
	 * - if the device number isn't freed, this device won't and needn't
	 *   be deleted again, since UB_STATE_DELETED is set, and the device
	 *   will be released after the last reference is dropped
	 *
	 * - if the device number is freed already, we will not find this
	 *   device via ublk_get_device_from_id()
	 */
	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
		return -EINTR;
	return 0;
}

static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
		const struct ublksrv_ctrl_cmd *header)
{
	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}

static void ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
}

static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
{
	struct gendisk *disk;
	int ret = 0;

	disk = ublk_get_disk(ub);
	if (!disk)
		return -ENODEV;

	mutex_lock(&disk->open_mutex);
	if (disk_openers(disk) > 0) {
		ret = -EBUSY;
		goto unlock;
	}
	ub->block_open = true;
	/* release open_mutex as del_gendisk() will reacquire it */
	mutex_unlock(&disk->open_mutex);

	ublk_ctrl_stop_dev(ub);
	goto out;

unlock:
	mutex_unlock(&disk->open_mutex);
out:
	ublk_put_disk(disk);
	return ret;
}

static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	struct task_struct *p;
	struct pid *pid;
	struct ublksrv_ctrl_dev_info dev_info;
	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
	void __user *argp = (void __user *)(unsigned long)header->addr;

	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
		return -EINVAL;

	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
	dev_info.ublksrv_pid = -1;

	if (init_ublksrv_tgid > 0) {
		rcu_read_lock();
		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
		p = pid_task(pid, PIDTYPE_TGID);
		if (p) {
			int vnr = task_tgid_vnr(p);

			if (vnr)
				dev_info.ublksrv_pid = vnr;
		}
		rcu_read_unlock();
	}

	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
		return -EFAULT;

	return 0;
}

/* TYPE_DEVT is read-only, so fill it up before returning to userspace */
static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
{
	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);

	if (ub->ub_disk) {
		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
	} else {
		ub->params.devt.disk_major = 0;
		ub->params.devt.disk_minor = 0;
	}
	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
}

static int ublk_ctrl_get_params(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	int ret;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	mutex_lock(&ub->mutex);
	ublk_ctrl_fill_params_devt(ub);
	if (copy_to_user(argp, &ub->params, ph.len))
		ret = -EFAULT;
	else
		ret = 0;
	mutex_unlock(&ub->mutex);

	return ret;
}

static int ublk_ctrl_set_params(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	int ret = -EFAULT;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len || !ph.types)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	mutex_lock(&ub->mutex);
	if (test_bit(UB_STATE_USED, &ub->state)) {
		/*
		 * Parameters can only be changed when the device hasn't
		 * been started yet
		 */
		ret = -EACCES;
	} else if (copy_from_user(&ub->params, argp, ph.len)) {
		/* zero out the partial copy so no stale params survive */
		memset(&ub->params, 0, sizeof(ub->params));
		ret = -EFAULT;
	} else {
		/* clear all we don't support yet */
		ub->params.types &= UBLK_PARAM_TYPE_ALL;
		ret = ublk_validate_params(ub);
		if (ret)
			memset(&ub->params, 0, sizeof(ub->params));
	}
	mutex_unlock(&ub->mutex);

	return ret;
}

static int ublk_ctrl_start_recovery(struct ublk_device *ub)
{
	int ret = -EINVAL;

	mutex_lock(&ub->mutex);
	if (ublk_nosrv_should_stop_dev(ub))
		goto out_unlock;
	/*
	 * START_RECOVERY is only allowed after:
	 *
	 * (1) UB_STATE_OPEN is not set, which means the dying process has
	 *     exited and the related io_uring ctx is freed, so the file
	 *     struct of /dev/ublkcX is released.
	 *
	 * and one of the following holds
	 *
	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
	 *     (a) has quiesced the request queue
	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
	 *     (c) has requeued/aborted every inflight rq whose io_flags is
	 *         NOT ACTIVE
	 *     (d) has completed/canceled all ioucmds owned by the dying
	 *         process
	 *
	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
	 *     quiesced, but all I/O is being immediately errored
	 */
	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
		ret = -EBUSY;
		goto out_unlock;
	}
	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
	init_completion(&ub->completion);
	ret = 0;
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_end_recovery(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	int ublksrv_pid = (int)header->data[0];
	int ret = -EINVAL;

	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
			header->dev_id);

	if (wait_for_completion_interruptible(&ub->completion))
		return -EINTR;

	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
			header->dev_id);

	if (!ublk_validate_user_pid(ub, ublksrv_pid))
		return -EINVAL;

	mutex_lock(&ub->mutex);
	if (ublk_nosrv_should_stop_dev(ub))
		goto out_unlock;

	if (!ublk_dev_in_recoverable_state(ub)) {
		ret = -EBUSY;
		goto out_unlock;
	}
	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
	ub->dev_info.state = UBLK_S_DEV_LIVE;
	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
			__func__, ublksrv_pid, header->dev_id);
	blk_mq_kick_requeue_list(ub->ub_disk->queue);
	ret = 0;
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}

static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	u64 features = UBLK_F_ALL;

	if (header->len != UBLK_FEATURES_LEN || !header->addr)
		return -EINVAL;

	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
		return -EFAULT;

	return 0;
}

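/*
 * Sketch of feature negotiation from the server side, illustrative
 * only; 'ctrl_fd' is a hypothetical fd for /dev/ublk-control and the
 * SQE wiring follows struct ublksrv_ctrl_cmd:
 *
 *	__u64 features = 0;
 *	struct ublksrv_ctrl_cmd *cc = (struct ublksrv_ctrl_cmd *)sqe->cmd;
 *
 *	sqe->fd = ctrl_fd;
 *	sqe->cmd_op = UBLK_U_CMD_GET_FEATURES;
 *	cc->addr = (__u64)(uintptr_t)&features;
 *	cc->len = UBLK_FEATURES_LEN;
 *
 * On success the server masks its requested UBLK_F_* flags against
 * 'features' before ADD_DEV; unsupported flags would be cleared by the
 * driver during ADD_DEV anyway.
 */
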
static int ublk_ctrl_set_size(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	struct ublk_param_basic *p = &ub->params.basic;
	u64 new_size = header->data[0];
	int ret = 0;

	mutex_lock(&ub->mutex);
	if (!ub->ub_disk) {
		ret = -ENODEV;
		goto out;
	}
	p->dev_sectors = new_size;
	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
out:
	mutex_unlock(&ub->mutex);
	return ret;
}

struct count_busy {
	const struct ublk_queue *ubq;
	unsigned int nr_busy;
};

static bool ublk_count_busy_req(struct request *rq, void *data)
{
	struct count_busy *idle = data;

	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
		idle->nr_busy += 1;
	return true;
}

/* uring_cmd is guaranteed to be active if the associated request is idle */
static bool ubq_has_idle_io(const struct ublk_queue *ubq)
{
	struct count_busy data = {
		.ubq = ubq,
	};

	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
	return data.nr_busy < ubq->q_depth;
}

/* Wait until each hw queue has at least one idle IO */
static int ublk_wait_for_idle_io(struct ublk_device *ub,
		unsigned int timeout_ms)
{
	unsigned int elapsed = 0;
	int ret;

	/*
	 * For UBLK_F_BATCH_IO the ublk server can be notified via an
	 * existing or a new fetch command, so there is no need to wait
	 */
	if (ublk_dev_support_batch_io(ub))
		return 0;

	while (elapsed < timeout_ms && !signal_pending(current)) {
		unsigned int queues_cancelable = 0;
		int i;

		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
			struct ublk_queue *ubq = ublk_get_queue(ub, i);

			queues_cancelable += !!ubq_has_idle_io(ubq);
		}

		/*
		 * Each queue needs at least one active command for
		 * notifying the ublk server
		 */
		if (queues_cancelable == ub->dev_info.nr_hw_queues)
			break;

		msleep(UBLK_REQUEUE_DELAY_MS);
		elapsed += UBLK_REQUEUE_DELAY_MS;
	}

	if (signal_pending(current))
		ret = -EINTR;
	else if (elapsed >= timeout_ms)
		ret = -EBUSY;
	else
		ret = 0;

	return ret;
}

static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	/* zero means wait forever */
	u64 timeout_ms = header->data[0];
	struct gendisk *disk;
	int ret = -ENODEV;

	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
		return -EOPNOTSUPP;

	mutex_lock(&ub->mutex);
	disk = ublk_get_disk(ub);
	if (!disk)
		goto unlock;
	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		goto put_disk;

	ret = 0;
	/* already in expected state */
	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
		goto put_disk;

	/* Mark the device as canceling */
	mutex_lock(&ub->cancel_mutex);
	blk_mq_quiesce_queue(disk->queue);
	ublk_set_canceling(ub, true);
	blk_mq_unquiesce_queue(disk->queue);
	mutex_unlock(&ub->cancel_mutex);

	if (!timeout_ms)
		timeout_ms = UINT_MAX;
	ret = ublk_wait_for_idle_io(ub, timeout_ms);

put_disk:
	ublk_put_disk(disk);
unlock:
	mutex_unlock(&ub->mutex);

	/* Cancel pending uring_cmd */
	if (!ret)
		ublk_cancel_dev(ub);
	return ret;
}

5258 /*
5259 * All control commands are sent via /dev/ublk-control, so we have to check
5260 * the destination device's permission
5261 */
ublk_char_dev_permission(struct ublk_device * ub,const char * dev_path,int mask)5262 static int ublk_char_dev_permission(struct ublk_device *ub,
5263 const char *dev_path, int mask)
5264 {
5265 int err;
5266 struct path path;
5267 struct kstat stat;
5268
5269 err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5270 if (err)
5271 return err;
5272
5273 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5274 if (err)
5275 goto exit;
5276
5277 err = -EPERM;
5278 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5279 goto exit;
5280
5281 err = inode_permission(&nop_mnt_idmap,
5282 d_backing_inode(path.dentry), mask);
5283 exit:
5284 path_put(&path);
5285 return err;
5286 }
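
/*
 * Example (illustrative): for an unprivileged device whose char dev is
 * /dev/ublkc3 with mode 0660, a GET_PARAMS caller needs MAY_READ on
 * that inode while SET_PARAMS needs MAY_READ | MAY_WRITE, and the
 * stat.rdev/S_ISCHR check above rejects a dev_path naming any other
 * node.
 */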
5287
5288 /*
5289 * Lock for maple tree modification: acquire ub->mutex, then freeze queue
5290 * if device is started. If device is not yet started, only mutex is
5291 * needed since no I/O path can access the tree.
5292 *
5293 * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5294 * already holds ub->mutex when calling del_gendisk() which freezes the queue.
5295 */
5296 static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
5297 {
5298 unsigned int memflags = 0;
5299
5300 mutex_lock(&ub->mutex);
5301 if (ub->ub_disk)
5302 memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
5303
5304 return memflags;
5305 }
5306
5307 static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
5308 {
5309 if (ub->ub_disk)
5310 blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
5311 mutex_unlock(&ub->mutex);
5312 }
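
/*
 * Callers pair the two helpers like this (sketch):
 *
 *	unsigned int memflags = ublk_lock_buf_tree(ub);
 *
 *	... modify ub->buf_tree ...
 *	ublk_unlock_buf_tree(ub, memflags);
 *
 * The returned memflags must be passed back unchanged so that
 * unfreezing restores the memalloc context saved by
 * blk_mq_freeze_queue().
 */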
5313
5314 /* Erase coalesced PFN ranges from the maple tree matching buf_index */
5315 static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
5316 {
5317 MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5318 struct ublk_buf_range *range;
5319
5320 mas_lock(&mas);
5321 mas_for_each(&mas, range, ULONG_MAX) {
5322 if (range->buf_index == buf_index) {
5323 mas_erase(&mas);
5324 kfree(range);
5325 }
5326 }
5327 mas_unlock(&mas);
5328 }
5329
5330 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
5331 struct page **pages, unsigned long nr_pages,
5332 int index, unsigned short flags)
5333 {
5334 unsigned long i;
5335 int ret;
5336
5337 for (i = 0; i < nr_pages; i++) {
5338 unsigned long pfn = page_to_pfn(pages[i]);
5339 unsigned long start = i;
5340 struct ublk_buf_range *range;
5341
5342 /* Find run of consecutive PFNs */
5343 while (i + 1 < nr_pages &&
5344 page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
5345 i++;
5346
5347 range = kzalloc(sizeof(*range), GFP_KERNEL);
5348 if (!range) {
5349 ret = -ENOMEM;
5350 goto unwind;
5351 }
5352 range->buf_index = index;
5353 range->flags = flags;
5354 range->base_offset = start << PAGE_SHIFT;
5355
5356 ret = mtree_insert_range(&ub->buf_tree, pfn,
5357 pfn + (i - start),
5358 range, GFP_KERNEL);
5359 if (ret) {
5360 kfree(range);
5361 goto unwind;
5362 }
5363 }
5364 return 0;
5365
5366 unwind:
5367 ublk_buf_erase_ranges(ub, index);
5368 return ret;
5369 }
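
/*
 * Worked example (illustrative): pinned pages with PFNs
 * {100, 101, 102, 200, 201} produce two tree entries:
 *
 *	[100, 102] -> base_offset = 0			(buffer pages 0..2)
 *	[200, 201] -> base_offset = 3 << PAGE_SHIFT	(buffer pages 3..4)
 *
 * so a later PFN lookup recovers both the buffer index and the byte
 * offset of that page inside the registered buffer.
 */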
5370
5371 /*
5372 * Register a shared memory buffer for zero-copy I/O.
5373 * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
5374 * internally. Returns buffer index (>= 0) on success.
5375 */
5376 static int ublk_ctrl_reg_buf(struct ublk_device *ub,
5377 struct ublksrv_ctrl_cmd *header)
5378 {
5379 void __user *argp = (void __user *)(unsigned long)header->addr;
5380 struct ublk_shmem_buf_reg buf_reg;
5381 unsigned long nr_pages;
5382 struct page **pages = NULL;
5383 unsigned int gup_flags;
5384 unsigned int memflags;
5385 long pinned;
5386 int index;
5387 int ret;
5388
5389 if (!ublk_dev_support_shmem_zc(ub))
5390 return -EOPNOTSUPP;
5391
5392 memset(&buf_reg, 0, sizeof(buf_reg));
5393 if (copy_from_user(&buf_reg, argp,
5394 min_t(size_t, header->len, sizeof(buf_reg))))
5395 return -EFAULT;
5396
5397 if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
5398 return -EINVAL;
5399
5400 if (buf_reg.reserved)
5401 return -EINVAL;
5402
5403 if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
5404 !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
5405 return -EINVAL;
5406
5407 nr_pages = buf_reg.len >> PAGE_SHIFT;
5408
5409 /* Pin pages before any locks (may sleep) */
5410 pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
5411 if (!pages)
5412 return -ENOMEM;
5413
5414 gup_flags = FOLL_LONGTERM;
5415 if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
5416 gup_flags |= FOLL_WRITE;
5417
5418 pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
5419 if (pinned < 0) {
5420 ret = pinned;
5421 goto err_free_pages;
5422 }
5423 if (pinned != nr_pages) {
5424 ret = -EFAULT;
5425 goto err_unpin;
5426 }
5427
5428 memflags = ublk_lock_buf_tree(ub);
5429
5430 index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
5431 if (index < 0) {
5432 ret = index;
5433 goto err_unlock;
5434 }
5435
5436 ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
5437 if (ret) {
5438 ida_free(&ub->buf_ida, index);
5439 goto err_unlock;
5440 }
5441
5442 ublk_unlock_buf_tree(ub, memflags);
5443 kvfree(pages);
5444 return index;
5445
5446 err_unlock:
5447 ublk_unlock_buf_tree(ub, memflags);
5448 err_unpin:
5449 unpin_user_pages(pages, pinned);
5450 err_free_pages:
5451 kvfree(pages);
5452 return ret;
5453 }
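
/*
 * Registration sketch (illustrative; reuses the SQE setup from the
 * quiesce example above, shm_base is an assumed page-aligned mapping):
 *
 *	struct ublk_shmem_buf_reg reg = {
 *		.addr = (__u64)(uintptr_t)shm_base,
 *		.len = 16 << 20,	/* must be page aligned */
 *		.flags = 0,		/* or UBLK_SHMEM_BUF_READ_ONLY */
 *	};
 *
 *	sqe->cmd_op = UBLK_U_CMD_REG_BUF;
 *	cmd->dev_id = dev_id;
 *	cmd->addr = (__u64)(uintptr_t)&reg;
 *	cmd->len = sizeof(reg);
 *
 * The CQE result is the registered buffer index (>= 0) or a negative
 * errno.
 */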
5454
5455 static void ublk_unpin_range_pages(unsigned long base_pfn,
5456 unsigned long nr_pages)
5457 {
5458 #define UBLK_UNPIN_BATCH 32
5459 struct page *pages[UBLK_UNPIN_BATCH];
5460 unsigned long off;
5461
5462 for (off = 0; off < nr_pages; ) {
5463 unsigned int batch = min_t(unsigned long,
5464 nr_pages - off, UBLK_UNPIN_BATCH);
5465 unsigned int j;
5466
5467 for (j = 0; j < batch; j++)
5468 pages[j] = pfn_to_page(base_pfn + off + j);
5469 unpin_user_pages(pages, batch);
5470 off += batch;
5471 }
5472 }
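
/*
 * Example: nr_pages == 70 is unpinned in batches of 32, 32 and 6, so a
 * fixed 32-entry on-stack page array suffices regardless of range size.
 */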
5473
5474 /*
5475 * Inner loop: erase up to UBLK_REMOVE_BATCH matching ranges under
5476 * mas_lock, collecting them into an xarray. Then drop the lock and
5477 * unpin pages + free ranges outside spinlock context.
5478 *
5479 * Returns true if the tree walk completed, false if more ranges remain.
5480 * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value().
5481 */
5482 #define UBLK_REMOVE_BATCH 64
5483
5484 static bool __ublk_shmem_remove_ranges(struct ublk_device *ub,
5485 int buf_index, int *ret)
5486 {
5487 MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5488 struct ublk_buf_range *range;
5489 struct xarray to_unpin;
5490 unsigned long idx;
5491 unsigned int count = 0;
5492 bool done = false;
5493 void *entry;
5494
5495 xa_init(&to_unpin);
5496
5497 mas_lock(&mas);
5498 mas_for_each(&mas, range, ULONG_MAX) {
5499 unsigned long nr;
5500
5501 if (buf_index >= 0 && range->buf_index != buf_index)
5502 continue;
5503
5504 *ret = 0;
5505 nr = mas.last - mas.index + 1;
5506 if (xa_err(xa_store(&to_unpin, mas.index,
5507 xa_mk_value(nr), GFP_ATOMIC)))
5508 goto unlock;
5509 mas_erase(&mas);
5510 kfree(range);
5511 if (++count >= UBLK_REMOVE_BATCH)
5512 goto unlock;
5513 }
5514 done = true;
5515 unlock:
5516 mas_unlock(&mas);
5517
5518 xa_for_each(&to_unpin, idx, entry)
5519 ublk_unpin_range_pages(idx, xa_to_value(entry));
5520 xa_destroy(&to_unpin);
5521
5522 return done;
5523 }
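
/*
 * Handoff example (illustrative): erasing the range [pfn 100, pfn 102]
 * stores the pair (key 100 -> xa_mk_value(3)) in to_unpin while
 * mas_lock is held; the matching ublk_unpin_range_pages(100, 3) call
 * only happens after mas_unlock(), keeping the potentially slow unpin
 * out of spinlock context.
 */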
5524
5525 /*
5526 * Remove ranges from the maple tree matching buf_index, unpin pages
5527 * and free range structs. If buf_index < 0, remove all ranges.
5528 * Processes ranges in batches to avoid holding the maple tree spinlock
5529 * across potentially expensive page unpinning.
5530 */
5531 static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index)
5532 {
5533 int ret = -ENOENT;
5534
5535 while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret))
5536 cond_resched();
5537 return ret;
5538 }
5539
5540 static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
5541 struct ublksrv_ctrl_cmd *header)
5542 {
5543 u64 index = header->data[0];
5544 unsigned int memflags;
5545 int ret;
5546
5547 if (!ublk_dev_support_shmem_zc(ub))
5548 return -EOPNOTSUPP;
5549
5550 if (index > USHRT_MAX)
5551 return -EINVAL;
5552
5553 memflags = ublk_lock_buf_tree(ub);
5554
5555 ret = ublk_shmem_remove_ranges(ub, index);
5556 if (!ret)
5557 ida_free(&ub->buf_ida, index);
5558
5559 ublk_unlock_buf_tree(ub, memflags);
5560 return ret;
5561 }
5562
5563 static void ublk_buf_cleanup(struct ublk_device *ub)
5564 {
5565 ublk_shmem_remove_ranges(ub, -1);
5566 mtree_destroy(&ub->buf_tree);
5567 ida_destroy(&ub->buf_ida);
5568 }
5569
5570 /* Check if request pages match a registered shared memory buffer */
5571 static bool ublk_try_buf_match(struct ublk_device *ub,
5572 struct request *rq,
5573 u32 *buf_idx, u32 *buf_off)
5574 {
5575 struct req_iterator iter;
5576 struct bio_vec bv;
5577 int index = -1;
5578 unsigned long expected_offset = 0;
5579 bool first = true;
5580
5581 rq_for_each_bvec(bv, rq, iter) {
5582 unsigned long pfn = page_to_pfn(bv.bv_page);
5583 unsigned long end_pfn = pfn +
5584 ((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
5585 struct ublk_buf_range *range;
5586 unsigned long off;
5587 MA_STATE(mas, &ub->buf_tree, pfn, pfn);
5588
5589 range = mas_walk(&mas);
5590 if (!range)
5591 return false;
5592
5593 /* verify all pages in this bvec fall within the range */
5594 if (end_pfn > mas.last)
5595 return false;
5596
5597 off = range->base_offset +
5598 (pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
5599
5600 if (first) {
5601 /* a read-only buffer can't back a READ, since the kernel writes into it */
5602 if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
5603 req_op(rq) != REQ_OP_WRITE)
5604 return false;
5605 index = range->buf_index;
5606 expected_offset = off;
5607 *buf_off = off;
5608 first = false;
5609 } else {
5610 if (range->buf_index != index)
5611 return false;
5612 if (off != expected_offset)
5613 return false;
5614 }
5615 expected_offset += bv.bv_len;
5616 }
5617
5618 if (first)
5619 return false;
5620
5621 *buf_idx = index;
5622 return true;
5623 }
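
/*
 * Worked example (illustrative, reusing the two ranges from the
 * registration example: [100, 102] with base_offset 0 and [200, 201]
 * with base_offset 3 << PAGE_SHIFT): a request whose bvecs cover PFNs
 * 101..102 and then 200 matches with *buf_off == 1 << PAGE_SHIFT,
 * because each bvec's computed offset (base_offset + intra-range
 * offset + bv_offset) continues exactly where the previous one ended;
 * any gap, unregistered page, or mix of buffer indexes makes the match
 * fail.
 */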
5624
5625 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5626 u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5627 {
5628 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5629 void __user *argp = (void __user *)(unsigned long)header->addr;
5630 char *dev_path = NULL;
5631 int ret = 0;
5632 int mask;
5633
5634 if (!unprivileged) {
5635 if (!capable(CAP_SYS_ADMIN))
5636 return -EPERM;
5637 /*
5638 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
5639 * char_dev_path in its payload too, since userspace may not
5640 * know whether the specified device was created in
5641 * unprivileged mode.
5642 */
5643 if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5644 return 0;
5645 }
5646
5647 /*
5648 * The user has to provide the char device path for unprivileged ublk.
5649 *
5650 * header->addr always points to the dev path buffer, and
5651 * header->dev_path_len records the length of the dev path buffer.
5652 */
5653 if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5654 return -EINVAL;
5655
5656 if (header->len < header->dev_path_len)
5657 return -EINVAL;
5658
5659 dev_path = memdup_user_nul(argp, header->dev_path_len);
5660 if (IS_ERR(dev_path))
5661 return PTR_ERR(dev_path);
5662
5663 ret = -EINVAL;
5664 switch (_IOC_NR(cmd_op)) {
5665 case UBLK_CMD_GET_DEV_INFO:
5666 case UBLK_CMD_GET_DEV_INFO2:
5667 case UBLK_CMD_GET_QUEUE_AFFINITY:
5668 case UBLK_CMD_GET_PARAMS:
5669 case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5670 mask = MAY_READ;
5671 break;
5672 case UBLK_CMD_START_DEV:
5673 case UBLK_CMD_STOP_DEV:
5674 case UBLK_CMD_ADD_DEV:
5675 case UBLK_CMD_DEL_DEV:
5676 case UBLK_CMD_SET_PARAMS:
5677 case UBLK_CMD_START_USER_RECOVERY:
5678 case UBLK_CMD_END_USER_RECOVERY:
5679 case UBLK_CMD_UPDATE_SIZE:
5680 case UBLK_CMD_QUIESCE_DEV:
5681 case UBLK_CMD_TRY_STOP_DEV:
5682 case UBLK_CMD_REG_BUF:
5683 case UBLK_CMD_UNREG_BUF:
5684 mask = MAY_READ | MAY_WRITE;
5685 break;
5686 default:
5687 goto exit;
5688 }
5689
5690 ret = ublk_char_dev_permission(ub, dev_path, mask);
5691 if (!ret) {
5692 header->len -= header->dev_path_len;
5693 header->addr += header->dev_path_len;
5694 }
5695 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5696 __func__, ub->ub_number, cmd_op,
5697 ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5698 dev_path, ret);
5699 exit:
5700 kfree(dev_path);
5701 return ret;
5702 }
5703
5704 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5705 {
5706 switch (_IOC_NR(cmd_op)) {
5707 case UBLK_CMD_GET_QUEUE_AFFINITY:
5708 case UBLK_CMD_GET_DEV_INFO:
5709 case UBLK_CMD_GET_DEV_INFO2:
5710 case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5711 return false;
5712 default:
5713 return true;
5714 }
5715 }
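
/*
 * Effect (sketch): the GET_* commands above only copy small amounts of
 * data, so they may run in the nonblocking issue path; every other
 * command is bounced with -EAGAIN under IO_URING_F_NONBLOCK (see
 * ublk_ctrl_uring_cmd() below) and re-issued from io_uring's worker
 * context, where sleeping is allowed.
 */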
5716
5717 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5718 unsigned int issue_flags)
5719 {
5720 /* May point to userspace-mapped memory */
5721 const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5722 struct ublksrv_ctrl_cmd);
5723 struct ublksrv_ctrl_cmd header;
5724 struct ublk_device *ub = NULL;
5725 u32 cmd_op = cmd->cmd_op;
5726 int ret = -EINVAL;
5727
5728 if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5729 issue_flags & IO_URING_F_NONBLOCK)
5730 return -EAGAIN;
5731
5732 if (!(issue_flags & IO_URING_F_SQE128))
5733 return -EINVAL;
5734
5735 header.dev_id = READ_ONCE(ub_src->dev_id);
5736 header.queue_id = READ_ONCE(ub_src->queue_id);
5737 header.len = READ_ONCE(ub_src->len);
5738 header.addr = READ_ONCE(ub_src->addr);
5739 header.data[0] = READ_ONCE(ub_src->data[0]);
5740 header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5741 ublk_ctrl_cmd_dump(cmd_op, &header);
5742
5743 ret = ublk_check_cmd_op(cmd_op);
5744 if (ret)
5745 goto out;
5746
5747 if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5748 ret = ublk_ctrl_get_features(&header);
5749 goto out;
5750 }
5751
5752 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5753 ret = -ENODEV;
5754 ub = ublk_get_device_from_id(header.dev_id);
5755 if (!ub)
5756 goto out;
5757
5758 ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5759 if (ret)
5760 goto put_dev;
5761 }
5762
5763 switch (_IOC_NR(cmd_op)) {
5764 case UBLK_CMD_START_DEV:
5765 ret = ublk_ctrl_start_dev(ub, &header);
5766 break;
5767 case UBLK_CMD_STOP_DEV:
5768 ublk_ctrl_stop_dev(ub);
5769 ret = 0;
5770 break;
5771 case UBLK_CMD_GET_DEV_INFO:
5772 case UBLK_CMD_GET_DEV_INFO2:
5773 ret = ublk_ctrl_get_dev_info(ub, &header);
5774 break;
5775 case UBLK_CMD_ADD_DEV:
5776 ret = ublk_ctrl_add_dev(&header);
5777 break;
5778 case UBLK_CMD_DEL_DEV:
5779 ret = ublk_ctrl_del_dev(&ub, true);
5780 break;
5781 case UBLK_CMD_DEL_DEV_ASYNC:
5782 ret = ublk_ctrl_del_dev(&ub, false);
5783 break;
5784 case UBLK_CMD_GET_QUEUE_AFFINITY:
5785 ret = ublk_ctrl_get_queue_affinity(ub, &header);
5786 break;
5787 case UBLK_CMD_GET_PARAMS:
5788 ret = ublk_ctrl_get_params(ub, &header);
5789 break;
5790 case UBLK_CMD_SET_PARAMS:
5791 ret = ublk_ctrl_set_params(ub, &header);
5792 break;
5793 case UBLK_CMD_START_USER_RECOVERY:
5794 ret = ublk_ctrl_start_recovery(ub);
5795 break;
5796 case UBLK_CMD_END_USER_RECOVERY:
5797 ret = ublk_ctrl_end_recovery(ub, &header);
5798 break;
5799 case UBLK_CMD_UPDATE_SIZE:
5800 ret = ublk_ctrl_set_size(ub, &header);
5801 break;
5802 case UBLK_CMD_QUIESCE_DEV:
5803 ret = ublk_ctrl_quiesce_dev(ub, &header);
5804 break;
5805 case UBLK_CMD_TRY_STOP_DEV:
5806 ret = ublk_ctrl_try_stop_dev(ub);
5807 break;
5808 case UBLK_CMD_REG_BUF:
5809 ret = ublk_ctrl_reg_buf(ub, &header);
5810 break;
5811 case UBLK_CMD_UNREG_BUF:
5812 ret = ublk_ctrl_unreg_buf(ub, &header);
5813 break;
5814 default:
5815 ret = -EOPNOTSUPP;
5816 break;
5817 }
5818
5819 put_dev:
5820 if (ub)
5821 ublk_put_device(ub);
5822 out:
5823 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5824 __func__, ret, cmd_op, header.dev_id, header.queue_id);
5825 return ret;
5826 }
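
/*
 * Note on the READ_ONCE() snapshot above (sketch of the reasoning):
 * the SQE may sit in memory mapped by userspace, so every field is
 * read exactly once into the on-stack header and all later checks and
 * uses operate on that copy; a concurrent userspace write after the
 * copy cannot turn an already-validated value (e.g. a dev_path_len
 * that passed the PATH_MAX check) back into an unvalidated one.
 */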
5827
5828 static const struct file_operations ublk_ctl_fops = {
5829 .open = nonseekable_open,
5830 .uring_cmd = ublk_ctrl_uring_cmd,
5831 .owner = THIS_MODULE,
5832 .llseek = noop_llseek,
5833 };
5834
5835 static struct miscdevice ublk_misc = {
5836 .minor = MISC_DYNAMIC_MINOR,
5837 .name = "ublk-control",
5838 .fops = &ublk_ctl_fops,
5839 };
5840
5841 static int __init ublk_init(void)
5842 {
5843 int ret;
5844
5845 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5846 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5847 /*
5848 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5849 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5850 */
5851 BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5852 UBLKSRV_IO_INTEGRITY_FLAG);
5853 BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5854
5855 init_waitqueue_head(&ublk_idr_wq);
5856
5857 ret = misc_register(&ublk_misc);
5858 if (ret)
5859 return ret;
5860
5861 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5862 if (ret)
5863 goto unregister_misc;
5864
5865 ret = class_register(&ublk_chr_class);
5866 if (ret)
5867 goto free_chrdev_region;
5868
5869 return 0;
5870
5871 free_chrdev_region:
5872 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5873 unregister_misc:
5874 misc_deregister(&ublk_misc);
5875 return ret;
5876 }
5877
5878 static void __exit ublk_exit(void)
5879 {
5880 struct ublk_device *ub;
5881 int id;
5882
5883 idr_for_each_entry(&ublk_index_idr, ub, id)
5884 ublk_remove(ub);
5885
5886 class_unregister(&ublk_chr_class);
5887 misc_deregister(&ublk_misc);
5888
5889 idr_destroy(&ublk_index_idr);
5890 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5891 }
5892
5893 module_init(ublk_init);
5894 module_exit(ublk_exit);
5895
5896 static int ublk_set_max_unprivileged_ublks(const char *buf,
5897 const struct kernel_param *kp)
5898 {
5899 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5900 }
5901
5902 static int ublk_get_max_unprivileged_ublks(char *buf,
5903 const struct kernel_param *kp)
5904 {
5905 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5906 }
5907
5908 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5909 .set = ublk_set_max_unprivileged_ublks,
5910 .get = ublk_get_max_unprivileged_ublks,
5911 };
5912
5913 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5914 &unprivileged_ublks_max, 0644);
5915 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to be added (default: 64)");
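
/*
 * Usage example (illustrative, assuming the module is built as
 * ublk_drv): set the limit at load time with
 * "modprobe ublk_drv ublks_max=128", or at runtime via
 * /sys/module/ublk_drv/parameters/ublks_max; values above
 * UBLK_MAX_UBLKS are rejected by param_set_uint_minmax().
 */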
5916
5917 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5918 MODULE_DESCRIPTION("Userspace block device");
5919 MODULE_LICENSE("GPL");
5920