1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Userspace block device - block device whose IO is handled from userspace
4 *
5 * Makes full use of the io_uring passthrough command for communicating with
6 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7 *
8 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9 *
10 * (part of code stolen from loop.c)
11 */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <linux/maple_tree.h>
50 #include <linux/xarray.h>
51 #include <uapi/linux/fs.h>
52 #include <uapi/linux/ublk_cmd.h>
53
54 #define UBLK_MINORS (1U << MINORBITS)
55
56 #define UBLK_INVALID_BUF_IDX ((u16)-1)
57
58 /* private ioctl command mirror */
59 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
60 #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
61 #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
62 #define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
63 #define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF)
64 #define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF)
65
66 /* Default max shmem buffer size: 4GB (may be increased in future) */
67 #define UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32)
68
69 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
70 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
71
72 /* All UBLK_F_* have to be included into UBLK_F_ALL */
73 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
74 | UBLK_F_URING_CMD_COMP_IN_TASK \
75 | UBLK_F_NEED_GET_DATA \
76 | UBLK_F_USER_RECOVERY \
77 | UBLK_F_USER_RECOVERY_REISSUE \
78 | UBLK_F_UNPRIVILEGED_DEV \
79 | UBLK_F_CMD_IOCTL_ENCODE \
80 | UBLK_F_USER_COPY \
81 | UBLK_F_ZONED \
82 | UBLK_F_USER_RECOVERY_FAIL_IO \
83 | UBLK_F_UPDATE_SIZE \
84 | UBLK_F_AUTO_BUF_REG \
85 | UBLK_F_QUIESCE \
86 | UBLK_F_PER_IO_DAEMON \
87 | UBLK_F_BUF_REG_OFF_DAEMON \
88 | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
89 | UBLK_F_SAFE_STOP_DEV \
90 | UBLK_F_BATCH_IO \
91 | UBLK_F_NO_AUTO_PART_SCAN \
92 | UBLK_F_SHMEM_ZC)
93
94 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
95 | UBLK_F_USER_RECOVERY_REISSUE \
96 | UBLK_F_USER_RECOVERY_FAIL_IO)
97
98 /* All UBLK_PARAM_TYPE_* should be included here */
99 #define UBLK_PARAM_TYPE_ALL \
100 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
101 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
102 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
103 UBLK_PARAM_TYPE_INTEGRITY)
104
105 #define UBLK_BATCH_F_ALL \
106 (UBLK_BATCH_F_HAS_ZONE_LBA | \
107 UBLK_BATCH_F_HAS_BUF_ADDR | \
108 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
109
110 /* ublk batch fetch uring_cmd */
111 struct ublk_batch_fetch_cmd {
112 struct list_head node;
113 struct io_uring_cmd *cmd;
114 unsigned short buf_group;
115 };
116
117 struct ublk_uring_cmd_pdu {
118 /*
119 * Store requests of the same batch temporarily for queuing them to
120 * the daemon context.
121 *
122 * They could have been stored in the request payload, but we want
123 * to avoid extra pre-allocation, and the uring_cmd payload is always
124 * free for us.
125 */
126 union {
127 struct request *req;
128 struct request *req_list;
129 };
130
131 /*
132 * The following two are valid in this cmd whole lifetime, and
133 * setup in ublk uring_cmd handler
134 */
135 struct ublk_queue *ubq;
136
137 union {
138 u16 tag;
139 struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
140 };
141 };
142
143 struct ublk_batch_io_data {
144 struct ublk_device *ub;
145 struct io_uring_cmd *cmd;
146 struct ublk_batch_io header;
147 unsigned int issue_flags;
148 struct io_comp_batch *iob;
149 };
150
151 /*
152 * io command is active: sqe cmd is received, and its cqe isn't done
153 *
154 * If the flag is set, the io command is owned by the ublk driver, waiting
155 * for an incoming blk-mq request from the ublk block device.
156 *
157 * If the flag is cleared, the io command has been completed and is owned
158 * by the ublk server.
159 */
160 #define UBLK_IO_FLAG_ACTIVE 0x01
161
162 /*
163 * IO command is completed via cqe, and it is being handled by ublksrv, and
164 * not committed yet
165 *
166 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
167 * for cross verification
168 */
169 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
170
171 /*
172 * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command requires
173 * fetching the data buffer address from ublksrv.
174 *
175 * Then, bio data could be copied into this data buffer for a WRITE request
176 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
177 */
178 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
179
180 /*
181 * request buffer is registered automatically, so we have to unregister it
182 * before completing this request.
183 *
184 * io_uring will unregister buffer automatically for us during exiting.
185 */
186 #define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
187
188 /* atomic RW with ubq->cancel_lock */
189 #define UBLK_IO_FLAG_CANCELED 0x80000000
190
191 /*
192 * Initialize refcount to a large number to include any registered buffers.
193 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
194 * any buffers registered on the io daemon task.
195 */
196 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
197
198 /* used for UBLK_F_BATCH_IO only */
199 #define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1)
200
201 union ublk_io_buf {
202 __u64 addr;
203 struct ublk_auto_buf_reg auto_reg;
204 };
205
206 struct ublk_io {
207 union ublk_io_buf buf;
208 unsigned int flags;
209 int res;
210
211 union {
212 /* valid if UBLK_IO_FLAG_ACTIVE is set */
213 struct io_uring_cmd *cmd;
214 /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
215 struct request *req;
216 };
217
218 struct task_struct *task;
219
220 /*
221 * The number of uses of this I/O by the ublk server
222 * if user copy or zero copy are enabled:
223 * - UBLK_REFCOUNT_INIT from dispatch to the server
224 * until UBLK_IO_COMMIT_AND_FETCH_REQ
225 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
226 * - 1 for each io_uring registered buffer not registered on task
227 * The I/O can only be completed once all references are dropped.
228 * User copy and buffer registration operations are only permitted
229 * if the reference count is nonzero.
230 */
231 refcount_t ref;
232 /* Count of buffers registered on task and not yet unregistered */
233 unsigned task_registered_buffers;
234
235 void *buf_ctx_handle;
236 spinlock_t lock;
237 } ____cacheline_aligned_in_smp;
238
239 struct ublk_queue {
240 int q_id;
241 int q_depth;
242
243 unsigned long flags;
244 struct ublksrv_io_desc *io_cmd_buf;
245
246 bool force_abort;
247 bool canceling;
248 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
249 spinlock_t cancel_lock;
250 struct ublk_device *dev;
251 u32 nr_io_ready;
252
253 /*
254 * For supporting UBLK_F_BATCH_IO only.
255 *
256 * Inflight ublk request tag is saved in this fifo
257 *
258 * There are multiple writers from ublk_queue_rq() or ublk_queue_rqs(),
259 * so the lock is required for storing request tags into the fifo.
260 *
261 * Make sure there is just one reader fetching requests from the task
262 * work function to the ublk server, so there is no need to grab the
263 * lock on the reader side.
264 *
265 * Batch I/O State Management:
266 *
267 * The batch I/O system uses implicit state management based on the
268 * combination of three key variables below.
269 *
270 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
271 * No fetch commands available, events queue in evts_fifo
272 *
273 * - READY: !list_empty(&fcmd_head) && !active_fcmd
274 * Fetch commands available but none processing events
275 *
276 * - ACTIVE: active_fcmd
277 * One fetch command actively processing events from evts_fifo
278 *
279 * Key Invariants:
280 * - At most one active_fcmd at any time (single reader)
281 * - active_fcmd is always from fcmd_head list when non-NULL
282 * - evts_fifo can be read locklessly by the single active reader
283 * - All state transitions require evts_lock protection
284 * - Multiple writers to evts_fifo require lock protection
285 */
286 struct {
287 DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
288 spinlock_t evts_lock;
289
290 /* List of fetch commands available to process events */
291 struct list_head fcmd_head;
292
293 /* Currently active fetch command (NULL = none active) */
294 struct ublk_batch_fetch_cmd *active_fcmd;
295 } ____cacheline_aligned_in_smp;
296
297 struct ublk_io ios[] __counted_by(q_depth);
298 };
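/*
 * Illustrative sketch (not part of the driver) of the producer/consumer
 * pattern described above for the batch event fifo. Names mirror the
 * fields declared in struct ublk_queue; the real enqueue/dequeue code
 * lives further down in this file and may differ in detail:
 *
 *	// producer: ublk_queue_rq()/ublk_queue_rqs() context, many writers
 *	spin_lock(&ubq->evts_lock);
 *	kfifo_put(&ubq->evts_fifo, (unsigned short)rq->tag);
 *	spin_unlock(&ubq->evts_lock);
 *
 *	// consumer: the single active fetch command, lockless reader
 *	unsigned short tag;
 *
 *	while (kfifo_get(&ubq->evts_fifo, &tag))
 *		;	// dispatch 'tag' to the ublk server
 */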
299
300 /* Maple tree value: maps a PFN range to buffer location */
301 struct ublk_buf_range {
302 unsigned short buf_index;
303 unsigned short flags;
304 unsigned int base_offset; /* byte offset within buffer */
305 };
306
307 struct ublk_device {
308 struct gendisk *ub_disk;
309
310 struct ublksrv_ctrl_dev_info dev_info;
311
312 struct blk_mq_tag_set tag_set;
313
314 struct cdev cdev;
315 struct device cdev_dev;
316
317 #define UB_STATE_OPEN 0
318 #define UB_STATE_USED 1
319 #define UB_STATE_DELETED 2
320 unsigned long state;
321 int ub_number;
322
323 struct mutex mutex;
324
325 spinlock_t lock;
326 struct mm_struct *mm;
327
328 struct ublk_params params;
329
330 struct completion completion;
331 u32 nr_queue_ready;
332 bool unprivileged_daemons;
333 struct mutex cancel_mutex;
334 bool canceling;
335 pid_t ublksrv_tgid;
336 struct delayed_work exit_work;
337 struct work_struct partition_scan_work;
338
339 bool block_open; /* protected by open_mutex */
340
341 /* shared memory zero copy */
342 struct maple_tree buf_tree;
343 struct ida buf_ida;
344
345 struct ublk_queue *queues[];
346 };
347
348 /* header of ublk_params */
349 struct ublk_params_header {
350 __u32 len;
351 __u32 types;
352 };
353
354 static void ublk_io_release(void *priv);
355 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
356 static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
357 u32 *buf_idx, u32 *buf_off);
358 static void ublk_buf_cleanup(struct ublk_device *ub);
359 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
360 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
361 u16 q_id, u16 tag, struct ublk_io *io);
362 static inline unsigned int ublk_req_build_flags(struct request *req);
363 static void ublk_batch_dispatch(struct ublk_queue *ubq,
364 const struct ublk_batch_io_data *data,
365 struct ublk_batch_fetch_cmd *fcmd);
366
367 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
368 {
369 return ub->dev_info.flags & UBLK_F_BATCH_IO;
370 }
371
372 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
373 {
374 return ubq->flags & UBLK_F_BATCH_IO;
375 }
376
377 static inline void ublk_io_lock(struct ublk_io *io)
378 {
379 spin_lock(&io->lock);
380 }
381
382 static inline void ublk_io_unlock(struct ublk_io *io)
383 {
384 spin_unlock(&io->lock);
385 }
386
387 /* Initialize the event queue */
388 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
389 int numa_node)
390 {
391 spin_lock_init(&q->evts_lock);
392 return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
393 }
394
395 /* Check if event queue is empty */
396 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
397 {
398 return kfifo_is_empty(&q->evts_fifo);
399 }
400
401 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
402 {
403 WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
404 kfifo_free(&q->evts_fifo);
405 }
406
407 static inline struct ublksrv_io_desc *
408 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
409 {
410 return &ubq->io_cmd_buf[tag];
411 }
412
413 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
414 {
415 return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
416 }
417
418 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
419 {
420 return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
421 }
422
423 static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
424 {
425 return ubq->flags & UBLK_F_SHMEM_ZC;
426 }
427
428 static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
429 unsigned int tag)
430 {
431 return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
432 }
433
434 static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
435 {
436 return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
437 }
438
439 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
440 {
441 return ubq->flags & UBLK_F_AUTO_BUF_REG;
442 }
443
444 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
445 {
446 return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
447 }
448
449 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
450 {
451 return ubq->flags & UBLK_F_USER_COPY;
452 }
453
454 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
455 {
456 return ub->dev_info.flags & UBLK_F_USER_COPY;
457 }
458
459 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
460 {
461 return ub->dev_info.flags & UBLK_F_ZONED;
462 }
463
464 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
465 {
466 return ubq->flags & UBLK_F_ZONED;
467 }
468
469 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
470 {
471 return ub->dev_info.flags & UBLK_F_INTEGRITY;
472 }
473
474 #ifdef CONFIG_BLK_DEV_ZONED
475
476 struct ublk_zoned_report_desc {
477 __u64 sector;
478 __u32 operation;
479 __u32 nr_zones;
480 };
481
482 static DEFINE_XARRAY(ublk_zoned_report_descs);
483
484 static int ublk_zoned_insert_report_desc(const struct request *req,
485 struct ublk_zoned_report_desc *desc)
486 {
487 return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
488 desc, GFP_KERNEL);
489 }
490
491 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
492 const struct request *req)
493 {
494 return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
495 }
496
497 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
498 const struct request *req)
499 {
500 return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
501 }
502
503 static int ublk_get_nr_zones(const struct ublk_device *ub)
504 {
505 const struct ublk_param_basic *p = &ub->params.basic;
506
507 /* Zone size is a power of 2 */
508 return p->dev_sectors >> ilog2(p->chunk_sectors);
509 }
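/*
 * Worked example (illustrative values only): with dev_sectors = 1 << 30
 * (512 GiB) and chunk_sectors = 1 << 19 (256 MiB zones),
 * ilog2(chunk_sectors) == 19, so ublk_get_nr_zones() returns
 * (1 << 30) >> 19 == 2048 zones.
 */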
510
511 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
512 {
513 return blk_revalidate_disk_zones(ub->ub_disk);
514 }
515
516 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
517 {
518 const struct ublk_param_zoned *p = &ub->params.zoned;
519 int nr_zones;
520
521 if (!ublk_dev_is_zoned(ub))
522 return -EINVAL;
523
524 if (!p->max_zone_append_sectors)
525 return -EINVAL;
526
527 nr_zones = ublk_get_nr_zones(ub);
528
529 if (p->max_active_zones > nr_zones)
530 return -EINVAL;
531
532 if (p->max_open_zones > nr_zones)
533 return -EINVAL;
534
535 return 0;
536 }
537
538 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
539 {
540 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
541 }
542
543 /* Based on virtblk_alloc_report_buffer */
544 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
545 unsigned int nr_zones, size_t *buflen)
546 {
547 struct request_queue *q = ublk->ub_disk->queue;
548 size_t bufsize;
549 void *buf;
550
551 nr_zones = min_t(unsigned int, nr_zones,
552 ublk->ub_disk->nr_zones);
553
554 bufsize = nr_zones * sizeof(struct blk_zone);
555 bufsize =
556 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
557
558 while (bufsize >= sizeof(struct blk_zone)) {
559 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
560 if (buf) {
561 *buflen = bufsize;
562 return buf;
563 }
564 bufsize >>= 1;
565 }
566
567 *buflen = 0;
568 return NULL;
569 }
570
571 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
572 unsigned int nr_zones, struct blk_report_zones_args *args)
573 {
574 struct ublk_device *ub = disk->private_data;
575 unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
576 unsigned int first_zone = sector >> ilog2(zone_size_sectors);
577 unsigned int done_zones = 0;
578 unsigned int max_zones_per_request;
579 int ret;
580 struct blk_zone *buffer;
581 size_t buffer_length;
582
583 nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
584 nr_zones);
585
586 buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
587 if (!buffer)
588 return -ENOMEM;
589
590 max_zones_per_request = buffer_length / sizeof(struct blk_zone);
591
592 while (done_zones < nr_zones) {
593 unsigned int remaining_zones = nr_zones - done_zones;
594 unsigned int zones_in_request =
595 min_t(unsigned int, remaining_zones, max_zones_per_request);
596 struct request *req;
597 struct ublk_zoned_report_desc desc;
598 blk_status_t status;
599
600 memset(buffer, 0, buffer_length);
601
602 req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
603 if (IS_ERR(req)) {
604 ret = PTR_ERR(req);
605 goto out;
606 }
607
608 desc.operation = UBLK_IO_OP_REPORT_ZONES;
609 desc.sector = sector;
610 desc.nr_zones = zones_in_request;
611 ret = ublk_zoned_insert_report_desc(req, &desc);
612 if (ret)
613 goto free_req;
614
615 ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
616 if (ret)
617 goto erase_desc;
618
619 status = blk_execute_rq(req, 0);
620 ret = blk_status_to_errno(status);
621 erase_desc:
622 ublk_zoned_erase_report_desc(req);
623 free_req:
624 blk_mq_free_request(req);
625 if (ret)
626 goto out;
627
628 for (unsigned int i = 0; i < zones_in_request; i++) {
629 struct blk_zone *zone = buffer + i;
630
631 /* A zero length zone means no more zones in this response */
632 if (!zone->len)
633 break;
634
635 ret = disk_report_zone(disk, zone, i, args);
636 if (ret)
637 goto out;
638
639 done_zones++;
640 sector += zone_size_sectors;
641
642 }
643 }
644
645 ret = done_zones;
646
647 out:
648 kvfree(buffer);
649 return ret;
650 }
651
652 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
653 struct request *req)
654 {
655 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
656 struct ublk_io *io = &ubq->ios[req->tag];
657 struct ublk_zoned_report_desc *desc;
658 u32 ublk_op;
659
660 switch (req_op(req)) {
661 case REQ_OP_ZONE_OPEN:
662 ublk_op = UBLK_IO_OP_ZONE_OPEN;
663 break;
664 case REQ_OP_ZONE_CLOSE:
665 ublk_op = UBLK_IO_OP_ZONE_CLOSE;
666 break;
667 case REQ_OP_ZONE_FINISH:
668 ublk_op = UBLK_IO_OP_ZONE_FINISH;
669 break;
670 case REQ_OP_ZONE_RESET:
671 ublk_op = UBLK_IO_OP_ZONE_RESET;
672 break;
673 case REQ_OP_ZONE_APPEND:
674 ublk_op = UBLK_IO_OP_ZONE_APPEND;
675 break;
676 case REQ_OP_ZONE_RESET_ALL:
677 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
678 break;
679 case REQ_OP_DRV_IN:
680 desc = ublk_zoned_get_report_desc(req);
681 if (!desc)
682 return BLK_STS_IOERR;
683 ublk_op = desc->operation;
684 switch (ublk_op) {
685 case UBLK_IO_OP_REPORT_ZONES:
686 iod->op_flags = ublk_op | ublk_req_build_flags(req);
687 iod->nr_zones = desc->nr_zones;
688 iod->start_sector = desc->sector;
689 return BLK_STS_OK;
690 default:
691 return BLK_STS_IOERR;
692 }
693 case REQ_OP_DRV_OUT:
694 /* We do not support drv_out */
695 return BLK_STS_NOTSUPP;
696 default:
697 return BLK_STS_IOERR;
698 }
699
700 iod->op_flags = ublk_op | ublk_req_build_flags(req);
701 iod->nr_sectors = blk_rq_sectors(req);
702 iod->start_sector = blk_rq_pos(req);
703 iod->addr = io->buf.addr;
704
705 return BLK_STS_OK;
706 }
707
708 #else
709
710 #define ublk_report_zones (NULL)
711
712 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
713 {
714 return -EOPNOTSUPP;
715 }
716
717 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
718 {
719 }
720
721 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
722 {
723 return 0;
724 }
725
726 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
727 struct request *req)
728 {
729 return BLK_STS_NOTSUPP;
730 }
731
732 #endif
733
734 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
735 bool need_map, struct io_comp_batch *iob);
736
737 static dev_t ublk_chr_devt;
738 static const struct class ublk_chr_class = {
739 .name = "ublk-char",
740 };
741
742 static DEFINE_IDR(ublk_index_idr);
743 static DEFINE_SPINLOCK(ublk_idr_lock);
744 static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
745
746 static DEFINE_MUTEX(ublk_ctl_mutex);
747
748 static struct ublk_batch_fetch_cmd *
749 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
750 {
751 struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
752
753 if (fcmd) {
754 fcmd->cmd = cmd;
755 fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
756 }
757 return fcmd;
758 }
759
760 static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
761 {
762 kfree(fcmd);
763 }
764
765 static void __ublk_release_fcmd(struct ublk_queue *ubq)
766 {
767 WRITE_ONCE(ubq->active_fcmd, NULL);
768 }
769
770 /*
771 * Nothing can move on, so clear ->active_fcmd, and the caller should stop
772 * dispatching
773 */
774 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
775 const struct ublk_batch_io_data *data,
776 struct ublk_batch_fetch_cmd *fcmd,
777 int res)
778 {
779 spin_lock(&ubq->evts_lock);
780 list_del_init(&fcmd->node);
781 WARN_ON_ONCE(fcmd != ubq->active_fcmd);
782 __ublk_release_fcmd(ubq);
783 spin_unlock(&ubq->evts_lock);
784
785 io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
786 ublk_batch_free_fcmd(fcmd);
787 }
788
789 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
790 struct io_br_sel *sel,
791 unsigned int issue_flags)
792 {
793 if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
794 return -ENOBUFS;
795 return 0;
796 }
797
798 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
799 void __user *buf, const u16 *tag_buf,
800 unsigned int len)
801 {
802 if (copy_to_user(buf, tag_buf, len))
803 return -EFAULT;
804 return len;
805 }
806
807 #define UBLK_MAX_UBLKS UBLK_MINORS
808
809 /*
810 * Max number of unprivileged ublk devices allowed to be added
811 *
812 * It can be extended to a per-user limit in the future, or even
813 * controlled by cgroup.
814 */
815 static unsigned int unprivileged_ublks_max = 64;
816 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
817
818 static struct miscdevice ublk_misc;
819
820 static inline unsigned ublk_pos_to_hwq(loff_t pos)
821 {
822 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
823 UBLK_QID_BITS_MASK;
824 }
825
826 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
827 {
828 return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
829 }
830
831 static inline unsigned ublk_pos_to_tag(loff_t pos)
832 {
833 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
834 UBLK_TAG_BITS_MASK;
835 }
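/*
 * For reference, the pread()/pwrite() offset decoded by the three helpers
 * above is built from its components like this (a sketch mirroring the
 * same shifts and masks; the ublk server normally does this encoding):
 *
 *	loff_t pos = UBLKSRV_IO_BUF_OFFSET +
 *		     ((__u64)q_id << UBLK_QID_OFF) +
 *		     ((__u64)tag << UBLK_TAG_OFF) +
 *		     buf_off;
 */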
836
837 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
838 {
839 const struct ublk_param_basic *p = &ub->params.basic;
840
841 if (p->attrs & UBLK_ATTR_READ_ONLY)
842 set_disk_ro(ub->ub_disk, true);
843
844 set_capacity(ub->ub_disk, p->dev_sectors);
845 }
846
847 static int ublk_integrity_flags(u32 flags)
848 {
849 int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
850
851 if (flags & LBMD_PI_CAP_INTEGRITY) {
852 flags &= ~LBMD_PI_CAP_INTEGRITY;
853 ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
854 }
855 if (flags & LBMD_PI_CAP_REFTAG) {
856 flags &= ~LBMD_PI_CAP_REFTAG;
857 ret_flags |= BLK_INTEGRITY_REF_TAG;
858 }
859 return flags ? -EINVAL : ret_flags;
860 }
861
862 static int ublk_integrity_pi_tuple_size(u8 csum_type)
863 {
864 switch (csum_type) {
865 case LBMD_PI_CSUM_NONE:
866 return 0;
867 case LBMD_PI_CSUM_IP:
868 case LBMD_PI_CSUM_CRC16_T10DIF:
869 return 8;
870 case LBMD_PI_CSUM_CRC64_NVME:
871 return 16;
872 default:
873 return -EINVAL;
874 }
875 }
876
877 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
878 {
879 switch (csum_type) {
880 case LBMD_PI_CSUM_NONE:
881 return BLK_INTEGRITY_CSUM_NONE;
882 case LBMD_PI_CSUM_IP:
883 return BLK_INTEGRITY_CSUM_IP;
884 case LBMD_PI_CSUM_CRC16_T10DIF:
885 return BLK_INTEGRITY_CSUM_CRC;
886 case LBMD_PI_CSUM_CRC64_NVME:
887 return BLK_INTEGRITY_CSUM_CRC64;
888 default:
889 WARN_ON_ONCE(1);
890 return BLK_INTEGRITY_CSUM_NONE;
891 }
892 }
893
894 static int ublk_validate_params(const struct ublk_device *ub)
895 {
896 /* basic param is the only one which must be set */
897 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
898 const struct ublk_param_basic *p = &ub->params.basic;
899
900 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
901 return -EINVAL;
902
903 if (p->logical_bs_shift > p->physical_bs_shift)
904 return -EINVAL;
905
906 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
907 return -EINVAL;
908
909 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
910 return -EINVAL;
911 } else
912 return -EINVAL;
913
914 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
915 const struct ublk_param_discard *p = &ub->params.discard;
916
917 /* So far, only support single segment discard */
918 if (p->max_discard_sectors && p->max_discard_segments != 1)
919 return -EINVAL;
920
921 if (!p->discard_granularity)
922 return -EINVAL;
923 }
924
925 /* dev_t is read-only */
926 if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
927 return -EINVAL;
928
929 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
930 return ublk_dev_param_zoned_validate(ub);
931 else if (ublk_dev_is_zoned(ub))
932 return -EINVAL;
933
934 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
935 const struct ublk_param_dma_align *p = &ub->params.dma;
936
937 if (p->alignment >= PAGE_SIZE)
938 return -EINVAL;
939
940 if (!is_power_of_2(p->alignment + 1))
941 return -EINVAL;
942 }
943
944 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
945 const struct ublk_param_segment *p = &ub->params.seg;
946
947 if (!is_power_of_2(p->seg_boundary_mask + 1))
948 return -EINVAL;
949
950 if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
951 return -EINVAL;
952 if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
953 return -EINVAL;
954 }
955
956 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
957 const struct ublk_param_integrity *p = &ub->params.integrity;
958 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
959 int flags = ublk_integrity_flags(p->flags);
960
961 if (!ublk_dev_support_integrity(ub))
962 return -EINVAL;
963 if (flags < 0)
964 return flags;
965 if (pi_tuple_size < 0)
966 return pi_tuple_size;
967 if (!p->metadata_size)
968 return -EINVAL;
969 if (p->csum_type == LBMD_PI_CSUM_NONE &&
970 p->flags & LBMD_PI_CAP_REFTAG)
971 return -EINVAL;
972 if (p->pi_offset + pi_tuple_size > p->metadata_size)
973 return -EINVAL;
974 if (p->interval_exp < SECTOR_SHIFT ||
975 p->interval_exp > ub->params.basic.logical_bs_shift)
976 return -EINVAL;
977 }
978
979 return 0;
980 }
981
982 static void ublk_apply_params(struct ublk_device *ub)
983 {
984 ublk_dev_param_basic_apply(ub);
985
986 if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
987 ublk_dev_param_zoned_apply(ub);
988 }
989
990 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
991 {
992 return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
993 !ublk_support_auto_buf_reg(ubq);
994 }
995
996 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
997 {
998 return !ublk_dev_support_user_copy(ub) &&
999 !ublk_dev_support_zero_copy(ub) &&
1000 !ublk_dev_support_auto_buf_reg(ub);
1001 }
1002
1003 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
1004 {
1005 /*
1006 * read()/write() is involved in user copy, so a request reference
1007 * has to be grabbed.
1008 *
1009 * For zero copy, the request buffer needs to be registered in the
1010 * io_uring buffer table, so a reference is needed.
1011 *
1012 * For auto buffer registration, the ublk server may still issue
1013 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
1014 * so a reference is required too.
1015 */
1016 return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
1017 ublk_support_auto_buf_reg(ubq);
1018 }
1019
1020 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
1021 {
1022 return ublk_dev_support_user_copy(ub) ||
1023 ublk_dev_support_zero_copy(ub) ||
1024 ublk_dev_support_auto_buf_reg(ub);
1025 }
1026
1027 /*
1028 * ublk IO Reference Counting Design
1029 * ==================================
1030 *
1031 * For user-copy and zero-copy modes, ublk uses a split reference model with
1032 * two counters that together track IO lifetime:
1033 *
1034 * - io->ref: refcount for off-task buffer registrations and user-copy ops
1035 * - io->task_registered_buffers: count of buffers registered on the IO task
1036 *
1037 * Key Invariant:
1038 * --------------
1039 * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1040 * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1041 * when no active references exist. After IO completion, both counters become
1042 * zero. For I/Os not currently dispatched to the ublk server, both ref and
1043 * task_registered_buffers are 0.
1044 *
1045 * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1046 * exit to determine if all references have been released.
1047 *
1048 * Why Split Counters:
1049 * -------------------
1050 * Buffers registered on the IO daemon task can use the lightweight
1051 * task_registered_buffers counter (simple increment/decrement) instead of
1052 * atomic refcount operations. The ublk_io_release() callback checks if
1053 * current == io->task to decide which counter to update.
1054 *
1055 * This optimization only applies before IO completion. At completion,
1056 * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1057 * After that, all subsequent buffer unregistrations must use the atomic ref
1058 * since they may be releasing the last reference.
1059 *
1060 * Reference Lifecycle:
1061 * --------------------
1062 * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1063 *
1064 * 2. During IO processing:
1065 * - On-task buffer reg: task_registered_buffers++ (no ref change)
1066 * - Off-task buffer reg: ref++ via ublk_get_req_ref()
1067 * - Buffer unregister callback (ublk_io_release):
1068 * * If on-task: task_registered_buffers--
1069 * * If off-task: ref-- via ublk_put_req_ref()
1070 *
1071 * 3. ublk_sub_req_ref() at IO completion:
1072 * - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1073 * - Subtracts sub_refs from ref and zeroes task_registered_buffers
1074 * - This effectively collapses task_registered_buffers into the atomic ref,
1075 * accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1076 * buffers that were already counted
1077 *
1078 * Example (zero-copy, register on-task, unregister off-task):
1079 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1080 * - Register buffer on-task: task_registered_buffers = 1
1081 * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1082 * - Completion via ublk_sub_req_ref():
1083 * sub_refs = UBLK_REFCOUNT_INIT - 1,
1084 * ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1085 *
1086 * Example (auto buffer registration):
1087 * Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1088 *
1089 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1090 * - Buffer unregister: task_registered_buffers-- (becomes 0)
1091 * - Completion via ublk_sub_req_ref():
1092 * sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1093 *
1094 * Example (zero-copy, ublk server killed):
1095 * When daemon is killed, io_uring cleanup unregisters buffers off-task.
1096 * ublk_check_and_reset_active_ref() waits for the invariant to hold.
1097 *
1098 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1099 * - Register buffer on-task: task_registered_buffers = 1
1100 * - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1101 * ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1102 * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1103 * - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by
1104 * ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1105 * and abort pending requests
1106 *
1107 * Batch IO Special Case:
1108 * ----------------------
1109 * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1110 * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1111 * task_registered_buffers counter still tracks registered buffers for the
1112 * invariant check, even though the callback doesn't decrement it.
1113 *
1114 * Note: updating task_registered_buffers is protected by io->lock.
1115 */
1116 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1117 struct ublk_io *io)
1118 {
1119 if (ublk_need_req_ref(ubq))
1120 refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1121 }
1122
1123 static inline bool ublk_get_req_ref(struct ublk_io *io)
1124 {
1125 return refcount_inc_not_zero(&io->ref);
1126 }
1127
1128 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1129 {
1130 if (!refcount_dec_and_test(&io->ref))
1131 return;
1132
1133 /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1134 __ublk_complete_rq(req, io, false, NULL);
1135 }
1136
1137 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1138 {
1139 unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1140
1141 io->task_registered_buffers = 0;
1142 return refcount_sub_and_test(sub_refs, &io->ref);
1143 }
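/*
 * Small numeric sketch of the three helpers above (values illustrative):
 * after ublk_init_req_ref(), ref == UBLK_REFCOUNT_INIT. If the server
 * registers one buffer on the daemon task (task_registered_buffers == 1)
 * and later unregisters it on the same task (back to 0), then at commit
 * time ublk_sub_req_ref() subtracts UBLK_REFCOUNT_INIT - 0, so ref drops
 * straight to 0 and the request can be completed.
 */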
1144
1145 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1146 {
1147 return ubq->flags & UBLK_F_NEED_GET_DATA;
1148 }
1149
1150 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1151 {
1152 return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1153 }
1154
1155 /* Called in slow path only, keep it noinline for trace purpose */
1156 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1157 {
1158 if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1159 return ub;
1160 return NULL;
1161 }
1162
1163 /* Called in slow path only, keep it noinline for trace purpose */
1164 static noinline void ublk_put_device(struct ublk_device *ub)
1165 {
1166 put_device(&ub->cdev_dev);
1167 }
1168
1169 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1170 int qid)
1171 {
1172 return dev->queues[qid];
1173 }
1174
1175 static inline bool ublk_rq_has_data(const struct request *rq)
1176 {
1177 return bio_has_data(rq->bio);
1178 }
1179
1180 static inline struct ublksrv_io_desc *
1181 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1182 {
1183 return ublk_get_queue(ub, q_id)->io_cmd_buf;
1184 }
1185
1186 static inline int __ublk_queue_cmd_buf_size(int depth)
1187 {
1188 return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1189 }
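/*
 * Worked example (assuming sizeof(struct ublksrv_io_desc) == 24 and
 * PAGE_SIZE == 4096): a queue depth of 128 needs 128 * 24 == 3072 bytes,
 * which __ublk_queue_cmd_buf_size() rounds up to one 4096-byte page.
 */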
1190
1191 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1192 {
1193 return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1194 }
1195
1196 static int ublk_max_cmd_buf_size(void)
1197 {
1198 return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1199 }
1200
1201 /*
1202 * Should I/O outstanding to the ublk server be reissued when the server
1203 * exits? If not, outstanding I/O will get errors.
1204 */
1205 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1206 {
1207 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1208 (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1209 }
1210
1211 /*
1212 * Should I/O issued while there is no ublk server be queued? If not, I/O
1213 * issued while there is no ublk server will get errors.
1214 */
1215 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1216 {
1217 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1218 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1219 }
1220
1221 /*
1222 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1223 * of the device flags for smaller cache footprint - better for fast
1224 * paths.
1225 */
1226 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1227 {
1228 return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1229 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1230 }
1231
1232 /*
1233 * Should ublk devices be stopped (i.e. no recovery possible) when the
1234 * ublk server exits? If not, devices can be used again by a future
1235 * incarnation of a ublk server via the start_recovery/end_recovery
1236 * commands.
1237 */
1238 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1239 {
1240 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1241 }
1242
1243 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1244 {
1245 return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1246 ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1247 }
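/*
 * Summary of the recovery behaviors encoded by the helpers above
 * (paraphrasing those checks, not an independent specification):
 *
 * - no UBLK_F_USER_RECOVERY: the device is stopped when the server exits.
 * - UBLK_F_USER_RECOVERY: the device stays around; I/O issued while no
 *   server is present is queued until recovery.
 * - ... | UBLK_F_USER_RECOVERY_REISSUE: additionally, I/O outstanding to
 *   the exiting server is reissued instead of erroring.
 * - ... | UBLK_F_USER_RECOVERY_FAIL_IO: I/O issued while no server is
 *   present fails instead of being queued.
 */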
1248
1249 static void ublk_free_disk(struct gendisk *disk)
1250 {
1251 struct ublk_device *ub = disk->private_data;
1252
1253 clear_bit(UB_STATE_USED, &ub->state);
1254 ublk_put_device(ub);
1255 }
1256
1257 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1258 unsigned int *owner_gid)
1259 {
1260 kuid_t uid;
1261 kgid_t gid;
1262
1263 current_uid_gid(&uid, &gid);
1264
1265 *owner_uid = from_kuid(&init_user_ns, uid);
1266 *owner_gid = from_kgid(&init_user_ns, gid);
1267 }
1268
1269 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1270 {
1271 struct ublk_device *ub = disk->private_data;
1272
1273 if (capable(CAP_SYS_ADMIN))
1274 return 0;
1275
1276 /*
1277 * If it is an unprivileged device, only the owner can open
1278 * the disk. Otherwise it could be a trap set by a
1279 * malicious user who deliberately grants this disk's
1280 * privileges to other users.
1281 *
1282 * This is also reasonable given that anyone can create an
1283 * unprivileged device without needing anyone else's grant.
1284 */
1285 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1286 unsigned int curr_uid, curr_gid;
1287
1288 ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1289
1290 if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1291 ub->dev_info.owner_gid)
1292 return -EPERM;
1293 }
1294
1295 if (ub->block_open)
1296 return -ENXIO;
1297
1298 return 0;
1299 }
1300
1301 static const struct block_device_operations ub_fops = {
1302 .owner = THIS_MODULE,
1303 .open = ublk_open,
1304 .free_disk = ublk_free_disk,
1305 .report_zones = ublk_report_zones,
1306 };
1307
1308 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1309 struct iov_iter *uiter, int dir, size_t *done)
1310 {
1311 unsigned len;
1312 void *bv_buf;
1313 size_t copied;
1314
1315 if (*offset >= bv->bv_len) {
1316 *offset -= bv->bv_len;
1317 return true;
1318 }
1319
1320 len = bv->bv_len - *offset;
1321 bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1322 /*
1323 * Bio pages may originate from slab caches without a usercopy region
1324 * (e.g. jbd2 frozen metadata buffers). This is the same data that
1325 * the loop driver writes to its backing file — no exposure risk.
1326 * The bvec length is always trusted, so the size check in
1327 * check_copy_size() is not needed either. Use the unchecked
1328 * helpers to avoid false positives on slab pages.
1329 */
1330 if (dir == ITER_DEST)
1331 copied = _copy_to_iter(bv_buf, len, uiter);
1332 else
1333 copied = _copy_from_iter(bv_buf, len, uiter);
1334
1335 kunmap_local(bv_buf);
1336
1337 *done += copied;
1338 if (copied < len)
1339 return false;
1340
1341 *offset = 0;
1342 return true;
1343 }
1344
1345 /*
1346 * Copy data between request pages and io_iter, and 'offset'
1347 * is the start point of linear offset of request.
1348 */
1349 static size_t ublk_copy_user_pages(const struct request *req,
1350 unsigned offset, struct iov_iter *uiter, int dir)
1351 {
1352 struct req_iterator iter;
1353 struct bio_vec bv;
1354 size_t done = 0;
1355
1356 rq_for_each_segment(bv, req, iter) {
1357 if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1358 break;
1359 }
1360 return done;
1361 }
1362
1363 #ifdef CONFIG_BLK_DEV_INTEGRITY
1364 static size_t ublk_copy_user_integrity(const struct request *req,
1365 unsigned offset, struct iov_iter *uiter, int dir)
1366 {
1367 size_t done = 0;
1368 struct bio *bio = req->bio;
1369 struct bvec_iter iter;
1370 struct bio_vec iv;
1371
1372 if (!blk_integrity_rq(req))
1373 return 0;
1374
1375 bio_for_each_integrity_vec(iv, bio, iter) {
1376 if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1377 break;
1378 }
1379
1380 return done;
1381 }
1382 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1383 static size_t ublk_copy_user_integrity(const struct request *req,
1384 unsigned offset, struct iov_iter *uiter, int dir)
1385 {
1386 return 0;
1387 }
1388 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1389
1390 static inline bool ublk_need_map_req(const struct request *req)
1391 {
1392 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1393 }
1394
1395 static inline bool ublk_need_unmap_req(const struct request *req)
1396 {
1397 return ublk_rq_has_data(req) &&
1398 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1399 }
1400
1401 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1402 const struct request *req,
1403 const struct ublk_io *io)
1404 {
1405 const unsigned int rq_bytes = blk_rq_bytes(req);
1406
1407 if (!ublk_need_map_io(ubq))
1408 return rq_bytes;
1409
1410 /*
1411 * No zero copy: we delay copying WRITE request data into the ublksrv
1412 * context, and the big benefit is that pinning pages in the current
1413 * context is pretty fast, see ublk_pin_user_pages
1414 */
1415 if (ublk_need_map_req(req)) {
1416 struct iov_iter iter;
1417 const int dir = ITER_DEST;
1418
1419 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1420 return ublk_copy_user_pages(req, 0, &iter, dir);
1421 }
1422 return rq_bytes;
1423 }
1424
1425 static unsigned int ublk_unmap_io(bool need_map,
1426 const struct request *req,
1427 const struct ublk_io *io)
1428 {
1429 const unsigned int rq_bytes = blk_rq_bytes(req);
1430
1431 if (!need_map)
1432 return rq_bytes;
1433
1434 if (ublk_need_unmap_req(req)) {
1435 struct iov_iter iter;
1436 const int dir = ITER_SOURCE;
1437
1438 WARN_ON_ONCE(io->res > rq_bytes);
1439
1440 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1441 return ublk_copy_user_pages(req, 0, &iter, dir);
1442 }
1443 return rq_bytes;
1444 }
1445
1446 static inline unsigned int ublk_req_build_flags(struct request *req)
1447 {
1448 unsigned flags = 0;
1449
1450 if (req->cmd_flags & REQ_FAILFAST_DEV)
1451 flags |= UBLK_IO_F_FAILFAST_DEV;
1452
1453 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1454 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1455
1456 if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1457 flags |= UBLK_IO_F_FAILFAST_DRIVER;
1458
1459 if (req->cmd_flags & REQ_META)
1460 flags |= UBLK_IO_F_META;
1461
1462 if (req->cmd_flags & REQ_FUA)
1463 flags |= UBLK_IO_F_FUA;
1464
1465 if (req->cmd_flags & REQ_NOUNMAP)
1466 flags |= UBLK_IO_F_NOUNMAP;
1467
1468 if (req->cmd_flags & REQ_SWAP)
1469 flags |= UBLK_IO_F_SWAP;
1470
1471 if (blk_integrity_rq(req))
1472 flags |= UBLK_IO_F_INTEGRITY;
1473
1474 return flags;
1475 }
1476
1477 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1478 {
1479 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1480 struct ublk_io *io = &ubq->ios[req->tag];
1481 u32 ublk_op;
1482
1483 switch (req_op(req)) {
1484 case REQ_OP_READ:
1485 ublk_op = UBLK_IO_OP_READ;
1486 break;
1487 case REQ_OP_WRITE:
1488 ublk_op = UBLK_IO_OP_WRITE;
1489 break;
1490 case REQ_OP_FLUSH:
1491 ublk_op = UBLK_IO_OP_FLUSH;
1492 break;
1493 case REQ_OP_DISCARD:
1494 ublk_op = UBLK_IO_OP_DISCARD;
1495 break;
1496 case REQ_OP_WRITE_ZEROES:
1497 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1498 break;
1499 default:
1500 if (ublk_queue_is_zoned(ubq))
1501 return ublk_setup_iod_zoned(ubq, req);
1502 return BLK_STS_IOERR;
1503 }
1504
1505 /* need to translate since kernel may change */
1506 iod->op_flags = ublk_op | ublk_req_build_flags(req);
1507 iod->nr_sectors = blk_rq_sectors(req);
1508 iod->start_sector = blk_rq_pos(req);
1509
1510 /* Try shmem zero-copy match before setting addr */
1511 if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
1512 u32 buf_idx, buf_off;
1513
1514 if (ublk_try_buf_match(ubq->dev, req,
1515 &buf_idx, &buf_off)) {
1516 iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
1517 iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
1518 return BLK_STS_OK;
1519 }
1520 }
1521
1522 iod->addr = io->buf.addr;
1523
1524 return BLK_STS_OK;
1525 }
1526
1527 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1528 struct io_uring_cmd *ioucmd)
1529 {
1530 return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1531 }
1532
1533 static void ublk_end_request(struct request *req, blk_status_t error)
1534 {
1535 local_bh_disable();
1536 blk_mq_end_request(req, error);
1537 local_bh_enable();
1538 }
1539
1540 /* todo: handle partial completion */
1541 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1542 bool need_map, struct io_comp_batch *iob)
1543 {
1544 unsigned int unmapped_bytes;
1545 blk_status_t res = BLK_STS_OK;
1546 bool requeue;
1547
1548 /* fail READ IO if nothing is read */
1549 if (!io->res && req_op(req) == REQ_OP_READ)
1550 io->res = -EIO;
1551
1552 if (io->res < 0) {
1553 res = errno_to_blk_status(io->res);
1554 goto exit;
1555 }
1556
1557 /*
1558 * FLUSH, DISCARD and WRITE_ZEROES usually won't return a byte count, so end
1559 * them directly.
1560 *
1561 * None of them need unmapping either.
1562 */
1563 if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1564 req_op(req) != REQ_OP_DRV_IN)
1565 goto exit;
1566
1567 /* shmem zero copy: no data to unmap, pages already shared */
1568 if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
1569 goto exit;
1570
1571 /* for READ request, writing data in iod->addr to rq buffers */
1572 unmapped_bytes = ublk_unmap_io(need_map, req, io);
1573
1574 /*
1575 * Extremely unlikely since we got the data filled in just before
1576 *
1577 * Re-read simply for this unlikely case.
1578 */
1579 if (unlikely(unmapped_bytes < io->res))
1580 io->res = unmapped_bytes;
1581
1582 /*
1583 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1584 * happens off this path, then that will prevent ublk's blkdev_release()
1585 * from being called on current's task work, see fput() implementation.
1586 *
1587 * Otherwise, ublk server may not provide forward progress in case of
1588 * reading the partition table from bdev_open() with disk->open_mutex
1589 * held, and causes dead lock as we could already be holding
1590 * disk->open_mutex here.
1591 *
1592 * Preferably we would not be doing IO with a mutex held that is also
1593 * used for release, but this work-around will suffice for now.
1594 */
1595 local_bh_disable();
1596 requeue = blk_update_request(req, BLK_STS_OK, io->res);
1597 local_bh_enable();
1598 if (requeue)
1599 blk_mq_requeue_request(req, true);
1600 else if (likely(!blk_should_fake_timeout(req->q))) {
1601 if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1602 return;
1603 __blk_mq_end_request(req, BLK_STS_OK);
1604 }
1605
1606 return;
1607 exit:
1608 ublk_end_request(req, res);
1609 }
1610
1611 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1612 struct request *req)
1613 {
1614 /* read cmd first because req will overwrite it */
1615 struct io_uring_cmd *cmd = io->cmd;
1616
1617 /* mark this cmd owned by ublksrv */
1618 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1619
1620 /*
1621 * clear ACTIVE since we are done with this sqe/cmd slot
1622 * We can only accept io cmd in case of being not active.
1623 */
1624 io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1625
1626 io->req = req;
1627 return cmd;
1628 }
1629
1630 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1631 int res, unsigned issue_flags)
1632 {
1633 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1634
1635 /* tell ublksrv one io request is coming */
1636 io_uring_cmd_done(cmd, res, issue_flags);
1637 }
1638
1639 #define UBLK_REQUEUE_DELAY_MS 3
1640
1641 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1642 struct request *rq)
1643 {
1644 /* We cannot process this rq so just requeue it. */
1645 if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1646 blk_mq_requeue_request(rq, false);
1647 else
1648 ublk_end_request(rq, BLK_STS_IOERR);
1649 }
1650
1651 static void
1652 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1653 {
1654 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1655
1656 iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1657 }
1658
1659 enum auto_buf_reg_res {
1660 AUTO_BUF_REG_FAIL,
1661 AUTO_BUF_REG_FALLBACK,
1662 AUTO_BUF_REG_OK,
1663 };
1664
1665 /*
1666 * Setup io state after auto buffer registration.
1667 *
1668 * Must be called after ublk_auto_buf_register() is done.
1669 * Caller must hold io->lock in batch context.
1670 */
1671 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1672 struct request *req, struct ublk_io *io,
1673 struct io_uring_cmd *cmd,
1674 enum auto_buf_reg_res res)
1675 {
1676 if (res == AUTO_BUF_REG_OK) {
1677 io->task_registered_buffers = 1;
1678 io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1679 io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1680 }
1681 ublk_init_req_ref(ubq, io);
1682 __ublk_prep_compl_io_cmd(io, req);
1683 }
1684
1685 /* Register request bvec to io_uring for auto buffer registration. */
1686 static enum auto_buf_reg_res
1687 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1688 struct ublk_io *io, struct io_uring_cmd *cmd,
1689 unsigned int issue_flags)
1690 {
1691 int ret;
1692
1693 ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1694 io->buf.auto_reg.index, issue_flags);
1695 if (ret) {
1696 if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1697 ublk_auto_buf_reg_fallback(ubq, req->tag);
1698 return AUTO_BUF_REG_FALLBACK;
1699 }
1700 ublk_end_request(req, BLK_STS_IOERR);
1701 return AUTO_BUF_REG_FAIL;
1702 }
1703
1704 return AUTO_BUF_REG_OK;
1705 }
1706
1707 /*
1708 * Dispatch IO to userspace with auto buffer registration.
1709 *
1710 * Only called in non-batch context from task work, io->lock not held.
1711 */
1712 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1713 struct request *req, struct ublk_io *io,
1714 struct io_uring_cmd *cmd,
1715 unsigned int issue_flags)
1716 {
1717 enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1718 issue_flags);
1719
1720 if (res != AUTO_BUF_REG_FAIL) {
1721 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1722 io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1723 }
1724 }
1725
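/*
 * Map request data for handling by the ublk server. Returns false if
 * nothing could be mapped and the request has been requeued; on a partial
 * mapping the io descriptor is trimmed down to the mapped size.
 */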
1726 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1727 struct ublk_io *io)
1728 {
1729 unsigned mapped_bytes;
1730
1731 /* shmem zero copy: skip data copy, pages already shared */
1732 if (ublk_iod_is_shmem_zc(ubq, req->tag))
1733 return true;
1734
1735 mapped_bytes = ublk_map_io(ubq, req, io);
1736
1737 /* partially mapped, update io descriptor */
1738 if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1739 /*
1740 * Nothing mapped, retry until we succeed.
1741 *
1742 * We may never succeed in mapping any bytes here because
1743 * of OOM. TODO: reserve one buffer with a single pinned page
1744 * to provide a forward progress guarantee.
1745 */
1746 if (unlikely(!mapped_bytes)) {
1747 blk_mq_requeue_request(req, false);
1748 blk_mq_delay_kick_requeue_list(req->q,
1749 UBLK_REQUEUE_DELAY_MS);
1750 return false;
1751 }
1752
1753 ublk_get_iod(ubq, req->tag)->nr_sectors =
1754 mapped_bytes >> 9;
1755 }
1756
1757 return true;
1758 }
1759
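/*
 * Per-request dispatch running in the daemon's task work (non-batch mode):
 * abort the request if the daemon task is exiting, ask the server for
 * UBLK_IO_NEED_GET_DATA first when required, map the data, then either
 * auto-register the buffer or complete the fetch uring_cmd with
 * UBLK_IO_RES_OK.
 */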
1760 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1761 {
1762 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1763 int tag = req->tag;
1764 struct ublk_io *io = &ubq->ios[tag];
1765
1766 pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1767 __func__, ubq->q_id, req->tag, io->flags,
1768 ublk_get_iod(ubq, req->tag)->addr);
1769
1770 /*
1771 * Task is exiting if either:
1772 *
1773 * (1) current != io->task.
1774 * io_uring_cmd_complete_in_task() tries to run task_work
1775 * in a workqueue if cmd's task is PF_EXITING.
1776 *
1777 * (2) current->flags & PF_EXITING.
1778 */
1779 if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1780 __ublk_abort_rq(ubq, req);
1781 return;
1782 }
1783
1784 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1785 /*
1786 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1787 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1788 * and notify it.
1789 */
1790 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1791 pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1792 __func__, ubq->q_id, req->tag, io->flags);
1793 ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1794 issue_flags);
1795 return;
1796 }
1797
1798 if (!ublk_start_io(ubq, req, io))
1799 return;
1800
1801 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1802 ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1803 } else {
1804 ublk_init_req_ref(ubq, io);
1805 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1806 }
1807 }
1808
1809 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1810 const struct ublk_batch_io_data *data,
1811 unsigned short tag)
1812 {
1813 struct ublk_device *ub = data->ub;
1814 struct ublk_io *io = &ubq->ios[tag];
1815 struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1816 enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1817 struct io_uring_cmd *cmd = data->cmd;
1818
1819 if (!ublk_start_io(ubq, req, io))
1820 return false;
1821
1822 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1823 res = ublk_auto_buf_register(ubq, req, io, cmd,
1824 data->issue_flags);
1825
1826 if (res == AUTO_BUF_REG_FAIL)
1827 return false;
1828 }
1829
1830 ublk_io_lock(io);
1831 ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1832 ublk_io_unlock(io);
1833
1834 return true;
1835 }
1836
1837 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1838 const struct ublk_batch_io_data *data,
1839 unsigned short *tag_buf,
1840 unsigned int len)
1841 {
1842 bool has_unused = false;
1843 unsigned int i;
1844
1845 for (i = 0; i < len; i++) {
1846 unsigned short tag = tag_buf[i];
1847
1848 if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1849 tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1850 has_unused = true;
1851 }
1852 }
1853
1854 return has_unused;
1855 }
1856
1857 /*
1858 * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1859 * Returns the new length after filtering.
1860 */
1861 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1862 unsigned int len)
1863 {
1864 unsigned int i, j;
1865
1866 for (i = 0, j = 0; i < len; i++) {
1867 if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1868 if (i != j)
1869 tag_buf[j] = tag_buf[i];
1870 j++;
1871 }
1872 }
1873
1874 return j;
1875 }
1876
1877 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1878 const struct ublk_batch_io_data *data,
1879 unsigned short *tag_buf, size_t len, int ret)
1880 {
1881 int i, res;
1882
1883 /*
1884 * Undo prep state for all IOs since userspace never received them.
1885 * This restores IOs to pre-prepared state so they can be cleanly
1886 * re-prepared when tags are pulled from FIFO again.
1887 */
1888 for (i = 0; i < len; i++) {
1889 struct ublk_io *io = &ubq->ios[tag_buf[i]];
1890 int index = -1;
1891
1892 ublk_io_lock(io);
1893 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1894 index = io->buf.auto_reg.index;
1895 io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1896 io->flags |= UBLK_IO_FLAG_ACTIVE;
1897 ublk_io_unlock(io);
1898
1899 if (index != -1)
1900 io_buffer_unregister_bvec(data->cmd, index,
1901 data->issue_flags);
1902 }
1903
1904 res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1905 tag_buf, len, &ubq->evts_lock);
1906
1907 pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1908 "tags(%d %zu) ret %d\n", __func__, res, len,
1909 ret);
1910 }
1911
1912 #define MAX_NR_TAG 128
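/*
 * Dispatch one batch of queued tags to the ublk server: select a provided
 * buffer from the fetch command's buffer group, pull up to MAX_NR_TAG tags
 * from the event fifo, prepare each request, drop tags whose preparation
 * failed, then copy the surviving tags into the buffer and post a multishot
 * CQE. On copy/CQE failure the prepared tags are rolled back and pushed
 * back to the fifo by ublk_batch_dispatch_fail().
 */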
1913 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1914 const struct ublk_batch_io_data *data,
1915 struct ublk_batch_fetch_cmd *fcmd)
1916 {
1917 const unsigned int tag_sz = sizeof(unsigned short);
1918 unsigned short tag_buf[MAX_NR_TAG];
1919 struct io_br_sel sel;
1920 size_t len = 0;
1921 bool needs_filter;
1922 int ret;
1923
1924 WARN_ON_ONCE(data->cmd != fcmd->cmd);
1925
1926 sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1927 data->issue_flags);
1928 if (sel.val < 0)
1929 return sel.val;
1930 if (!sel.addr)
1931 return -ENOBUFS;
1932
1933 /* single reader needn't lock; each kfifo element is 2 bytes */
1934 len = min(len, sizeof(tag_buf)) / tag_sz;
1935 len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1936
1937 needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1938 /* Filter out unused tags before posting to userspace */
1939 if (unlikely(needs_filter)) {
1940 int new_len = ublk_filter_unused_tags(tag_buf, len);
1941
1942 /* all tags failed or were requeued; still return the consumed length */
1943 if (!new_len) {
1944 /* release the selected buffer */
1945 sel.val = 0;
1946 WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1947 &sel, data->issue_flags));
1948 return len;
1949 }
1950 len = new_len;
1951 }
1952
1953 sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1954 ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1955 if (unlikely(ret < 0))
1956 ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1957 return ret;
1958 }
1959
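/*
 * Try to claim an idle fetch command for dispatching queued tags. Returns
 * NULL if another context already owns the active fetch command or none is
 * queued; otherwise the first queued fetch command becomes the active one.
 */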
1960 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1961 struct ublk_queue *ubq)
1962 {
1963 struct ublk_batch_fetch_cmd *fcmd;
1964
1965 lockdep_assert_held(&ubq->evts_lock);
1966
1967 /*
1968 * Order updating ubq->evts_fifo against checking ubq->active_fcmd.
1969 *
1970 * The pairing barrier is the smp_mb() in ublk_batch_dispatch().
1971 *
1972 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
1973 * are guaranteed to be visible in ublk_batch_dispatch() by the barrier pairing.
1974 */
1975 smp_mb();
1976 if (READ_ONCE(ubq->active_fcmd)) {
1977 fcmd = NULL;
1978 } else {
1979 fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1980 struct ublk_batch_fetch_cmd, node);
1981 WRITE_ONCE(ubq->active_fcmd, fcmd);
1982 }
1983 return fcmd;
1984 }
1985
1986 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1987 {
1988 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1989 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1990 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1991 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
1992 struct ublk_batch_io_data data = {
1993 .ub = pdu->ubq->dev,
1994 .cmd = fcmd->cmd,
1995 .issue_flags = issue_flags,
1996 };
1997
1998 WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
1999
2000 ublk_batch_dispatch(pdu->ubq, &data, fcmd);
2001 }
2002
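/*
 * Drain the event fifo with the claimed fetch command, then release it and
 * re-check for tags queued meanwhile. If the same command can be claimed
 * again, keep draining (bounded to 32 rounds to avoid lockup); otherwise
 * punt the newly claimed command to task work.
 */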
2003 static void
2004 ublk_batch_dispatch(struct ublk_queue *ubq,
2005 const struct ublk_batch_io_data *data,
2006 struct ublk_batch_fetch_cmd *fcmd)
2007 {
2008 struct ublk_batch_fetch_cmd *new_fcmd;
2009 unsigned tried = 0;
2010 int ret = 0;
2011
2012 again:
2013 while (!ublk_io_evts_empty(ubq)) {
2014 ret = __ublk_batch_dispatch(ubq, data, fcmd);
2015 if (ret <= 0)
2016 break;
2017 }
2018
2019 if (ret < 0) {
2020 ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
2021 return;
2022 }
2023
2024 __ublk_release_fcmd(ubq);
2025 /*
2026 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
2027 * checking ubq->evts_fifo.
2028 *
2029 * The pair is the smp_mb() in __ublk_acquire_fcmd().
2030 */
2031 smp_mb();
2032 if (likely(ublk_io_evts_empty(ubq)))
2033 return;
2034
2035 spin_lock(&ubq->evts_lock);
2036 new_fcmd = __ublk_acquire_fcmd(ubq);
2037 spin_unlock(&ubq->evts_lock);
2038
2039 if (!new_fcmd)
2040 return;
2041
2042 /* Avoid lockup by handling at most 32 batches in one go */
2043 if (new_fcmd == fcmd && tried++ < 32)
2044 goto again;
2045
2046 io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
2047 }
2048
2049 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2050 {
2051 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2052 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2053 struct ublk_queue *ubq = pdu->ubq;
2054
2055 ublk_dispatch_req(ubq, pdu->req);
2056 }
2057
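/*
 * Queue one request tag into the per-queue event fifo. For the last
 * request of a plug, try to claim a fetch command and schedule batch
 * dispatch from task work.
 */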
2058 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2059 {
2060 unsigned short tag = rq->tag;
2061 struct ublk_batch_fetch_cmd *fcmd = NULL;
2062
2063 spin_lock(&ubq->evts_lock);
2064 kfifo_put(&ubq->evts_fifo, tag);
2065 if (last)
2066 fcmd = __ublk_acquire_fcmd(ubq);
2067 spin_unlock(&ubq->evts_lock);
2068
2069 if (fcmd)
2070 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2071 }
2072
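/*
 * Non-batch path: stash the request in the fetch uring_cmd's pdu and run
 * ublk_dispatch_req() from the daemon's task work.
 */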
2073 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2074 {
2075 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2076 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2077
2078 pdu->req = rq;
2079 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2080 }
2081
2082 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2083 {
2084 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2085 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2086 struct request *rq = pdu->req_list;
2087 struct request *next;
2088
2089 do {
2090 next = rq->rq_next;
2091 rq->rq_next = NULL;
2092 ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2093 rq = next;
2094 } while (rq);
2095 }
2096
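/*
 * Hand a whole request list over with a single task-work invocation; the
 * list is built in ublk_queue_rqs() from requests that share the same
 * io_uring context and daemon task.
 */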
2097 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2098 {
2099 struct io_uring_cmd *cmd = io->cmd;
2100 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2101
2102 pdu->req_list = rq_list_peek(l);
2103 rq_list_init(l);
2104 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2105 }
2106
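/*
 * Request timeout handling: for unprivileged devices the ublk server is
 * killed via SIGKILL so that I/O doesn't hang forever on an unresponsive
 * daemon; privileged devices simply get their timer reset.
 */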
2107 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2108 {
2109 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2110 pid_t tgid = ubq->dev->ublksrv_tgid;
2111 struct task_struct *p;
2112 struct pid *pid;
2113
2114 if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2115 return BLK_EH_RESET_TIMER;
2116
2117 if (unlikely(!tgid))
2118 return BLK_EH_RESET_TIMER;
2119
2120 rcu_read_lock();
2121 pid = find_vpid(tgid);
2122 p = pid_task(pid, PIDTYPE_PID);
2123 if (p)
2124 send_sig(SIGKILL, p, 0);
2125 rcu_read_unlock();
2126 return BLK_EH_DONE;
2127 }
2128
2129 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2130 bool check_cancel)
2131 {
2132 blk_status_t res;
2133
2134 if (unlikely(READ_ONCE(ubq->fail_io)))
2135 return BLK_STS_TARGET;
2136
2137 /* With recovery feature enabled, force_abort is set in
2138 * ublk_stop_dev() before calling del_gendisk(). We have to
2139 * abort all requeued and new rqs here to let del_gendisk()
2140 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
2141 * to avoid UAF on io_uring ctx.
2142 *
2143 * Note: force_abort is guaranteed to be seen because it is set
2144 * before request queue is unquiesced.
2145 */
2146 if (ublk_nosrv_should_queue_io(ubq) &&
2147 unlikely(READ_ONCE(ubq->force_abort)))
2148 return BLK_STS_IOERR;
2149
2150 if (check_cancel && unlikely(ubq->canceling))
2151 return BLK_STS_IOERR;
2152
2153 /* fill iod to slot in io cmd buffer */
2154 res = ublk_setup_iod(ubq, rq);
2155 if (unlikely(res != BLK_STS_OK))
2156 return BLK_STS_IOERR;
2157
2158 blk_mq_start_request(rq);
2159 return BLK_STS_OK;
2160 }
2161
2162 /*
2163 * Common helper for queue_rq that handles request preparation and
2164 * cancellation checks. Returns status and sets should_queue to indicate
2165 * whether the caller should proceed with queuing the request.
2166 */
2167 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2168 struct request *rq,
2169 bool *should_queue)
2170 {
2171 blk_status_t res;
2172
2173 res = ublk_prep_req(ubq, rq, false);
2174 if (res != BLK_STS_OK) {
2175 *should_queue = false;
2176 return res;
2177 }
2178
2179 /*
2180 * ->canceling has to be handled after ->force_abort and ->fail_io
2181 * are dealt with, otherwise this request may not be failed in case
2182 * of recovery, causing a hang when deleting the disk
2183 */
2184 if (unlikely(ubq->canceling)) {
2185 *should_queue = false;
2186 __ublk_abort_rq(ubq, rq);
2187 return BLK_STS_OK;
2188 }
2189
2190 *should_queue = true;
2191 return BLK_STS_OK;
2192 }
2193
2194 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2195 const struct blk_mq_queue_data *bd)
2196 {
2197 struct ublk_queue *ubq = hctx->driver_data;
2198 struct request *rq = bd->rq;
2199 bool should_queue;
2200 blk_status_t res;
2201
2202 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2203 if (!should_queue)
2204 return res;
2205
2206 ublk_queue_cmd(ubq, rq);
2207 return BLK_STS_OK;
2208 }
2209
2210 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2211 const struct blk_mq_queue_data *bd)
2212 {
2213 struct ublk_queue *ubq = hctx->driver_data;
2214 struct request *rq = bd->rq;
2215 bool should_queue;
2216 blk_status_t res;
2217
2218 res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2219 if (!should_queue)
2220 return res;
2221
2222 ublk_batch_queue_cmd(ubq, rq, bd->last);
2223 return BLK_STS_OK;
2224 }
2225
2226 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2227 const struct ublk_io *io2)
2228 {
2229 return (io_uring_cmd_ctx_handle(io->cmd) ==
2230 io_uring_cmd_ctx_handle(io2->cmd)) &&
2231 (io->task == io2->task);
2232 }
2233
2234 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2235 {
2236 struct ublk_queue *ubq = hctx->driver_data;
2237 struct ublk_batch_fetch_cmd *fcmd;
2238
2239 spin_lock(&ubq->evts_lock);
2240 fcmd = __ublk_acquire_fcmd(ubq);
2241 spin_unlock(&ubq->evts_lock);
2242
2243 if (fcmd)
2244 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2245 }
2246
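/*
 * ->queue_rqs() path: group consecutive requests that can share one batch
 * (same io_uring context and daemon task) and submit each group with a
 * single task-work; requests that fail preparation go to the requeue list.
 */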
2247 static void ublk_queue_rqs(struct rq_list *rqlist)
2248 {
2249 struct rq_list requeue_list = { };
2250 struct rq_list submit_list = { };
2251 struct ublk_io *io = NULL;
2252 struct request *req;
2253
2254 while ((req = rq_list_pop(rqlist))) {
2255 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2256 struct ublk_io *this_io = &this_q->ios[req->tag];
2257
2258 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2259 rq_list_add_tail(&requeue_list, req);
2260 continue;
2261 }
2262
2263 if (io && !ublk_belong_to_same_batch(io, this_io) &&
2264 !rq_list_empty(&submit_list))
2265 ublk_queue_cmd_list(io, &submit_list);
2266 io = this_io;
2267 rq_list_add_tail(&submit_list, req);
2268 }
2269
2270 if (!rq_list_empty(&submit_list))
2271 ublk_queue_cmd_list(io, &submit_list);
2272 *rqlist = requeue_list;
2273 }
2274
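/*
 * Push all tags of the request list into the event fifo (in MAX_NR_TAG
 * chunks) under evts_lock, then try to claim a fetch command to dispatch
 * them from task work.
 */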
2275 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2276 {
2277 unsigned short tags[MAX_NR_TAG];
2278 struct ublk_batch_fetch_cmd *fcmd;
2279 struct request *rq;
2280 unsigned cnt = 0;
2281
2282 spin_lock(&ubq->evts_lock);
2283 rq_list_for_each(l, rq) {
2284 tags[cnt++] = (unsigned short)rq->tag;
2285 if (cnt >= MAX_NR_TAG) {
2286 kfifo_in(&ubq->evts_fifo, tags, cnt);
2287 cnt = 0;
2288 }
2289 }
2290 if (cnt)
2291 kfifo_in(&ubq->evts_fifo, tags, cnt);
2292 fcmd = __ublk_acquire_fcmd(ubq);
2293 spin_unlock(&ubq->evts_lock);
2294
2295 rq_list_init(l);
2296 if (fcmd)
2297 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2298 }
2299
2300 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2301 {
2302 struct rq_list requeue_list = { };
2303 struct rq_list submit_list = { };
2304 struct ublk_queue *ubq = NULL;
2305 struct request *req;
2306
2307 while ((req = rq_list_pop(rqlist))) {
2308 struct ublk_queue *this_q = req->mq_hctx->driver_data;
2309
2310 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2311 rq_list_add_tail(&requeue_list, req);
2312 continue;
2313 }
2314
2315 if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2316 ublk_batch_queue_cmd_list(ubq, &submit_list);
2317 ubq = this_q;
2318 rq_list_add_tail(&submit_list, req);
2319 }
2320
2321 if (!rq_list_empty(&submit_list))
2322 ublk_batch_queue_cmd_list(ubq, &submit_list);
2323 *rqlist = requeue_list;
2324 }
2325
2326 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2327 unsigned int hctx_idx)
2328 {
2329 struct ublk_device *ub = driver_data;
2330 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2331
2332 hctx->driver_data = ubq;
2333 return 0;
2334 }
2335
2336 static const struct blk_mq_ops ublk_mq_ops = {
2337 .queue_rq = ublk_queue_rq,
2338 .queue_rqs = ublk_queue_rqs,
2339 .init_hctx = ublk_init_hctx,
2340 .timeout = ublk_timeout,
2341 };
2342
2343 static const struct blk_mq_ops ublk_batch_mq_ops = {
2344 .commit_rqs = ublk_commit_rqs,
2345 .queue_rq = ublk_batch_queue_rq,
2346 .queue_rqs = ublk_batch_queue_rqs,
2347 .init_hctx = ublk_init_hctx,
2348 .timeout = ublk_timeout,
2349 };
2350
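/*
 * Reset per-queue io state after the ublk server has gone away, so that a
 * new server (e.g. user recovery) can FETCH all io commands again.
 */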
2351 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2352 {
2353 int i;
2354
2355 ubq->nr_io_ready = 0;
2356
2357 for (i = 0; i < ubq->q_depth; i++) {
2358 struct ublk_io *io = &ubq->ios[i];
2359
2360 /*
2361 * UBLK_IO_FLAG_CANCELED is kept to avoid touching
2362 * io->cmd
2363 */
2364 io->flags &= UBLK_IO_FLAG_CANCELED;
2365 io->cmd = NULL;
2366 io->buf.addr = 0;
2367
2368 /*
2369 * old task is PF_EXITING, put it now
2370 *
2371 * It could be NULL in case of closing one quiesced
2372 * device.
2373 */
2374 if (io->task) {
2375 put_task_struct(io->task);
2376 io->task = NULL;
2377 }
2378
2379 WARN_ON_ONCE(refcount_read(&io->ref));
2380 WARN_ON_ONCE(io->task_registered_buffers);
2381 }
2382 }
2383
2384 static int ublk_ch_open(struct inode *inode, struct file *filp)
2385 {
2386 struct ublk_device *ub = container_of(inode->i_cdev,
2387 struct ublk_device, cdev);
2388
2389 if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2390 return -EBUSY;
2391 filp->private_data = ub;
2392 ub->ublksrv_tgid = current->tgid;
2393 return 0;
2394 }
2395
2396 static void ublk_reset_ch_dev(struct ublk_device *ub)
2397 {
2398 int i;
2399
2400 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2401 ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2402
2403 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2404 ub->mm = NULL;
2405 ub->nr_queue_ready = 0;
2406 ub->unprivileged_daemons = false;
2407 ub->ublksrv_tgid = -1;
2408 }
2409
2410 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2411 {
2412 struct gendisk *disk;
2413
2414 spin_lock(&ub->lock);
2415 disk = ub->ub_disk;
2416 if (disk)
2417 get_device(disk_to_dev(disk));
2418 spin_unlock(&ub->lock);
2419
2420 return disk;
2421 }
2422
2423 static void ublk_put_disk(struct gendisk *disk)
2424 {
2425 if (disk)
2426 put_device(disk_to_dev(disk));
2427 }
2428
2429 static void ublk_partition_scan_work(struct work_struct *work)
2430 {
2431 struct ublk_device *ub =
2432 container_of(work, struct ublk_device, partition_scan_work);
2433 /* Hold disk reference to prevent UAF during concurrent teardown */
2434 struct gendisk *disk = ublk_get_disk(ub);
2435
2436 if (!disk)
2437 return;
2438
2439 if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2440 &disk->state)))
2441 goto out;
2442
2443 mutex_lock(&disk->open_mutex);
2444 bdev_disk_changed(disk, false);
2445 mutex_unlock(&disk->open_mutex);
2446 out:
2447 ublk_put_disk(disk);
2448 }
2449
2450 /*
2451 * Use this function to ensure that ->canceling is consistently set for
2452 * the device and all queues. Do not set these flags directly.
2453 *
2454 * Caller must ensure that:
2455 * - cancel_mutex is held. This ensures that there is no concurrent
2456 * access to ub->canceling and no concurrent writes to ubq->canceling.
2457 * - there are no concurrent reads of ubq->canceling from the queue_rq
2458 * path. This can be done by quiescing the queue, or through other
2459 * means.
2460 */
2461 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2462 __must_hold(&ub->cancel_mutex)
2463 {
2464 int i;
2465
2466 ub->canceling = canceling;
2467 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2468 ublk_get_queue(ub, i)->canceling = canceling;
2469 }
2470
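/*
 * Return true if any io still holds an active reference (for instance a
 * registered zero-copy buffer that hasn't been unregistered yet); otherwise
 * reset the reference state so that the release work can proceed.
 */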
2471 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2472 {
2473 int i, j;
2474
2475 if (!ublk_dev_need_req_ref(ub))
2476 return false;
2477
2478 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2479 struct ublk_queue *ubq = ublk_get_queue(ub, i);
2480
2481 for (j = 0; j < ubq->q_depth; j++) {
2482 struct ublk_io *io = &ubq->ios[j];
2483 unsigned int refs = refcount_read(&io->ref) +
2484 io->task_registered_buffers;
2485
2486 /*
2487 * UBLK_REFCOUNT_INIT or zero means no active
2488 * reference
2489 */
2490 if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2491 return true;
2492
2493 /* reset to zero if the io hasn't active references */
2494 refcount_set(&io->ref, 0);
2495 io->task_registered_buffers = 0;
2496 }
2497 }
2498 return false;
2499 }
2500
2501 static void ublk_ch_release_work_fn(struct work_struct *work)
2502 {
2503 struct ublk_device *ub =
2504 container_of(work, struct ublk_device, exit_work.work);
2505 struct gendisk *disk;
2506 int i;
2507
2508 /*
2509 * For zero-copy and auto buffer register modes, I/O references
2510 * might not be dropped naturally when the daemon is killed, but
2511 * io_uring guarantees that registered bvec kernel buffers are
2512 * eventually unregistered when the io_uring context is freed, and the
2513 * active references are dropped then.
2514 *
2515 * Wait until active references are dropped to avoid use-after-free.
2516 *
2517 * A registered buffer may be unregistered in io_uring's release handler,
2518 * so wait by rescheduling this work function, which avoids a release
2519 * dependency between the two files.
2520 */
2521 if (ublk_check_and_reset_active_ref(ub)) {
2522 schedule_delayed_work(&ub->exit_work, 1);
2523 return;
2524 }
2525
2526 /*
2527 * disk isn't attached yet, either device isn't live, or it has
2528 * been removed already, so we needn't do anything
2529 */
2530 disk = ublk_get_disk(ub);
2531 if (!disk)
2532 goto out;
2533
2534 /*
2535 * All uring_cmd are done now, so abort any request outstanding to
2536 * the ublk server
2537 *
2538 * This can be done in a lockless way because the ublk server is
2539 * gone
2540 *
2541 * More importantly, we have to provide a forward progress guarantee
2542 * without holding ub->mutex, otherwise a control task grabbing
2543 * ub->mutex triggers deadlock
2544 *
2545 * All requests may be inflight, so ->canceling may not be set, set
2546 * it now.
2547 */
2548 mutex_lock(&ub->cancel_mutex);
2549 ublk_set_canceling(ub, true);
2550 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2551 ublk_abort_queue(ub, ublk_get_queue(ub, i));
2552 mutex_unlock(&ub->cancel_mutex);
2553 blk_mq_kick_requeue_list(disk->queue);
2554
2555 /*
2556 * All inflight requests have been completed or requeued, and any new
2557 * request will be failed or requeued via `->canceling`, so it is
2558 * fine to grab ub->mutex now.
2559 */
2560 mutex_lock(&ub->mutex);
2561
2562 /* double check after grabbing lock */
2563 if (!ub->ub_disk)
2564 goto unlock;
2565
2566 /*
2567 * Transition the device to the nosrv state. What exactly this
2568 * means depends on the recovery flags
2569 */
2570 if (ublk_nosrv_should_stop_dev(ub)) {
2571 /*
2572 * Allow any pending/future I/O to pass through quickly
2573 * with an error. This is needed because del_gendisk
2574 * waits for all pending I/O to complete
2575 */
2576 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2577 WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2578
2579 ublk_stop_dev_unlocked(ub);
2580 } else {
2581 if (ublk_nosrv_dev_should_queue_io(ub)) {
2582 /* ->canceling is set and all requests are aborted */
2583 ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2584 } else {
2585 ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2586 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2587 WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2588 }
2589 }
2590 unlock:
2591 mutex_unlock(&ub->mutex);
2592 ublk_put_disk(disk);
2593
2594 /* all uring_cmds are done now, reset device & ubq */
2595 ublk_reset_ch_dev(ub);
2596 out:
2597 clear_bit(UB_STATE_OPEN, &ub->state);
2598
2599 /* put the reference grabbed in ublk_ch_release() */
2600 ublk_put_device(ub);
2601 }
2602
2603 static int ublk_ch_release(struct inode *inode, struct file *filp)
2604 {
2605 struct ublk_device *ub = filp->private_data;
2606
2607 /*
2608 * Grab ublk device reference, so it won't be gone until we are
2609 * really released from work function.
2610 */
2611 ublk_get_device(ub);
2612
2613 INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2614 schedule_delayed_work(&ub->exit_work, 0);
2615 return 0;
2616 }
2617
2618 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2619 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2620 {
2621 struct ublk_device *ub = filp->private_data;
2622 size_t sz = vma->vm_end - vma->vm_start;
2623 unsigned max_sz = ublk_max_cmd_buf_size();
2624 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2625 int q_id, ret = 0;
2626
2627 spin_lock(&ub->lock);
2628 if (!ub->mm)
2629 ub->mm = current->mm;
2630 if (current->mm != ub->mm)
2631 ret = -EINVAL;
2632 spin_unlock(&ub->lock);
2633
2634 if (ret)
2635 return ret;
2636
2637 if (vma->vm_flags & VM_WRITE)
2638 return -EPERM;
2639
2640 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2641 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2642 return -EINVAL;
2643
2644 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2645 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2646 __func__, q_id, current->pid, vma->vm_start,
2647 phys_off, (unsigned long)sz);
2648
2649 if (sz != ublk_queue_cmd_buf_size(ub))
2650 return -EINVAL;
2651
2652 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2653 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2654 }
2655
2656 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2657 struct request *req)
2658 {
2659 WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2660 io->flags & UBLK_IO_FLAG_ACTIVE);
2661
2662 if (ublk_nosrv_should_reissue_outstanding(ub))
2663 blk_mq_requeue_request(req, false);
2664 else {
2665 io->res = -EIO;
2666 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2667 }
2668 }
2669
2670 /*
2671 * Request tags may have just been added to the event kfifo without
2672 * getting a chance to be dispatched; abort these requests too
2673 */
2674 static void ublk_abort_batch_queue(struct ublk_device *ub,
2675 struct ublk_queue *ubq)
2676 {
2677 unsigned short tag;
2678
2679 while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2680 struct request *req = blk_mq_tag_to_rq(
2681 ub->tag_set.tags[ubq->q_id], tag);
2682
2683 if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2684 __ublk_fail_req(ub, &ubq->ios[tag], req);
2685 }
2686 }
2687
2688 /*
2689 * Called from ublk char device release handler, when any uring_cmd is
2690 * done, while the request queue is "quiesced" since all inflight requests
2691 * can't be completed because ublk server is dead.
2692 *
2693 * So no one can hold our request IO reference any more, simply ignore the
2694 * reference, and complete the request immediately
2695 */
2696 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2697 {
2698 int i;
2699
2700 for (i = 0; i < ubq->q_depth; i++) {
2701 struct ublk_io *io = &ubq->ios[i];
2702
2703 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2704 __ublk_fail_req(ub, io, io->req);
2705 }
2706
2707 if (ublk_support_batch_io(ubq))
2708 ublk_abort_batch_queue(ub, ubq);
2709 }
2710
2711 static void ublk_start_cancel(struct ublk_device *ub)
2712 {
2713 struct gendisk *disk = ublk_get_disk(ub);
2714
2715 /* Our disk has been dead */
2716 if (!disk)
2717 return;
2718
2719 mutex_lock(&ub->cancel_mutex);
2720 if (ub->canceling)
2721 goto out;
2722 /*
2723 * Now we are serialized with ublk_queue_rq()
2724 *
2725 * Make sure that ubq->canceling is set when queue is frozen,
2726 * because ublk_queue_rq() has to rely on this flag to avoid touching
2727 * a completed uring_cmd
2728 */
2729 blk_mq_quiesce_queue(disk->queue);
2730 ublk_set_canceling(ub, true);
2731 blk_mq_unquiesce_queue(disk->queue);
2732 out:
2733 mutex_unlock(&ub->cancel_mutex);
2734 ublk_put_disk(disk);
2735 }
2736
2737 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2738 unsigned int issue_flags)
2739 {
2740 struct ublk_io *io = &ubq->ios[tag];
2741 struct ublk_device *ub = ubq->dev;
2742 struct request *req;
2743 bool done;
2744
2745 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2746 return;
2747
2748 /*
2749 * Don't try to cancel this command if the request has been started, to
2750 * avoid a race between io_uring_cmd_done() and
2751 * io_uring_cmd_complete_in_task().
2752 *
2753 * Either the started request will be aborted via __ublk_abort_rq(),
2754 * then this uring_cmd is canceled next time, or it will be done in
2755 * task work function ublk_dispatch_req() because io_uring guarantees
2756 * that ublk_dispatch_req() is always called
2757 */
2758 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2759 if (req && blk_mq_request_started(req) && req->tag == tag)
2760 return;
2761
2762 spin_lock(&ubq->cancel_lock);
2763 done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2764 if (!done)
2765 io->flags |= UBLK_IO_FLAG_CANCELED;
2766 spin_unlock(&ubq->cancel_lock);
2767
2768 if (!done)
2769 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
2770 }
2771
2772 /*
2773 * Cancel a batch fetch command if it hasn't been claimed by another path.
2774 *
2775 * An fcmd can only be cancelled if:
2776 * 1. It's not the active_fcmd (which is currently being processed)
2777 * 2. It's still on the list (!list_empty check) - once removed from the list,
2778 * the fcmd is considered claimed and will be freed by whoever removed it
2779 *
2780 * Use list_del_init() so subsequent list_empty() checks work correctly.
2781 */
2782 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2783 struct ublk_batch_fetch_cmd *fcmd,
2784 unsigned int issue_flags)
2785 {
2786 bool done;
2787
2788 spin_lock(&ubq->evts_lock);
2789 done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2790 if (done)
2791 list_del_init(&fcmd->node);
2792 spin_unlock(&ubq->evts_lock);
2793
2794 if (done) {
2795 io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2796 ublk_batch_free_fcmd(fcmd);
2797 }
2798 }
2799
2800 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2801 {
2802 struct ublk_batch_fetch_cmd *fcmd;
2803 LIST_HEAD(fcmd_list);
2804
2805 spin_lock(&ubq->evts_lock);
2806 ubq->force_abort = true;
2807 list_splice_init(&ubq->fcmd_head, &fcmd_list);
2808 fcmd = READ_ONCE(ubq->active_fcmd);
2809 if (fcmd)
2810 list_move(&fcmd->node, &ubq->fcmd_head);
2811 spin_unlock(&ubq->evts_lock);
2812
2813 while (!list_empty(&fcmd_list)) {
2814 fcmd = list_first_entry(&fcmd_list,
2815 struct ublk_batch_fetch_cmd, node);
2816 ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2817 }
2818 }
2819
2820 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2821 unsigned int issue_flags)
2822 {
2823 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2824 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2825 struct ublk_queue *ubq = pdu->ubq;
2826
2827 ublk_start_cancel(ubq->dev);
2828
2829 ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2830 }
2831
2832 /*
2833 * The ublk char device won't be closed when calling cancel fn, so both
2834 * ublk device and queue are guaranteed to be live
2835 *
2836 * Two-stage cancel:
2837 *
2838 * - make every active uring_cmd done in ->cancel_fn()
2839 *
2840 * - abort inflight ublk IO requests in the ublk char device release handler,
2841 * which depends on the 1st stage because the device can only be closed
2842 * after all uring_cmds are done
2843 *
2844 * Do _not_ try to acquire ub->mutex before all inflight requests are
2845 * aborted, otherwise deadlock may be caused.
2846 */
2847 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2848 unsigned int issue_flags)
2849 {
2850 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2851 struct ublk_queue *ubq = pdu->ubq;
2852 struct task_struct *task;
2853 struct ublk_io *io;
2854
2855 if (WARN_ON_ONCE(!ubq))
2856 return;
2857
2858 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2859 return;
2860
2861 task = io_uring_cmd_get_task(cmd);
2862 io = &ubq->ios[pdu->tag];
2863 if (WARN_ON_ONCE(task && task != io->task))
2864 return;
2865
2866 ublk_start_cancel(ubq->dev);
2867
2868 WARN_ON_ONCE(io->cmd != cmd);
2869 ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2870 }
2871
2872 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2873 {
2874 return ubq->nr_io_ready == ubq->q_depth;
2875 }
2876
2877 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2878 {
2879 return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2880 }
2881
2882 static void ublk_cancel_queue(struct ublk_queue *ubq)
2883 {
2884 int i;
2885
2886 if (ublk_support_batch_io(ubq)) {
2887 ublk_batch_cancel_queue(ubq);
2888 return;
2889 }
2890
2891 for (i = 0; i < ubq->q_depth; i++)
2892 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2893 }
2894
2895 /* Cancel all pending commands, must be called after del_gendisk() returns */
2896 static void ublk_cancel_dev(struct ublk_device *ub)
2897 {
2898 int i;
2899
2900 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2901 ublk_cancel_queue(ublk_get_queue(ub, i));
2902 }
2903
2904 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2905 {
2906 bool *idle = data;
2907
2908 if (blk_mq_request_started(rq)) {
2909 *idle = false;
2910 return false;
2911 }
2912 return true;
2913 }
2914
2915 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2916 {
2917 bool idle;
2918
2919 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2920 while (true) {
2921 idle = true;
2922 blk_mq_tagset_busy_iter(&ub->tag_set,
2923 ublk_check_inflight_rq, &idle);
2924 if (idle)
2925 break;
2926 msleep(UBLK_REQUEUE_DELAY_MS);
2927 }
2928 }
2929
2930 static void ublk_force_abort_dev(struct ublk_device *ub)
2931 {
2932 int i;
2933
2934 pr_devel("%s: force abort ub: dev_id %d state %s\n",
2935 __func__, ub->dev_info.dev_id,
2936 ub->dev_info.state == UBLK_S_DEV_LIVE ?
2937 "LIVE" : "QUIESCED");
2938 blk_mq_quiesce_queue(ub->ub_disk->queue);
2939 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2940 ublk_wait_tagset_rqs_idle(ub);
2941
2942 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2943 ublk_get_queue(ub, i)->force_abort = true;
2944 blk_mq_unquiesce_queue(ub->ub_disk->queue);
2945 /* We may have requeued some rqs in ublk_quiesce_queue() */
2946 blk_mq_kick_requeue_list(ub->ub_disk->queue);
2947 }
2948
2949 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2950 {
2951 struct gendisk *disk;
2952
2953 /* Sync with ublk_abort_queue() by holding the lock */
2954 spin_lock(&ub->lock);
2955 disk = ub->ub_disk;
2956 ub->dev_info.state = UBLK_S_DEV_DEAD;
2957 ub->dev_info.ublksrv_pid = -1;
2958 ub->ub_disk = NULL;
2959 spin_unlock(&ub->lock);
2960
2961 return disk;
2962 }
2963
2964 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2965 __must_hold(&ub->mutex)
2966 {
2967 struct gendisk *disk;
2968
2969 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2970 return;
2971
2972 if (ublk_nosrv_dev_should_queue_io(ub))
2973 ublk_force_abort_dev(ub);
2974 del_gendisk(ub->ub_disk);
2975 disk = ublk_detach_disk(ub);
2976 put_disk(disk);
2977 }
2978
2979 static void ublk_stop_dev(struct ublk_device *ub)
2980 {
2981 mutex_lock(&ub->mutex);
2982 ublk_stop_dev_unlocked(ub);
2983 mutex_unlock(&ub->mutex);
2984 cancel_work_sync(&ub->partition_scan_work);
2985 ublk_cancel_dev(ub);
2986 }
2987
2988 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
2989 {
2990 /* UBLK_IO_FLAG_CANCELED can be cleared now */
2991 spin_lock(&ubq->cancel_lock);
2992 io->flags &= ~UBLK_IO_FLAG_CANCELED;
2993 spin_unlock(&ubq->cancel_lock);
2994 }
2995
2996 /* reset per-queue io flags */
2997 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
2998 {
2999 spin_lock(&ubq->cancel_lock);
3000 ubq->canceling = false;
3001 spin_unlock(&ubq->cancel_lock);
3002 ubq->fail_io = false;
3003 }
3004
3005 /* device can only be started after all IOs are ready */
3006 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
3007 struct ublk_io *io)
3008 __must_hold(&ub->mutex)
3009 {
3010 struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
3011
3012 if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
3013 ub->unprivileged_daemons = true;
3014
3015 ubq->nr_io_ready++;
3016 ublk_reset_io_flags(ubq, io);
3017
3018 /* Check if this specific queue is now fully ready */
3019 if (ublk_queue_ready(ubq)) {
3020 ub->nr_queue_ready++;
3021
3022 /*
3023 * Reset queue flags as soon as this queue is ready.
3024 * This clears the canceling flag, allowing batch FETCH commands
3025 * to succeed during recovery without waiting for all queues.
3026 */
3027 ublk_queue_reset_io_flags(ubq);
3028 }
3029
3030 /* Check if all queues are ready */
3031 if (ublk_dev_ready(ub)) {
3032 /*
3033 * All queues ready - clear device-level canceling flag
3034 * and complete the recovery/initialization.
3035 */
3036 mutex_lock(&ub->cancel_mutex);
3037 ub->canceling = false;
3038 mutex_unlock(&ub->cancel_mutex);
3039 complete_all(&ub->completion);
3040 }
3041 }
3042
3043 static inline int ublk_check_cmd_op(u32 cmd_op)
3044 {
3045 u32 ioc_type = _IOC_TYPE(cmd_op);
3046
3047 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3048 return -EOPNOTSUPP;
3049
3050 if (ioc_type != 'u' && ioc_type != 0)
3051 return -EOPNOTSUPP;
3052
3053 return 0;
3054 }
3055
3056 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3057 {
3058 struct ublk_auto_buf_reg buf;
3059
3060 buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3061
3062 if (buf.reserved0 || buf.reserved1)
3063 return -EINVAL;
3064
3065 if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3066 return -EINVAL;
3067 io->buf.auto_reg = buf;
3068 return 0;
3069 }
3070
3071 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3072 struct io_uring_cmd *cmd,
3073 u16 *buf_idx)
3074 {
3075 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3076 io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3077
3078 /*
3079 * `UBLK_F_AUTO_BUF_REG` only works if `UBLK_IO_FETCH_REQ`
3080 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
3081 * `io_ring_ctx`.
3082 *
3083 * If this uring_cmd's io_ring_ctx isn't the same as the
3084 * one used for registering the buffer, it is the ublk server's
3085 * responsibility to unregister the buffer, otherwise
3086 * this ublk request gets stuck.
3087 */
3088 if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3089 *buf_idx = io->buf.auto_reg.index;
3090 }
3091 }
3092
3093 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3094 struct io_uring_cmd *cmd,
3095 u16 *buf_idx)
3096 {
3097 ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3098 return ublk_set_auto_buf_reg(io, cmd);
3099 }
3100
3101 /* Once we return, `io->req` can't be used any more */
3102 static inline struct request *
3103 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3104 {
3105 struct request *req = io->req;
3106
3107 io->cmd = cmd;
3108 io->flags |= UBLK_IO_FLAG_ACTIVE;
3109 /* now this cmd slot is owned by ublk driver */
3110 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3111
3112 return req;
3113 }
3114
3115 static inline int
3116 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3117 struct io_uring_cmd *cmd, unsigned long buf_addr,
3118 u16 *buf_idx)
3119 {
3120 if (ublk_dev_support_auto_buf_reg(ub))
3121 return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3122
3123 io->buf.addr = buf_addr;
3124 return 0;
3125 }
3126
3127 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3128 unsigned int issue_flags,
3129 struct ublk_queue *ubq, unsigned int tag)
3130 {
3131 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3132
3133 /*
3134 * Safe to refer to @ubq since the ublk_queue won't be freed until its
3135 * commands are completed
3136 */
3137 pdu->ubq = ubq;
3138 pdu->tag = tag;
3139 io_uring_cmd_mark_cancelable(cmd, issue_flags);
3140 }
3141
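/*
 * Release callback for buffers registered via io_buffer_register_bvec():
 * when running in the daemon task, drop one task-local registered-buffer
 * count; otherwise drop a request reference.
 */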
3142 static void ublk_io_release(void *priv)
3143 {
3144 struct request *rq = priv;
3145 struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3146 struct ublk_io *io = &ubq->ios[rq->tag];
3147
3148 /*
3149 * task_registered_buffers may be 0 if buffers were registered off task
3150 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3151 */
3152 if (current == io->task && io->task_registered_buffers)
3153 io->task_registered_buffers--;
3154 else
3155 ublk_put_req_ref(io, rq);
3156 }
3157
3158 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3159 struct ublk_device *ub,
3160 u16 q_id, u16 tag,
3161 struct ublk_io *io,
3162 unsigned int index, unsigned int issue_flags)
3163 {
3164 struct request *req;
3165 int ret;
3166
3167 if (!ublk_dev_support_zero_copy(ub))
3168 return -EINVAL;
3169
3170 req = __ublk_check_and_get_req(ub, q_id, tag, io);
3171 if (!req)
3172 return -EINVAL;
3173
3174 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3175 issue_flags);
3176 if (ret) {
3177 ublk_put_req_ref(io, req);
3178 return ret;
3179 }
3180
3181 return 0;
3182 }
3183
3184 static int
3185 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3186 struct ublk_device *ub,
3187 u16 q_id, u16 tag, struct ublk_io *io,
3188 unsigned index, unsigned issue_flags)
3189 {
3190 unsigned new_registered_buffers;
3191 struct request *req = io->req;
3192 int ret;
3193
3194 /*
3195 * Ensure there are still references for ublk_sub_req_ref() to release.
3196 * If not, fall back on the thread-safe buffer registration.
3197 */
3198 new_registered_buffers = io->task_registered_buffers + 1;
3199 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3200 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3201 issue_flags);
3202
3203 if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3204 return -EINVAL;
3205
3206 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3207 issue_flags);
3208 if (ret)
3209 return ret;
3210
3211 io->task_registered_buffers = new_registered_buffers;
3212 return 0;
3213 }
3214
3215 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3216 const struct ublk_device *ub,
3217 unsigned int index, unsigned int issue_flags)
3218 {
3219 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3220 return -EINVAL;
3221
3222 return io_buffer_unregister_bvec(cmd, index, issue_flags);
3223 }
3224
3225 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3226 {
3227 if (ublk_dev_need_map_io(ub)) {
3228 /*
3229 * FETCH_RQ has to provide IO buffer if NEED GET
3230 * DATA is not enabled
3231 */
3232 if (!buf_addr && !ublk_dev_need_get_data(ub))
3233 return -EINVAL;
3234 } else if (buf_addr) {
3235 /* User copy requires addr to be unset */
3236 return -EINVAL;
3237 }
3238 return 0;
3239 }
3240
3241 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3242 struct ublk_io *io, u16 q_id)
3243 {
3244 /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3245 if (ublk_dev_ready(ub))
3246 return -EBUSY;
3247
3248 /* allow each command to be FETCHed at most once */
3249 if (io->flags & UBLK_IO_FLAG_ACTIVE)
3250 return -EINVAL;
3251
3252 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3253
3254 ublk_fill_io_cmd(io, cmd);
3255
3256 if (ublk_dev_support_batch_io(ub))
3257 WRITE_ONCE(io->task, NULL);
3258 else
3259 WRITE_ONCE(io->task, get_task_struct(current));
3260
3261 return 0;
3262 }
3263
3264 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3265 struct ublk_io *io, __u64 buf_addr, u16 q_id)
3266 {
3267 int ret;
3268
3269 /*
3270 * When handling FETCH command for setting up ublk uring queue,
3271 * ub->mutex is the innermost lock, and we won't block for handling
3272 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3273 */
3274 mutex_lock(&ub->mutex);
3275 ret = __ublk_fetch(cmd, ub, io, q_id);
3276 if (!ret)
3277 ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3278 if (!ret)
3279 ublk_mark_io_ready(ub, q_id, io);
3280 mutex_unlock(&ub->mutex);
3281 return ret;
3282 }
3283
3284 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3285 struct ublk_io *io, __u64 buf_addr)
3286 {
3287 struct request *req = io->req;
3288
3289 if (ublk_dev_need_map_io(ub)) {
3290 /*
3291 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3292 * NEED GET DATA is not enabled or it is Read IO.
3293 */
3294 if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3295 req_op(req) == REQ_OP_READ))
3296 return -EINVAL;
3297 } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3298 /*
3299 * User copy requires addr to be unset when command is
3300 * not zone append
3301 */
3302 return -EINVAL;
3303 }
3304
3305 return 0;
3306 }
3307
3308 static bool ublk_need_complete_req(const struct ublk_device *ub,
3309 struct ublk_io *io)
3310 {
3311 if (ublk_dev_need_req_ref(ub))
3312 return ublk_sub_req_ref(io);
3313 return true;
3314 }
3315
3316 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3317 struct request *req)
3318 {
3319 /*
3320 * We have handled UBLK_IO_NEED_GET_DATA command,
3321 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3322 * do the copy work.
3323 */
3324 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3325 /* update iod->addr because ublksrv may have passed a new io buffer */
3326 ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3327 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3328 __func__, ubq->q_id, req->tag, io->flags,
3329 ublk_get_iod(ubq, req->tag)->addr);
3330
3331 return ublk_start_io(ubq, req, io);
3332 }
3333
3334 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3335 unsigned int issue_flags)
3336 {
3337 /* May point to userspace-mapped memory */
3338 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3339 struct ublksrv_io_cmd);
3340 u16 buf_idx = UBLK_INVALID_BUF_IDX;
3341 struct ublk_device *ub = cmd->file->private_data;
3342 struct ublk_queue *ubq;
3343 struct ublk_io *io = NULL;
3344 u32 cmd_op = cmd->cmd_op;
3345 u16 q_id = READ_ONCE(ub_src->q_id);
3346 u16 tag = READ_ONCE(ub_src->tag);
3347 s32 result = READ_ONCE(ub_src->result);
3348 u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3349 struct request *req;
3350 int ret;
3351 bool compl;
3352
3353 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3354
3355 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3356 __func__, cmd->cmd_op, q_id, tag, result);
3357
3358 ret = ublk_check_cmd_op(cmd_op);
3359 if (ret)
3360 goto out;
3361
3362 /*
3363 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3364 * so no need to validate the q_id, tag, or task
3365 */
3366 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3367 return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3368
3369 ret = -EINVAL;
3370 if (q_id >= ub->dev_info.nr_hw_queues)
3371 goto out;
3372
3373 ubq = ublk_get_queue(ub, q_id);
3374
3375 if (tag >= ub->dev_info.queue_depth)
3376 goto out;
3377
3378 io = &ubq->ios[tag];
3379 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3380 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3381 ret = ublk_check_fetch_buf(ub, addr);
3382 if (ret)
3383 goto out;
3384 ret = ublk_fetch(cmd, ub, io, addr, q_id);
3385 if (ret)
3386 goto out;
3387
3388 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3389 return -EIOCBQUEUED;
3390 }
3391
3392 if (READ_ONCE(io->task) != current) {
3393 /*
3394 * ublk_register_io_buf() accesses only the io's refcount,
3395 * so can be handled on any task
3396 */
3397 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3398 return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3399 addr, issue_flags);
3400
3401 goto out;
3402 }
3403
3404 /* there is pending io cmd, something must be wrong */
3405 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3406 ret = -EBUSY;
3407 goto out;
3408 }
3409
3410 /*
3411 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3412 * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
3413 */
3414 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3415 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3416 goto out;
3417
3418 switch (_IOC_NR(cmd_op)) {
3419 case UBLK_IO_REGISTER_IO_BUF:
3420 return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3421 issue_flags);
3422 case UBLK_IO_COMMIT_AND_FETCH_REQ:
3423 ret = ublk_check_commit_and_fetch(ub, io, addr);
3424 if (ret)
3425 goto out;
3426 io->res = result;
3427 req = ublk_fill_io_cmd(io, cmd);
3428 ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3429 if (buf_idx != UBLK_INVALID_BUF_IDX)
3430 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3431 compl = ublk_need_complete_req(ub, io);
3432
3433 if (req_op(req) == REQ_OP_ZONE_APPEND)
3434 req->__sector = addr;
3435 if (compl)
3436 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3437
3438 if (ret)
3439 goto out;
3440 break;
3441 case UBLK_IO_NEED_GET_DATA:
3442 /*
3443 * ublk_get_data() may fail and fall back to requeueing, so keep the
3444 * uring_cmd active first and prepare for handling the newly requeued
3445 * request
3446 */
3447 req = ublk_fill_io_cmd(io, cmd);
3448 ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3449 WARN_ON_ONCE(ret);
3450 if (likely(ublk_get_data(ubq, io, req))) {
3451 __ublk_prep_compl_io_cmd(io, req);
3452 return UBLK_IO_RES_OK;
3453 }
3454 break;
3455 default:
3456 goto out;
3457 }
3458 ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3459 return -EIOCBQUEUED;
3460
3461 out:
3462 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3463 __func__, cmd_op, tag, ret, io ? io->flags : 0);
3464 return ret;
3465 }
3466
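/*
 * Look up the block-layer request for (q_id, tag) and take an io reference
 * so the request can't be completed underneath us. Used by paths that may
 * run on a task other than the io's daemon (e.g. user copy); callers must
 * drop the reference with ublk_put_req_ref().
 */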
3467 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3468 u16 q_id, u16 tag, struct ublk_io *io)
3469 {
3470 struct request *req;
3471
3472 /*
3473 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3474 * which would overwrite it with io->cmd
3475 */
3476 req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3477 if (!req)
3478 return NULL;
3479
3480 if (!ublk_get_req_ref(io))
3481 return NULL;
3482
3483 if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3484 goto fail_put;
3485
3486 if (!ublk_rq_has_data(req))
3487 goto fail_put;
3488
3489 return req;
3490 fail_put:
3491 ublk_put_req_ref(io, req);
3492 return NULL;
3493 }
3494
3495 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3496 {
3497 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3498 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3499 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3500
3501 if (ret != -EIOCBQUEUED)
3502 io_uring_cmd_done(cmd, ret, issue_flags);
3503 }
3504
3505 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3506 {
3507 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3508 ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3509 return 0;
3510 }
3511
3512 /* a well-implemented server won't end up on the unlocked path */
3513 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3514 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3515 return -EIOCBQUEUED;
3516 }
3517
3518 return ublk_ch_uring_cmd_local(cmd, issue_flags);
3519 }
3520
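/*
 * Batch command buffer element layout (see ublk_check_batch_cmd_flags()):
 * a fixed struct ublk_elem_header, optionally followed by a __u64 buffer
 * address when UBLK_BATCH_F_HAS_BUF_ADDR is set, optionally followed by a
 * __u64 zone append LBA when UBLK_BATCH_F_HAS_ZONE_LBA is set, i.e.
 * elem_bytes is 8, 16 or 24. The helpers below pull the optional fields
 * out of a single element.
 */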
3521 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3522 const struct ublk_elem_header *elem)
3523 {
3524 const void *buf = elem;
3525
3526 if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3527 return *(const __u64 *)(buf + sizeof(*elem));
3528 return 0;
3529 }
3530
3531 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3532 const struct ublk_elem_header *elem)
3533 {
3534 const void *buf = elem;
3535
3536 if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3537 return *(const __u64 *)(buf + sizeof(*elem) +
3538 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3539 return -1;
3540 }
3541
3542 static struct ublk_auto_buf_reg
3543 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3544 const struct ublk_elem_header *elem)
3545 {
3546 struct ublk_auto_buf_reg reg = {
3547 .index = elem->buf_index,
3548 .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3549 UBLK_AUTO_BUF_REG_FALLBACK : 0,
3550 };
3551
3552 return reg;
3553 }
3554
3555 /*
3556 * 48 can hold any type of buffer element (8, 16 and 24 bytes) because
3557 * it is the least common multiple (LCM) of 8, 16 and 24
3558 */
3559 #define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10)
3560 struct ublk_batch_io_iter {
3561 void __user *uaddr;
3562 unsigned done, total;
3563 unsigned char elem_bytes;
3564 /* copy to this buffer from user space */
3565 unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3566 };
3567
3568 static inline int
3569 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3570 struct ublk_batch_io_iter *iter,
3571 const struct ublk_batch_io_data *data,
3572 unsigned bytes,
3573 int (*cb)(struct ublk_queue *q,
3574 const struct ublk_batch_io_data *data,
3575 const struct ublk_elem_header *elem))
3576 {
3577 unsigned int i;
3578 int ret = 0;
3579
3580 for (i = 0; i < bytes; i += iter->elem_bytes) {
3581 const struct ublk_elem_header *elem =
3582 (const struct ublk_elem_header *)&iter->buf[i];
3583
3584 if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3585 ret = -EINVAL;
3586 break;
3587 }
3588
3589 ret = cb(ubq, data, elem);
3590 if (unlikely(ret))
3591 break;
3592 }
3593
3594 iter->done += i;
3595 return ret;
3596 }
3597
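/*
 * Walk the user-provided batch command buffer: copy it chunk by chunk into
 * the iterator's bounded on-stack buffer and invoke @cb for each element.
 * iter->done tracks how many bytes have been consumed, so callers can
 * report partial progress or unwind (see ublk_batch_revert_prep_cmd()).
 */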
3598 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3599 const struct ublk_batch_io_data *data,
3600 int (*cb)(struct ublk_queue *q,
3601 const struct ublk_batch_io_data *data,
3602 const struct ublk_elem_header *elem))
3603 {
3604 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3605 int ret = 0;
3606
3607 while (iter->done < iter->total) {
3608 unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3609
3610 if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3611 pr_warn("ublk%d: read batch cmd buffer failed\n",
3612 data->ub->dev_info.dev_id);
3613 return -EFAULT;
3614 }
3615
3616 ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3617 if (ret)
3618 return ret;
3619 }
3620 return 0;
3621 }
3622
3623 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3624 const struct ublk_batch_io_data *data,
3625 const struct ublk_elem_header *elem)
3626 {
3627 struct ublk_io *io = &ubq->ios[elem->tag];
3628
3629 /*
3630 * If the queue was ready before this decrement, it won't be anymore,
3631 * so we need to decrement the queue ready count and restore the
3632 * canceling flag to prevent new requests from being queued.
3633 */
3634 if (ublk_queue_ready(ubq)) {
3635 data->ub->nr_queue_ready--;
3636 spin_lock(&ubq->cancel_lock);
3637 ubq->canceling = true;
3638 spin_unlock(&ubq->cancel_lock);
3639 }
3640 ubq->nr_io_ready--;
3641
3642 ublk_io_lock(io);
3643 io->flags = 0;
3644 ublk_io_unlock(io);
3645 return 0;
3646 }
3647
3648 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3649 const struct ublk_batch_io_data *data)
3650 {
3651 int ret;
3652
3653 /* Re-process only what we've already processed, starting from the beginning */
3654 iter->total = iter->done;
3655 iter->done = 0;
3656
3657 ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3658 WARN_ON_ONCE(ret);
3659 }
3660
3661 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3662 const struct ublk_batch_io_data *data,
3663 const struct ublk_elem_header *elem)
3664 {
3665 struct ublk_io *io = &ubq->ios[elem->tag];
3666 const struct ublk_batch_io *uc = &data->header;
3667 union ublk_io_buf buf = { 0 };
3668 int ret;
3669
3670 if (ublk_dev_support_auto_buf_reg(data->ub))
3671 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3672 else if (ublk_dev_need_map_io(data->ub)) {
3673 buf.addr = ublk_batch_buf_addr(uc, elem);
3674
3675 ret = ublk_check_fetch_buf(data->ub, buf.addr);
3676 if (ret)
3677 return ret;
3678 }
3679
3680 ublk_io_lock(io);
3681 ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3682 if (!ret)
3683 io->buf = buf;
3684 ublk_io_unlock(io);
3685
3686 if (!ret)
3687 ublk_mark_io_ready(data->ub, ubq->q_id, io);
3688
3689 return ret;
3690 }
3691
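/*
 * Handle UBLK_U_IO_PREP_IO_CMDS: prepare every element in the user buffer
 * addressed by sqe->addr under ub->mutex, and revert all already-prepared
 * elements if any one of them fails, so the command is all-or-nothing.
 *
 * Rough userspace sketch (for illustration only; the exact SQE encoding is
 * defined by the ublk UAPI headers): the batch header (q_id, flags, nr_elem,
 * elem_bytes) travels in the SQE command payload, while sqe->addr points at
 * the nr_elem * elem_bytes element array:
 *
 *	sqe->cmd_op = UBLK_U_IO_PREP_IO_CMDS;
 *	sqe->addr   = (__u64)(uintptr_t)elem_array;
 */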
3692 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3693 {
3694 const struct ublk_batch_io *uc = &data->header;
3695 struct io_uring_cmd *cmd = data->cmd;
3696 struct ublk_batch_io_iter iter = {
3697 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3698 .total = uc->nr_elem * uc->elem_bytes,
3699 .elem_bytes = uc->elem_bytes,
3700 };
3701 int ret;
3702
3703 mutex_lock(&data->ub->mutex);
3704 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3705
3706 if (ret && iter.done)
3707 ublk_batch_revert_prep_cmd(&iter, data);
3708 mutex_unlock(&data->ub->mutex);
3709 return ret;
3710 }
3711
3712 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3713 struct ublk_io *io,
3714 union ublk_io_buf *buf)
3715 {
3716 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3717 return -EBUSY;
3718
3719 /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3720 if (ublk_need_map_io(ubq) && !buf->addr)
3721 return -EINVAL;
3722 return 0;
3723 }
3724
3725 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3726 const struct ublk_batch_io_data *data,
3727 const struct ublk_elem_header *elem)
3728 {
3729 struct ublk_io *io = &ubq->ios[elem->tag];
3730 const struct ublk_batch_io *uc = &data->header;
3731 u16 buf_idx = UBLK_INVALID_BUF_IDX;
3732 union ublk_io_buf buf = { 0 };
3733 struct request *req = NULL;
3734 bool auto_reg = false;
3735 bool compl = false;
3736 int ret;
3737
3738 if (ublk_dev_support_auto_buf_reg(data->ub)) {
3739 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3740 auto_reg = true;
3741 } else if (ublk_dev_need_map_io(data->ub))
3742 buf.addr = ublk_batch_buf_addr(uc, elem);
3743
3744 ublk_io_lock(io);
3745 ret = ublk_batch_commit_io_check(ubq, io, &buf);
3746 if (!ret) {
3747 io->res = elem->result;
3748 io->buf = buf;
3749 req = ublk_fill_io_cmd(io, data->cmd);
3750
3751 if (auto_reg)
3752 ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3753 compl = ublk_need_complete_req(data->ub, io);
3754 }
3755 ublk_io_unlock(io);
3756
3757 if (unlikely(ret)) {
3758 pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3759 __func__, data->ub->dev_info.dev_id, ubq->q_id,
3760 elem->tag, ret);
3761 return ret;
3762 }
3763
3764 if (buf_idx != UBLK_INVALID_BUF_IDX)
3765 io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3766 if (req_op(req) == REQ_OP_ZONE_APPEND)
3767 req->__sector = ublk_batch_zone_lba(uc, elem);
3768 if (compl)
3769 __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3770 return 0;
3771 }
3772
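/*
 * Handle UBLK_U_IO_COMMIT_IO_CMDS: commit each element's result and
 * complete the corresponding requests, collecting completions in an
 * io_comp_batch. Returns the number of element bytes consumed, or the
 * error code if no element was processed at all.
 */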
3773 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3774 {
3775 const struct ublk_batch_io *uc = &data->header;
3776 struct io_uring_cmd *cmd = data->cmd;
3777 struct ublk_batch_io_iter iter = {
3778 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3779 .total = uc->nr_elem * uc->elem_bytes,
3780 .elem_bytes = uc->elem_bytes,
3781 };
3782 DEFINE_IO_COMP_BATCH(iob);
3783 int ret;
3784
3785 data->iob = &iob;
3786 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3787
3788 if (iob.complete)
3789 iob.complete(&iob);
3790
3791 return iter.done == 0 ? ret : iter.done;
3792 }
3793
3794 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3795 {
3796 unsigned elem_bytes = sizeof(struct ublk_elem_header);
3797
3798 if (uc->flags & ~UBLK_BATCH_F_ALL)
3799 return -EINVAL;
3800
3801 /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK relies on a buffer index, so it can't be combined with HAS_BUF_ADDR */
3802 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3803 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3804 return -EINVAL;
3805
3806 elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3807 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3808 if (uc->elem_bytes != elem_bytes)
3809 return -EINVAL;
3810 return 0;
3811 }
3812
3813 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3814 {
3815 const struct ublk_batch_io *uc = &data->header;
3816
3817 if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3818 return -EINVAL;
3819
3820 if (uc->nr_elem > data->ub->dev_info.queue_depth)
3821 return -E2BIG;
3822
3823 if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3824 !ublk_dev_is_zoned(data->ub))
3825 return -EINVAL;
3826
3827 if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3828 !ublk_dev_need_map_io(data->ub))
3829 return -EINVAL;
3830
3831 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3832 !ublk_dev_support_auto_buf_reg(data->ub))
3833 return -EINVAL;
3834
3835 return ublk_check_batch_cmd_flags(uc);
3836 }
3837
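/*
 * Queue a multishot fetch command on the queue's fcmd list under evts_lock.
 * If the queue is being aborted or canceled, the command is freed and
 * -ENODEV returned. Otherwise try to acquire the currently active fetch
 * command and run batch dispatch with it, inline when both commands share
 * an io_ring_ctx, or from task work otherwise.
 */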
3838 static int ublk_batch_attach(struct ublk_queue *ubq,
3839 struct ublk_batch_io_data *data,
3840 struct ublk_batch_fetch_cmd *fcmd)
3841 {
3842 struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3843 bool free = false;
3844 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3845
3846 spin_lock(&ubq->evts_lock);
3847 if (unlikely(ubq->force_abort || ubq->canceling)) {
3848 free = true;
3849 } else {
3850 list_add_tail(&fcmd->node, &ubq->fcmd_head);
3851 new_fcmd = __ublk_acquire_fcmd(ubq);
3852 }
3853 spin_unlock(&ubq->evts_lock);
3854
3855 if (unlikely(free)) {
3856 ublk_batch_free_fcmd(fcmd);
3857 return -ENODEV;
3858 }
3859
3860 pdu->ubq = ubq;
3861 pdu->fcmd = fcmd;
3862 io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3863
3864 if (!new_fcmd)
3865 goto out;
3866
3867 /*
3868 * If the two fetch commands originate from the same io_ring_ctx,
3869 * run batch dispatch directly. Otherwise, schedule task work to
3870 * do it.
3871 */
3872 if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3873 io_uring_cmd_ctx_handle(fcmd->cmd)) {
3874 data->cmd = new_fcmd->cmd;
3875 ublk_batch_dispatch(ubq, data, new_fcmd);
3876 } else {
3877 io_uring_cmd_complete_in_task(new_fcmd->cmd,
3878 ublk_batch_tw_cb);
3879 }
3880 out:
3881 return -EIOCBQUEUED;
3882 }
3883
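/*
 * Handle UBLK_U_IO_FETCH_IO_CMDS: allocate a fetch command wrapper and
 * attach it to the queue so incoming I/O tags can be delivered to the
 * server through this multishot uring_cmd.
 */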
3884 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3885 {
3886 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3887 struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3888
3889 if (!fcmd)
3890 return -ENOMEM;
3891
3892 return ublk_batch_attach(ubq, data, fcmd);
3893 }
3894
3895 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3896 {
3897 const struct ublk_batch_io *uc = &data->header;
3898
3899 if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3900 return -EINVAL;
3901
3902 if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3903 return -EINVAL;
3904
3905 if (uc->elem_bytes != sizeof(__u16))
3906 return -EINVAL;
3907
3908 if (uc->flags != 0)
3909 return -EINVAL;
3910
3911 return 0;
3912 }
3913
3914 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3915 unsigned int issue_flags)
3916 {
3917 const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3918 struct ublksrv_io_cmd);
3919 struct ublk_device *ub = cmd->file->private_data;
3920 unsigned tag = READ_ONCE(ub_cmd->tag);
3921 unsigned q_id = READ_ONCE(ub_cmd->q_id);
3922 unsigned index = READ_ONCE(ub_cmd->addr);
3923 struct ublk_queue *ubq;
3924 struct ublk_io *io;
3925
3926 if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3927 return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3928
3929 if (q_id >= ub->dev_info.nr_hw_queues)
3930 return -EINVAL;
3931
3932 if (tag >= ub->dev_info.queue_depth)
3933 return -EINVAL;
3934
3935 if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3936 return -EOPNOTSUPP;
3937
3938 ubq = ublk_get_queue(ub, q_id);
3939 io = &ubq->ios[tag];
3940 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3941 issue_flags);
3942 }
3943
3944 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3945 unsigned int issue_flags)
3946 {
3947 const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3948 struct ublk_batch_io);
3949 struct ublk_device *ub = cmd->file->private_data;
3950 struct ublk_batch_io_data data = {
3951 .ub = ub,
3952 .cmd = cmd,
3953 .header = (struct ublk_batch_io) {
3954 .q_id = READ_ONCE(uc->q_id),
3955 .flags = READ_ONCE(uc->flags),
3956 .nr_elem = READ_ONCE(uc->nr_elem),
3957 .elem_bytes = READ_ONCE(uc->elem_bytes),
3958 },
3959 .issue_flags = issue_flags,
3960 };
3961 u32 cmd_op = cmd->cmd_op;
3962 int ret = -EINVAL;
3963
3964 if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3965 ublk_batch_cancel_fn(cmd, issue_flags);
3966 return 0;
3967 }
3968
3969 switch (cmd_op) {
3970 case UBLK_U_IO_PREP_IO_CMDS:
3971 ret = ublk_check_batch_cmd(&data);
3972 if (ret)
3973 goto out;
3974 ret = ublk_handle_batch_prep_cmd(&data);
3975 break;
3976 case UBLK_U_IO_COMMIT_IO_CMDS:
3977 ret = ublk_check_batch_cmd(&data);
3978 if (ret)
3979 goto out;
3980 ret = ublk_handle_batch_commit_cmd(&data);
3981 break;
3982 case UBLK_U_IO_FETCH_IO_CMDS:
3983 ret = ublk_validate_batch_fetch_cmd(&data);
3984 if (ret)
3985 goto out;
3986 ret = ublk_handle_batch_fetch_cmd(&data);
3987 break;
3988 default:
3989 ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
3990 break;
3991 }
3992 out:
3993 return ret;
3994 }
3995
3996 static inline bool ublk_check_ubuf_dir(const struct request *req,
3997 int ubuf_dir)
3998 {
3999 /* copy ubuf to request pages */
4000 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
4001 ubuf_dir == ITER_SOURCE)
4002 return true;
4003
4004 /* copy request pages to ubuf */
4005 if ((req_op(req) == REQ_OP_WRITE ||
4006 req_op(req) == REQ_OP_ZONE_APPEND) &&
4007 ubuf_dir == ITER_DEST)
4008 return true;
4009
4010 return false;
4011 }
4012
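/*
 * Common implementation for read_iter/write_iter on /dev/ublkcN, used by
 * UBLK_F_USER_COPY servers to copy data (or integrity metadata) between a
 * userspace buffer and the pages of a specific request. The file offset
 * encodes the target: hw queue id, tag, byte offset inside the request and
 * the integrity flag (see ublk_pos_to_hwq()/ublk_pos_to_tag()/
 * ublk_pos_to_buf_off()). On the daemon task the io can't complete
 * concurrently, so no reference is taken; other tasks go through
 * __ublk_check_and_get_req().
 */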
4013 static ssize_t
4014 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
4015 {
4016 struct ublk_device *ub = iocb->ki_filp->private_data;
4017 struct ublk_queue *ubq;
4018 struct request *req;
4019 struct ublk_io *io;
4020 unsigned data_len;
4021 bool is_integrity;
4022 bool on_daemon;
4023 size_t buf_off;
4024 u16 tag, q_id;
4025 ssize_t ret;
4026
4027 if (!user_backed_iter(iter))
4028 return -EACCES;
4029
4030 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
4031 return -EACCES;
4032
4033 tag = ublk_pos_to_tag(iocb->ki_pos);
4034 q_id = ublk_pos_to_hwq(iocb->ki_pos);
4035 buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
4036 is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
4037
4038 if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
4039 return -EINVAL;
4040
4041 if (q_id >= ub->dev_info.nr_hw_queues)
4042 return -EINVAL;
4043
4044 ubq = ublk_get_queue(ub, q_id);
4045 if (!ublk_dev_support_user_copy(ub))
4046 return -EACCES;
4047
4048 if (tag >= ub->dev_info.queue_depth)
4049 return -EINVAL;
4050
4051 io = &ubq->ios[tag];
4052 on_daemon = current == READ_ONCE(io->task);
4053 if (on_daemon) {
4054 /* On the daemon task the io can't complete concurrently, so skip taking a ref */
4055 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
4056 return -EINVAL;
4057
4058 req = io->req;
4059 if (!ublk_rq_has_data(req))
4060 return -EINVAL;
4061 } else {
4062 req = __ublk_check_and_get_req(ub, q_id, tag, io);
4063 if (!req)
4064 return -EINVAL;
4065 }
4066
4067 if (is_integrity) {
4068 struct blk_integrity *bi = &req->q->limits.integrity;
4069
4070 data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
4071 } else {
4072 data_len = blk_rq_bytes(req);
4073 }
4074 if (buf_off > data_len) {
4075 ret = -EINVAL;
4076 goto out;
4077 }
4078
4079 if (!ublk_check_ubuf_dir(req, dir)) {
4080 ret = -EACCES;
4081 goto out;
4082 }
4083
4084 if (is_integrity)
4085 ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4086 else
4087 ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4088
4089 out:
4090 if (!on_daemon)
4091 ublk_put_req_ref(io, req);
4092 return ret;
4093 }
4094
4095 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4096 {
4097 return ublk_user_copy(iocb, to, ITER_DEST);
4098 }
4099
4100 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4101 {
4102 return ublk_user_copy(iocb, from, ITER_SOURCE);
4103 }
4104
4105 static const struct file_operations ublk_ch_fops = {
4106 .owner = THIS_MODULE,
4107 .open = ublk_ch_open,
4108 .release = ublk_ch_release,
4109 .read_iter = ublk_ch_read_iter,
4110 .write_iter = ublk_ch_write_iter,
4111 .uring_cmd = ublk_ch_uring_cmd,
4112 .mmap = ublk_ch_mmap,
4113 };
4114
4115 static const struct file_operations ublk_ch_batch_io_fops = {
4116 .owner = THIS_MODULE,
4117 .open = ublk_ch_open,
4118 .release = ublk_ch_release,
4119 .read_iter = ublk_ch_read_iter,
4120 .write_iter = ublk_ch_write_iter,
4121 .uring_cmd = ublk_ch_batch_io_uring_cmd,
4122 .mmap = ublk_ch_mmap,
4123 };
4124
4125 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4126 {
4127 int size, i;
4128
4129 size = ublk_queue_cmd_buf_size(ub);
4130
4131 for (i = 0; i < ubq->q_depth; i++) {
4132 struct ublk_io *io = &ubq->ios[i];
4133 if (io->task)
4134 put_task_struct(io->task);
4135 WARN_ON_ONCE(refcount_read(&io->ref));
4136 WARN_ON_ONCE(io->task_registered_buffers);
4137 }
4138
4139 if (ubq->io_cmd_buf)
4140 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4141
4142 if (ublk_dev_support_batch_io(ub))
4143 ublk_io_evts_deinit(ubq);
4144
4145 kvfree(ubq);
4146 }
4147
4148 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4149 {
4150 struct ublk_queue *ubq = ub->queues[q_id];
4151
4152 if (!ubq)
4153 return;
4154
4155 __ublk_deinit_queue(ub, ubq);
4156 ub->queues[q_id] = NULL;
4157 }
4158
4159 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4160 {
4161 unsigned int cpu;
4162
4163 /* Find first CPU mapped to this queue */
4164 for_each_possible_cpu(cpu) {
4165 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4166 return cpu_to_node(cpu);
4167 }
4168
4169 return NUMA_NO_NODE;
4170 }
4171
4172 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4173 {
4174 int depth = ub->dev_info.queue_depth;
4175 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4176 struct ublk_queue *ubq;
4177 struct page *page;
4178 int numa_node;
4179 int size, i, ret;
4180
4181 /* Determine NUMA node based on queue's CPU affinity */
4182 numa_node = ublk_get_queue_numa_node(ub, q_id);
4183
4184 /* Allocate queue structure on local NUMA node */
4185 ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4186 numa_node);
4187 if (!ubq)
4188 return -ENOMEM;
4189
4190 spin_lock_init(&ubq->cancel_lock);
4191 ubq->flags = ub->dev_info.flags;
4192 ubq->q_id = q_id;
4193 ubq->q_depth = depth;
4194 size = ublk_queue_cmd_buf_size(ub);
4195
4196 /* Allocate I/O command buffer on local NUMA node */
4197 page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4198 if (!page) {
4199 kvfree(ubq);
4200 return -ENOMEM;
4201 }
4202 ubq->io_cmd_buf = page_address(page);
4203
4204 for (i = 0; i < ubq->q_depth; i++)
4205 spin_lock_init(&ubq->ios[i].lock);
4206
4207 if (ublk_dev_support_batch_io(ub)) {
4208 ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4209 if (ret)
4210 goto fail;
4211 INIT_LIST_HEAD(&ubq->fcmd_head);
4212 }
4213 ub->queues[q_id] = ubq;
4214 ubq->dev = ub;
4215
4216 return 0;
4217 fail:
4218 __ublk_deinit_queue(ub, ubq);
4219 return ret;
4220 }
4221
4222 static void ublk_deinit_queues(struct ublk_device *ub)
4223 {
4224 int i;
4225
4226 for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4227 ublk_deinit_queue(ub, i);
4228 }
4229
4230 static int ublk_init_queues(struct ublk_device *ub)
4231 {
4232 int i, ret;
4233
4234 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4235 ret = ublk_init_queue(ub, i);
4236 if (ret)
4237 goto fail;
4238 }
4239
4240 init_completion(&ub->completion);
4241 return 0;
4242
4243 fail:
4244 ublk_deinit_queues(ub);
4245 return ret;
4246 }
4247
4248 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4249 {
4250 int i = idx;
4251 int err;
4252
4253 spin_lock(&ublk_idr_lock);
4254 /* allocate id; if @idx >= 0, we're requesting that specific id */
4255 if (i >= 0) {
4256 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4257 if (err == -ENOSPC)
4258 err = -EEXIST;
4259 } else {
4260 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4261 GFP_NOWAIT);
4262 }
4263 spin_unlock(&ublk_idr_lock);
4264
4265 if (err >= 0)
4266 ub->ub_number = err;
4267
4268 return err;
4269 }
4270
4271 static void ublk_free_dev_number(struct ublk_device *ub)
4272 {
4273 spin_lock(&ublk_idr_lock);
4274 idr_remove(&ublk_index_idr, ub->ub_number);
4275 wake_up_all(&ublk_idr_wq);
4276 spin_unlock(&ublk_idr_lock);
4277 }
4278
4279 static void ublk_cdev_rel(struct device *dev)
4280 {
4281 struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4282
4283 ublk_buf_cleanup(ub);
4284 blk_mq_free_tag_set(&ub->tag_set);
4285 ublk_deinit_queues(ub);
4286 ublk_free_dev_number(ub);
4287 mutex_destroy(&ub->mutex);
4288 mutex_destroy(&ub->cancel_mutex);
4289 kfree(ub);
4290 }
4291
4292 static int ublk_add_chdev(struct ublk_device *ub)
4293 {
4294 struct device *dev = &ub->cdev_dev;
4295 int minor = ub->ub_number;
4296 int ret;
4297
4298 dev->parent = ublk_misc.this_device;
4299 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4300 dev->class = &ublk_chr_class;
4301 dev->release = ublk_cdev_rel;
4302 device_initialize(dev);
4303
4304 ret = dev_set_name(dev, "ublkc%d", minor);
4305 if (ret)
4306 goto fail;
4307
4308 if (ublk_dev_support_batch_io(ub))
4309 cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4310 else
4311 cdev_init(&ub->cdev, &ublk_ch_fops);
4312 ret = cdev_device_add(&ub->cdev, dev);
4313 if (ret)
4314 goto fail;
4315
4316 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4317 unprivileged_ublks_added++;
4318 return 0;
4319 fail:
4320 put_device(dev);
4321 return ret;
4322 }
4323
4324 /* align max io buffer size with PAGE_SIZE */
4325 static void ublk_align_max_io_size(struct ublk_device *ub)
4326 {
4327 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4328
4329 ub->dev_info.max_io_buf_bytes =
4330 round_down(max_io_bytes, PAGE_SIZE);
4331 }
4332
4333 static int ublk_add_tag_set(struct ublk_device *ub)
4334 {
4335 if (ublk_dev_support_batch_io(ub))
4336 ub->tag_set.ops = &ublk_batch_mq_ops;
4337 else
4338 ub->tag_set.ops = &ublk_mq_ops;
4339 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4340 ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4341 ub->tag_set.numa_node = NUMA_NO_NODE;
4342 ub->tag_set.driver_data = ub;
4343 return blk_mq_alloc_tag_set(&ub->tag_set);
4344 }
4345
4346 static void ublk_remove(struct ublk_device *ub)
4347 {
4348 bool unprivileged;
4349
4350 ublk_stop_dev(ub);
4351 cdev_device_del(&ub->cdev, &ub->cdev_dev);
4352 unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4353 ublk_put_device(ub);
4354
4355 if (unprivileged)
4356 unprivileged_ublks_added--;
4357 }
4358
4359 static struct ublk_device *ublk_get_device_from_id(int idx)
4360 {
4361 struct ublk_device *ub = NULL;
4362
4363 if (idx < 0)
4364 return NULL;
4365
4366 spin_lock(&ublk_idr_lock);
4367 ub = idr_find(&ublk_index_idr, idx);
4368 if (ub)
4369 ub = ublk_get_device(ub);
4370 spin_unlock(&ublk_idr_lock);
4371
4372 return ub;
4373 }
4374
4375 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4376 {
4377 rcu_read_lock();
4378 ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4379 rcu_read_unlock();
4380
4381 return ub->ublksrv_tgid == ublksrv_pid;
4382 }
4383
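/*
 * UBLK_CMD_START_DEV: build queue_limits from the validated parameters,
 * wait until the server has fetched every io (ub->completion), verify that
 * the caller's pid matches the registered daemon, then allocate and add the
 * ublkb%d gendisk and mark the device LIVE. Partition scanning is
 * suppressed here and, for trusted daemons, deferred to async work.
 */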
4384 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4385 const struct ublksrv_ctrl_cmd *header)
4386 {
4387 const struct ublk_param_basic *p = &ub->params.basic;
4388 int ublksrv_pid = (int)header->data[0];
4389 struct queue_limits lim = {
4390 .logical_block_size = 1 << p->logical_bs_shift,
4391 .physical_block_size = 1 << p->physical_bs_shift,
4392 .io_min = 1 << p->io_min_shift,
4393 .io_opt = 1 << p->io_opt_shift,
4394 .max_hw_sectors = p->max_sectors,
4395 .chunk_sectors = p->chunk_sectors,
4396 .virt_boundary_mask = p->virt_boundary_mask,
4397 .max_segments = USHRT_MAX,
4398 .max_segment_size = UINT_MAX,
4399 .dma_alignment = 3,
4400 };
4401 struct gendisk *disk;
4402 int ret = -EINVAL;
4403
4404 if (ublksrv_pid <= 0)
4405 return -EINVAL;
4406 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4407 return -EINVAL;
4408
4409 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4410 const struct ublk_param_discard *pd = &ub->params.discard;
4411
4412 lim.discard_alignment = pd->discard_alignment;
4413 lim.discard_granularity = pd->discard_granularity;
4414 lim.max_hw_discard_sectors = pd->max_discard_sectors;
4415 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4416 lim.max_discard_segments = pd->max_discard_segments;
4417 }
4418
4419 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4420 const struct ublk_param_zoned *p = &ub->params.zoned;
4421
4422 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4423 return -EOPNOTSUPP;
4424
4425 lim.features |= BLK_FEAT_ZONED;
4426 lim.max_active_zones = p->max_active_zones;
4427 lim.max_open_zones = p->max_open_zones;
4428 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4429 }
4430
4431 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4432 lim.features |= BLK_FEAT_WRITE_CACHE;
4433 if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4434 lim.features |= BLK_FEAT_FUA;
4435 }
4436
4437 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4438 lim.features |= BLK_FEAT_ROTATIONAL;
4439
4440 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4441 lim.dma_alignment = ub->params.dma.alignment;
4442
4443 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4444 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4445 lim.max_segment_size = ub->params.seg.max_segment_size;
4446 lim.max_segments = ub->params.seg.max_segments;
4447 }
4448
4449 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4450 const struct ublk_param_integrity *p = &ub->params.integrity;
4451 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4452
4453 lim.max_integrity_segments =
4454 p->max_integrity_segments ?: USHRT_MAX;
4455 lim.integrity = (struct blk_integrity) {
4456 .flags = ublk_integrity_flags(p->flags),
4457 .csum_type = ublk_integrity_csum_type(p->csum_type),
4458 .metadata_size = p->metadata_size,
4459 .pi_offset = p->pi_offset,
4460 .interval_exp = p->interval_exp,
4461 .tag_size = p->tag_size,
4462 .pi_tuple_size = pi_tuple_size,
4463 };
4464 }
4465
4466 if (wait_for_completion_interruptible(&ub->completion) != 0)
4467 return -EINTR;
4468
4469 if (!ublk_validate_user_pid(ub, ublksrv_pid))
4470 return -EINVAL;
4471
4472 mutex_lock(&ub->mutex);
4473 /* the device may no longer be ready in case of F_BATCH */
4474 if (!ublk_dev_ready(ub)) {
4475 ret = -EINVAL;
4476 goto out_unlock;
4477 }
4478 if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4479 test_bit(UB_STATE_USED, &ub->state)) {
4480 ret = -EEXIST;
4481 goto out_unlock;
4482 }
4483
4484 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4485 if (IS_ERR(disk)) {
4486 ret = PTR_ERR(disk);
4487 goto out_unlock;
4488 }
4489 sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4490 disk->fops = &ub_fops;
4491 disk->private_data = ub;
4492
4493 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4494 ub->ub_disk = disk;
4495
4496 ublk_apply_params(ub);
4497
4498 /*
4499 * Suppress partition scan to avoid potential IO hang.
4500 *
4501 * If a ublk server error occurs during the partition scan, IO may
4502 * wait while holding ub->mutex, which can deadlock with other
4503 * operations that need the mutex. Defer the partition scan to async
4504 * work instead.
4505 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4506 * permanently.
4507 */
4508 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4509
4510 ublk_get_device(ub);
4511 ub->dev_info.state = UBLK_S_DEV_LIVE;
4512
4513 if (ublk_dev_is_zoned(ub)) {
4514 ret = ublk_revalidate_disk_zones(ub);
4515 if (ret)
4516 goto out_put_cdev;
4517 }
4518
4519 ret = add_disk(disk);
4520 if (ret)
4521 goto out_put_cdev;
4522
4523 set_bit(UB_STATE_USED, &ub->state);
4524
4525 /* Skip partition scan if disabled by user */
4526 if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4527 /* Don't clear it for unprivileged daemons; see the comment above */
4528 if (!ub->unprivileged_daemons)
4529 clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4530 } else {
4531 /* Schedule async partition scan for trusted daemons */
4532 if (!ub->unprivileged_daemons)
4533 schedule_work(&ub->partition_scan_work);
4534 }
4535
4536 out_put_cdev:
4537 if (ret) {
4538 ublk_detach_disk(ub);
4539 ublk_put_device(ub);
4540 }
4541 if (ret)
4542 put_disk(disk);
4543 out_unlock:
4544 mutex_unlock(&ub->mutex);
4545 return ret;
4546 }
4547
4548 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4549 const struct ublksrv_ctrl_cmd *header)
4550 {
4551 void __user *argp = (void __user *)(unsigned long)header->addr;
4552 cpumask_var_t cpumask;
4553 unsigned long queue;
4554 unsigned int retlen;
4555 unsigned int i;
4556 int ret;
4557
4558 if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4559 return -EINVAL;
4560 if (header->len & (sizeof(unsigned long)-1))
4561 return -EINVAL;
4562 if (!header->addr)
4563 return -EINVAL;
4564
4565 queue = header->data[0];
4566 if (queue >= ub->dev_info.nr_hw_queues)
4567 return -EINVAL;
4568
4569 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4570 return -ENOMEM;
4571
4572 for_each_possible_cpu(i) {
4573 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4574 cpumask_set_cpu(i, cpumask);
4575 }
4576
4577 ret = -EFAULT;
4578 retlen = min_t(unsigned short, header->len, cpumask_size());
4579 if (copy_to_user(argp, cpumask, retlen))
4580 goto out_free_cpumask;
4581 if (retlen != header->len &&
4582 clear_user(argp + retlen, header->len - retlen))
4583 goto out_free_cpumask;
4584
4585 ret = 0;
4586 out_free_cpumask:
4587 free_cpumask_var(cpumask);
4588 return ret;
4589 }
4590
4591 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4592 {
4593 pr_devel("%s: dev id %d flags %llx\n", __func__,
4594 info->dev_id, info->flags);
4595 pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4596 info->nr_hw_queues, info->queue_depth);
4597 }
4598
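/*
 * UBLK_CMD_ADD_DEV: validate the requested device info and feature flags,
 * allocate the device number, tag set and queues, then register the
 * ublkc%d char device so the server can start driving the queues. The
 * negotiated flags are copied back to userspace as the feature set.
 */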
4599 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
4600 {
4601 void __user *argp = (void __user *)(unsigned long)header->addr;
4602 struct ublksrv_ctrl_dev_info info;
4603 struct ublk_device *ub;
4604 int ret = -EINVAL;
4605
4606 if (header->len < sizeof(info) || !header->addr)
4607 return -EINVAL;
4608 if (header->queue_id != (u16)-1) {
4609 pr_warn("%s: queue_id is wrong %x\n",
4610 __func__, header->queue_id);
4611 return -EINVAL;
4612 }
4613
4614 if (copy_from_user(&info, argp, sizeof(info)))
4615 return -EFAULT;
4616
4617 if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
4618 info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
4619 return -EINVAL;
4620
4621 if (capable(CAP_SYS_ADMIN))
4622 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
4623 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
4624 return -EPERM;
4625
4626 /* forbid nonsense combinations of recovery flags */
4627 switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
4628 case 0:
4629 case UBLK_F_USER_RECOVERY:
4630 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
4631 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
4632 break;
4633 default:
4634 pr_warn("%s: invalid recovery flags %llx\n", __func__,
4635 info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
4636 return -EINVAL;
4637 }
4638
4639 if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
4640 pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
4641 return -EINVAL;
4642 }
4643
4644 /*
4645 * An unprivileged device can't be trusted, and RECOVERY and
4646 * RECOVERY_REISSUE may still hang error handling, so recovery
4647 * features can't be supported for unprivileged ublk for now.
4648 *
4649 * TODO: guarantee forward progress for the RECOVERY handler, so that
4650 * unprivileged devices can benefit from it
4651 */
4652 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
4653 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
4654 UBLK_F_USER_RECOVERY);
4655
4656 /*
4657 * For USER_COPY, we depend on userspace to fill the request
4658 * buffer via pwrite() to the ublk char device, which can't be
4659 * used for an unprivileged device.
4660 *
4661 * The same applies to zero copy and auto buffer registration.
4662 */
4663 if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4664 UBLK_F_AUTO_BUF_REG))
4665 return -EINVAL;
4666 }
4667
4668 /* User copy is required to access integrity buffer */
4669 if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
4670 return -EINVAL;
4671
4672 /* the created device is always owned by current user */
4673 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
4674
4675 if (header->dev_id != info.dev_id) {
4676 pr_warn("%s: dev id not match %u %u\n",
4677 __func__, header->dev_id, info.dev_id);
4678 return -EINVAL;
4679 }
4680
4681 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
4682 pr_warn("%s: dev id is too large. Max supported is %d\n",
4683 __func__, UBLK_MAX_UBLKS - 1);
4684 return -EINVAL;
4685 }
4686
4687 ublk_dump_dev_info(&info);
4688
4689 ret = mutex_lock_killable(&ublk_ctl_mutex);
4690 if (ret)
4691 return ret;
4692
4693 ret = -EACCES;
4694 if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
4695 unprivileged_ublks_added >= unprivileged_ublks_max)
4696 goto out_unlock;
4697
4698 ret = -ENOMEM;
4699 ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
4700 if (!ub)
4701 goto out_unlock;
4702 mutex_init(&ub->mutex);
4703 spin_lock_init(&ub->lock);
4704 mutex_init(&ub->cancel_mutex);
4705 mt_init(&ub->buf_tree);
4706 ida_init(&ub->buf_ida);
4707 INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
4708
4709 ret = ublk_alloc_dev_number(ub, header->dev_id);
4710 if (ret < 0)
4711 goto out_free_ub;
4712
4713 memcpy(&ub->dev_info, &info, sizeof(info));
4714
4715 /* update device id */
4716 ub->dev_info.dev_id = ub->ub_number;
4717
4718 /*
4719 * The 64-bit flags are copied back to userspace as the feature
4720 * negotiation result, so clear any flags the driver doesn't
4721 * support yet; userspace then gets the correct set of flags
4722 * (features) to handle.
4723 */
4724 ub->dev_info.flags &= UBLK_F_ALL;
4725
4726 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
4727 UBLK_F_URING_CMD_COMP_IN_TASK |
4728 UBLK_F_PER_IO_DAEMON |
4729 UBLK_F_BUF_REG_OFF_DAEMON |
4730 UBLK_F_SAFE_STOP_DEV;
4731
4732 /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
4733 if (ublk_dev_support_batch_io(ub))
4734 ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
4735
4736 /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
4737 if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4738 UBLK_F_AUTO_BUF_REG))
4739 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4740
4741 /* UBLK_F_BATCH_IO doesn't support GET_DATA */
4742 if (ublk_dev_support_batch_io(ub))
4743 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4744
4745 /*
4746 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
4747 * returning the write_append_lba, which is only allowed with
4748 * user copy or zero copy
4749 */
4750 if (ublk_dev_is_zoned(ub) &&
4751 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
4752 (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
4753 ret = -EINVAL;
4754 goto out_free_dev_number;
4755 }
4756
4757 ub->dev_info.nr_hw_queues = min_t(unsigned int,
4758 ub->dev_info.nr_hw_queues, nr_cpu_ids);
4759 ublk_align_max_io_size(ub);
4760
4761 ret = ublk_add_tag_set(ub);
4762 if (ret)
4763 goto out_free_dev_number;
4764
4765 ret = ublk_init_queues(ub);
4766 if (ret)
4767 goto out_free_tag_set;
4768
4769 ret = -EFAULT;
4770 if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
4771 goto out_deinit_queues;
4772
4773 /*
4774 * Add the char dev so that the ublksrv daemon can be set up.
4775 * ublk_add_chdev() cleans up everything if it fails.
4776 */
4777 ret = ublk_add_chdev(ub);
4778 goto out_unlock;
4779
4780 out_deinit_queues:
4781 ublk_deinit_queues(ub);
4782 out_free_tag_set:
4783 blk_mq_free_tag_set(&ub->tag_set);
4784 out_free_dev_number:
4785 ublk_free_dev_number(ub);
4786 out_free_ub:
4787 mutex_destroy(&ub->mutex);
4788 mutex_destroy(&ub->cancel_mutex);
4789 kfree(ub);
4790 out_unlock:
4791 mutex_unlock(&ublk_ctl_mutex);
4792 return ret;
4793 }
4794
4795 static inline bool ublk_idr_freed(int id)
4796 {
4797 void *ptr;
4798
4799 spin_lock(&ublk_idr_lock);
4800 ptr = idr_find(&ublk_index_idr, id);
4801 spin_unlock(&ublk_idr_lock);
4802
4803 return ptr == NULL;
4804 }
4805
4806 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4807 {
4808 struct ublk_device *ub = *p_ub;
4809 int idx = ub->ub_number;
4810 int ret;
4811
4812 ret = mutex_lock_killable(&ublk_ctl_mutex);
4813 if (ret)
4814 return ret;
4815
4816 if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4817 ublk_remove(ub);
4818 set_bit(UB_STATE_DELETED, &ub->state);
4819 }
4820
4821 /* Mark the reference as consumed */
4822 *p_ub = NULL;
4823 ublk_put_device(ub);
4824 mutex_unlock(&ublk_ctl_mutex);
4825
4826 /*
4827 * Wait until the idr entry is removed, so the index can be reused
4828 * after the DEL_DEV command returns.
4829 *
4830 * If we return because of a user interrupt, a future delete command
4831 * may come:
4832 *
4833 * - the device number isn't freed, this device won't or needn't
4834 * be deleted again, since UB_STATE_DELETED is set, and device
4835 * will be released after the last reference is dropped
4836 *
4837 * - the device number is freed already, we will not find this
4838 * device via ublk_get_device_from_id()
4839 */
4840 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4841 return -EINTR;
4842 return 0;
4843 }
4844
4845 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4846 const struct ublksrv_ctrl_cmd *header)
4847 {
4848 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4849 __func__, cmd_op, header->dev_id, header->queue_id,
4850 header->data[0], header->addr, header->len);
4851 }
4852
4853 static void ublk_ctrl_stop_dev(struct ublk_device *ub)
4854 {
4855 ublk_stop_dev(ub);
4856 }
4857
4858 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4859 {
4860 struct gendisk *disk;
4861 int ret = 0;
4862
4863 disk = ublk_get_disk(ub);
4864 if (!disk)
4865 return -ENODEV;
4866
4867 mutex_lock(&disk->open_mutex);
4868 if (disk_openers(disk) > 0) {
4869 ret = -EBUSY;
4870 goto unlock;
4871 }
4872 ub->block_open = true;
4873 /* release open_mutex as del_gendisk() will reacquire it */
4874 mutex_unlock(&disk->open_mutex);
4875
4876 ublk_ctrl_stop_dev(ub);
4877 goto out;
4878
4879 unlock:
4880 mutex_unlock(&disk->open_mutex);
4881 out:
4882 ublk_put_disk(disk);
4883 return ret;
4884 }
4885
4886 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4887 const struct ublksrv_ctrl_cmd *header)
4888 {
4889 struct task_struct *p;
4890 struct pid *pid;
4891 struct ublksrv_ctrl_dev_info dev_info;
4892 pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4893 void __user *argp = (void __user *)(unsigned long)header->addr;
4894
4895 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4896 return -EINVAL;
4897
4898 memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4899 dev_info.ublksrv_pid = -1;
4900
4901 if (init_ublksrv_tgid > 0) {
4902 rcu_read_lock();
4903 pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4904 p = pid_task(pid, PIDTYPE_TGID);
4905 if (p) {
4906 int vnr = task_tgid_vnr(p);
4907
4908 if (vnr)
4909 dev_info.ublksrv_pid = vnr;
4910 }
4911 rcu_read_unlock();
4912 }
4913
4914 if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4915 return -EFAULT;
4916
4917 return 0;
4918 }
4919
4920 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4921 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4922 {
4923 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4924 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4925
4926 if (ub->ub_disk) {
4927 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4928 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4929 } else {
4930 ub->params.devt.disk_major = 0;
4931 ub->params.devt.disk_minor = 0;
4932 }
4933 ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4934 }
4935
4936 static int ublk_ctrl_get_params(struct ublk_device *ub,
4937 const struct ublksrv_ctrl_cmd *header)
4938 {
4939 void __user *argp = (void __user *)(unsigned long)header->addr;
4940 struct ublk_params_header ph;
4941 int ret;
4942
4943 if (header->len <= sizeof(ph) || !header->addr)
4944 return -EINVAL;
4945
4946 if (copy_from_user(&ph, argp, sizeof(ph)))
4947 return -EFAULT;
4948
4949 if (ph.len > header->len || !ph.len)
4950 return -EINVAL;
4951
4952 if (ph.len > sizeof(struct ublk_params))
4953 ph.len = sizeof(struct ublk_params);
4954
4955 mutex_lock(&ub->mutex);
4956 ublk_ctrl_fill_params_devt(ub);
4957 if (copy_to_user(argp, &ub->params, ph.len))
4958 ret = -EFAULT;
4959 else
4960 ret = 0;
4961 mutex_unlock(&ub->mutex);
4962
4963 return ret;
4964 }
4965
4966 static int ublk_ctrl_set_params(struct ublk_device *ub,
4967 const struct ublksrv_ctrl_cmd *header)
4968 {
4969 void __user *argp = (void __user *)(unsigned long)header->addr;
4970 struct ublk_params_header ph;
4971 int ret = -EFAULT;
4972
4973 if (header->len <= sizeof(ph) || !header->addr)
4974 return -EINVAL;
4975
4976 if (copy_from_user(&ph, argp, sizeof(ph)))
4977 return -EFAULT;
4978
4979 if (ph.len > header->len || !ph.len || !ph.types)
4980 return -EINVAL;
4981
4982 if (ph.len > sizeof(struct ublk_params))
4983 ph.len = sizeof(struct ublk_params);
4984
4985 mutex_lock(&ub->mutex);
4986 if (test_bit(UB_STATE_USED, &ub->state)) {
4987 /*
4988 * Parameters can only be changed while the device hasn't
4989 * been started yet
4990 */
4991 ret = -EACCES;
4992 } else if (copy_from_user(&ub->params, argp, ph.len)) {
4993 ret = -EFAULT;
4994 } else {
4995 /* clear all we don't support yet */
4996 ub->params.types &= UBLK_PARAM_TYPE_ALL;
4997 ret = ublk_validate_params(ub);
4998 if (ret)
4999 ub->params.types = 0;
5000 }
5001 mutex_unlock(&ub->mutex);
5002
5003 return ret;
5004 }
5005
5006 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
5007 {
5008 int ret = -EINVAL;
5009
5010 mutex_lock(&ub->mutex);
5011 if (ublk_nosrv_should_stop_dev(ub))
5012 goto out_unlock;
5013 /*
5014 * START_RECOVERY is only allowed after:
5015 *
5016 * (1) UB_STATE_OPEN is not set, which means the dying process has exited
5017 * and its io_uring ctx is freed, so the file struct of /dev/ublkcX is
5018 * released.
5019 *
5020 * and one of the following holds:
5021 *
5022 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
5023 * (a) has quiesced the request queue
5024 * (b) has requeued every inflight rq whose io_flags is ACTIVE
5025 * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
5026 * (d) has completed/canceled all ioucmds owned by the dying process
5027 *
5028 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
5029 * quiesced, but all I/O is being immediately errored
5030 */
5031 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
5032 ret = -EBUSY;
5033 goto out_unlock;
5034 }
5035 pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
5036 init_completion(&ub->completion);
5037 ret = 0;
5038 out_unlock:
5039 mutex_unlock(&ub->mutex);
5040 return ret;
5041 }
5042
5043 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
5044 const struct ublksrv_ctrl_cmd *header)
5045 {
5046 int ublksrv_pid = (int)header->data[0];
5047 int ret = -EINVAL;
5048
5049 pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
5050 header->dev_id);
5051
5052 if (wait_for_completion_interruptible(&ub->completion))
5053 return -EINTR;
5054
5055 pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
5056 header->dev_id);
5057
5058 if (!ublk_validate_user_pid(ub, ublksrv_pid))
5059 return -EINVAL;
5060
5061 mutex_lock(&ub->mutex);
5062 if (ublk_nosrv_should_stop_dev(ub))
5063 goto out_unlock;
5064
5065 if (!ublk_dev_in_recoverable_state(ub)) {
5066 ret = -EBUSY;
5067 goto out_unlock;
5068 }
5069 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
5070 ub->dev_info.state = UBLK_S_DEV_LIVE;
5071 pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
5072 __func__, ublksrv_pid, header->dev_id);
5073 blk_mq_kick_requeue_list(ub->ub_disk->queue);
5074 ret = 0;
5075 out_unlock:
5076 mutex_unlock(&ub->mutex);
5077 return ret;
5078 }
5079
5080 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5081 {
5082 void __user *argp = (void __user *)(unsigned long)header->addr;
5083 u64 features = UBLK_F_ALL;
5084
5085 if (header->len != UBLK_FEATURES_LEN || !header->addr)
5086 return -EINVAL;
5087
5088 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5089 return -EFAULT;
5090
5091 return 0;
5092 }
5093
5094 static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5095 {
5096 struct ublk_param_basic *p = &ub->params.basic;
5097 u64 new_size = header->data[0];
5098 int ret = 0;
5099
5100 mutex_lock(&ub->mutex);
5101 if (!ub->ub_disk) {
5102 ret = -ENODEV;
5103 goto out;
5104 }
5105 p->dev_sectors = new_size;
5106 set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5107 out:
5108 mutex_unlock(&ub->mutex);
5109 return ret;
5110 }
5111
5112 struct count_busy {
5113 const struct ublk_queue *ubq;
5114 unsigned int nr_busy;
5115 };
5116
5117 static bool ublk_count_busy_req(struct request *rq, void *data)
5118 {
5119 struct count_busy *idle = data;
5120
5121 if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5122 idle->nr_busy += 1;
5123 return true;
5124 }
5125
5126 /* uring_cmd is guaranteed to be active if the associated request is idle */
5127 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5128 {
5129 struct count_busy data = {
5130 .ubq = ubq,
5131 };
5132
5133 blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5134 return data.nr_busy < ubq->q_depth;
5135 }
5136
5137 /* Wait until each hw queue has at least one idle IO */
5138 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5139 unsigned int timeout_ms)
5140 {
5141 unsigned int elapsed = 0;
5142 int ret;
5143
5144 /*
5145 * For UBLK_F_BATCH_IO the ublk server can be notified via an existing
5146 * or a new fetch command, so there is no need to wait here
5147 */
5148 if (ublk_dev_support_batch_io(ub))
5149 return 0;
5150
5151 while (elapsed < timeout_ms && !signal_pending(current)) {
5152 unsigned int queues_cancelable = 0;
5153 int i;
5154
5155 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5156 struct ublk_queue *ubq = ublk_get_queue(ub, i);
5157
5158 queues_cancelable += !!ubq_has_idle_io(ubq);
5159 }
5160
5161 /*
5162 * Each queue needs at least one active command for
5163 * notifying the ublk server
5164 */
5165 if (queues_cancelable == ub->dev_info.nr_hw_queues)
5166 break;
5167
5168 msleep(UBLK_REQUEUE_DELAY_MS);
5169 elapsed += UBLK_REQUEUE_DELAY_MS;
5170 }
5171
5172 if (signal_pending(current))
5173 ret = -EINTR;
5174 else if (elapsed >= timeout_ms)
5175 ret = -EBUSY;
5176 else
5177 ret = 0;
5178
5179 return ret;
5180 }
5181
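/*
 * UBLK_CMD_QUIESCE_DEV: mark all queues as canceling while the request
 * queue is quiesced, then wait (up to the caller-supplied timeout, 0 means
 * forever) until every hw queue has at least one idle io whose uring_cmd
 * can be canceled, and finally cancel the pending commands so the server
 * gets notified.
 */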
5182 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5183 const struct ublksrv_ctrl_cmd *header)
5184 {
5185 /* zero means wait forever */
5186 u64 timeout_ms = header->data[0];
5187 struct gendisk *disk;
5188 int ret = -ENODEV;
5189
5190 if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5191 return -EOPNOTSUPP;
5192
5193 mutex_lock(&ub->mutex);
5194 disk = ublk_get_disk(ub);
5195 if (!disk)
5196 goto unlock;
5197 if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5198 goto put_disk;
5199
5200 ret = 0;
5201 /* already in expected state */
5202 if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5203 goto put_disk;
5204
5205 /* Mark the device as canceling */
5206 mutex_lock(&ub->cancel_mutex);
5207 blk_mq_quiesce_queue(disk->queue);
5208 ublk_set_canceling(ub, true);
5209 blk_mq_unquiesce_queue(disk->queue);
5210 mutex_unlock(&ub->cancel_mutex);
5211
5212 if (!timeout_ms)
5213 timeout_ms = UINT_MAX;
5214 ret = ublk_wait_for_idle_io(ub, timeout_ms);
5215
5216 put_disk:
5217 ublk_put_disk(disk);
5218 unlock:
5219 mutex_unlock(&ub->mutex);
5220
5221 /* Cancel pending uring_cmd */
5222 if (!ret)
5223 ublk_cancel_dev(ub);
5224 return ret;
5225 }
5226
5227 /*
5228 * All control commands are sent via /dev/ublk-control, so we have to check
5229 * the destination device's permission
5230 */
5231 static int ublk_char_dev_permission(struct ublk_device *ub,
5232 const char *dev_path, int mask)
5233 {
5234 int err;
5235 struct path path;
5236 struct kstat stat;
5237
5238 err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5239 if (err)
5240 return err;
5241
5242 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5243 if (err)
5244 goto exit;
5245
5246 err = -EPERM;
5247 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5248 goto exit;
5249
5250 err = inode_permission(&nop_mnt_idmap,
5251 d_backing_inode(path.dentry), mask);
5252 exit:
5253 path_put(&path);
5254 return err;
5255 }
5256
5257 /*
5258 * Lock for maple tree modification: acquire ub->mutex, then freeze queue
5259 * if device is started. If device is not yet started, only mutex is
5260 * needed since no I/O path can access the tree.
5261 *
5262 * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5263 * already holds ub->mutex when calling del_gendisk() which freezes the queue.
5264 */
ublk_lock_buf_tree(struct ublk_device * ub)5265 static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
5266 {
5267 unsigned int memflags = 0;
5268
5269 mutex_lock(&ub->mutex);
5270 if (ub->ub_disk)
5271 memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
5272
5273 return memflags;
5274 }
5275
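/*
 * Counterpart of ublk_lock_buf_tree(): unfreeze the queue (if it was
 * frozen) and release ub->mutex.
 */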
static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
{
	if (ub->ub_disk)
		blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
	mutex_unlock(&ub->mutex);
}

/* Erase coalesced PFN ranges from the maple tree matching buf_index */
static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
{
	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
	struct ublk_buf_range *range;

	mas_lock(&mas);
	mas_for_each(&mas, range, ULONG_MAX) {
		if (range->buf_index == buf_index) {
			mas_erase(&mas);
			kfree(range);
		}
	}
	mas_unlock(&mas);
}

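/*
 * Insert the pinned pages into the buffer maple tree as coalesced PFN
 * ranges keyed by PFN, with each range remembering its buffer index, flags
 * and byte offset into the registered buffer. On failure, any ranges
 * already inserted for this index are erased again.
 */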
static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
			       struct page **pages, unsigned long nr_pages,
			       int index, unsigned short flags)
{
	unsigned long i;
	int ret;

	for (i = 0; i < nr_pages; i++) {
		unsigned long pfn = page_to_pfn(pages[i]);
		unsigned long start = i;
		struct ublk_buf_range *range;

		/* Find run of consecutive PFNs */
		while (i + 1 < nr_pages &&
		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
			i++;

		range = kzalloc(sizeof(*range), GFP_KERNEL);
		if (!range) {
			ret = -ENOMEM;
			goto unwind;
		}
		range->buf_index = index;
		range->flags = flags;
		range->base_offset = start << PAGE_SHIFT;

		ret = mtree_insert_range(&ub->buf_tree, pfn,
					 pfn + (i - start),
					 range, GFP_KERNEL);
		if (ret) {
			kfree(range);
			goto unwind;
		}
	}
	return 0;

unwind:
	ublk_buf_erase_ranges(ub, index);
	return ret;
}

/*
 * Register a shared memory buffer for zero-copy I/O.
 * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
 * internally. Returns buffer index (>= 0) on success.
 */
static int ublk_ctrl_reg_buf(struct ublk_device *ub,
			     struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_shmem_buf_reg buf_reg;
	unsigned long nr_pages;
	struct page **pages = NULL;
	unsigned int gup_flags;
	unsigned int memflags;
	long pinned;
	int index;
	int ret;

	if (!ublk_dev_support_shmem_zc(ub))
		return -EOPNOTSUPP;

	memset(&buf_reg, 0, sizeof(buf_reg));
	if (copy_from_user(&buf_reg, argp,
			   min_t(size_t, header->len, sizeof(buf_reg))))
		return -EFAULT;

	if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
		return -EINVAL;

	if (buf_reg.reserved)
		return -EINVAL;

	if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
	    !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
		return -EINVAL;

	nr_pages = buf_reg.len >> PAGE_SHIFT;

	/* Pin pages before any locks (may sleep) */
	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	gup_flags = FOLL_LONGTERM;
	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
		gup_flags |= FOLL_WRITE;

	pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
	if (pinned < 0) {
		ret = pinned;
		goto err_free_pages;
	}
	if (pinned != nr_pages) {
		ret = -EFAULT;
		goto err_unpin;
	}

	memflags = ublk_lock_buf_tree(ub);

	index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
	if (index < 0) {
		ret = index;
		goto err_unlock;
	}

	ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
	if (ret) {
		ida_free(&ub->buf_ida, index);
		goto err_unlock;
	}

	ublk_unlock_buf_tree(ub, memflags);
	kvfree(pages);
	return index;

err_unlock:
	ublk_unlock_buf_tree(ub, memflags);
err_unpin:
	unpin_user_pages(pages, pinned);
err_free_pages:
	kvfree(pages);
	return ret;
}

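/*
 * Unpin a contiguous PFN range in small batches so that only a small,
 * fixed-size page array is needed on the stack.
 */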
static void ublk_unpin_range_pages(unsigned long base_pfn,
				   unsigned long nr_pages)
{
#define UBLK_UNPIN_BATCH 32
	struct page *pages[UBLK_UNPIN_BATCH];
	unsigned long off;

	for (off = 0; off < nr_pages; ) {
		unsigned int batch = min_t(unsigned long,
				nr_pages - off, UBLK_UNPIN_BATCH);
		unsigned int j;

		for (j = 0; j < batch; j++)
			pages[j] = pfn_to_page(base_pfn + off + j);
		unpin_user_pages(pages, batch);
		off += batch;
	}
}

/*
 * Inner loop: erase and free up to UBLK_REMOVE_BATCH matching ranges under
 * mas_lock, collecting their PFN extents into an xarray. Then drop the
 * lock and unpin the pages outside spinlock context.
 *
 * Returns true if the tree walk completed, false if more ranges remain.
 * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value().
 */
#define UBLK_REMOVE_BATCH 64

static bool __ublk_shmem_remove_ranges(struct ublk_device *ub,
				       int buf_index, int *ret)
{
	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
	struct ublk_buf_range *range;
	struct xarray to_unpin;
	unsigned long idx;
	unsigned int count = 0;
	bool done = false;
	void *entry;

	xa_init(&to_unpin);

	mas_lock(&mas);
	mas_for_each(&mas, range, ULONG_MAX) {
		unsigned long nr;

		if (buf_index >= 0 && range->buf_index != buf_index)
			continue;

		*ret = 0;
		nr = mas.last - mas.index + 1;
		if (xa_err(xa_store(&to_unpin, mas.index,
				    xa_mk_value(nr), GFP_ATOMIC)))
			goto unlock;
		mas_erase(&mas);
		kfree(range);
		if (++count >= UBLK_REMOVE_BATCH)
			goto unlock;
	}
	done = true;
unlock:
	mas_unlock(&mas);

	xa_for_each(&to_unpin, idx, entry)
		ublk_unpin_range_pages(idx, xa_to_value(entry));
	xa_destroy(&to_unpin);

	return done;
}

/*
 * Remove ranges from the maple tree matching buf_index, unpin pages
 * and free range structs. If buf_index < 0, remove all ranges.
 * Processes ranges in batches to avoid holding the maple tree spinlock
 * across potentially expensive page unpinning.
 */
static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index)
{
	int ret = -ENOENT;

	while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret))
		cond_resched();
	return ret;
}

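/*
 * UBLK_CMD_UNREG_BUF: tear down a previously registered shared memory
 * buffer by removing its PFN ranges, unpinning the pages and releasing
 * its buffer index. Returns -ENOENT if no range matches the index.
 */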
static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
			       struct ublksrv_ctrl_cmd *header)
{
	int index = (int)header->data[0];
	unsigned int memflags;
	int ret;

	if (!ublk_dev_support_shmem_zc(ub))
		return -EOPNOTSUPP;

	if (index < 0 || index > USHRT_MAX)
		return -EINVAL;

	memflags = ublk_lock_buf_tree(ub);

	ret = ublk_shmem_remove_ranges(ub, index);
	if (!ret)
		ida_free(&ub->buf_ida, index);

	ublk_unlock_buf_tree(ub, memflags);
	return ret;
}

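/*
 * Release every registered shared memory buffer and the associated
 * bookkeeping: the PFN maple tree and the buffer index IDA.
 */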
static void ublk_buf_cleanup(struct ublk_device *ub)
{
	ublk_shmem_remove_ranges(ub, -1);
	mtree_destroy(&ub->buf_tree);
	ida_destroy(&ub->buf_ida);
}

/* Check if request pages match a registered shared memory buffer */
static bool ublk_try_buf_match(struct ublk_device *ub,
			       struct request *rq,
			       u32 *buf_idx, u32 *buf_off)
{
	struct req_iterator iter;
	struct bio_vec bv;
	int index = -1;
	unsigned long expected_offset = 0;
	bool first = true;

	rq_for_each_bvec(bv, rq, iter) {
		unsigned long pfn = page_to_pfn(bv.bv_page);
		unsigned long end_pfn = pfn +
			((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
		struct ublk_buf_range *range;
		unsigned long off;
		MA_STATE(mas, &ub->buf_tree, pfn, pfn);

		range = mas_walk(&mas);
		if (!range)
			return false;

		/* verify all pages in this bvec fall within the range */
		if (end_pfn > mas.last)
			return false;

		off = range->base_offset +
			(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;

		if (first) {
			/*
			 * A read-only buffer can only back WRITE requests;
			 * anything else would make the kernel write into it.
			 */
			if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
			    req_op(rq) != REQ_OP_WRITE)
				return false;
			index = range->buf_index;
			expected_offset = off;
			*buf_off = off;
			first = false;
		} else {
			if (range->buf_index != index)
				return false;
			if (off != expected_offset)
				return false;
		}
		expected_offset += bv.bv_len;
	}

	if (first)
		return false;

	*buf_idx = index;
	return true;
}

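/*
 * Check whether the caller may issue this control command against the
 * target device. Privileged mode only requires CAP_SYS_ADMIN; for
 * unprivileged devices the permissions of the device's char node are
 * checked, using the path userspace passes at the start of the payload.
 */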
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
{
	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	char *dev_path = NULL;
	int ret = 0;
	int mask;

	if (!unprivileged) {
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		/*
		 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
		 * char_dev_path in its payload too, since userspace may not
		 * know whether the specified device was created in
		 * unprivileged mode.
		 */
		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
			return 0;
	}

	/*
	 * The user has to provide the char device path for unprivileged ublk.
	 *
	 * header->addr always points to the dev path buffer, and
	 * header->dev_path_len records the length of the dev path buffer.
	 */
	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
		return -EINVAL;

	if (header->len < header->dev_path_len)
		return -EINVAL;

	dev_path = memdup_user_nul(argp, header->dev_path_len);
	if (IS_ERR(dev_path))
		return PTR_ERR(dev_path);

	ret = -EINVAL;
	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
	case UBLK_CMD_GET_QUEUE_AFFINITY:
	case UBLK_CMD_GET_PARAMS:
	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
		mask = MAY_READ;
		break;
	case UBLK_CMD_START_DEV:
	case UBLK_CMD_STOP_DEV:
	case UBLK_CMD_ADD_DEV:
	case UBLK_CMD_DEL_DEV:
	case UBLK_CMD_SET_PARAMS:
	case UBLK_CMD_START_USER_RECOVERY:
	case UBLK_CMD_END_USER_RECOVERY:
	case UBLK_CMD_UPDATE_SIZE:
	case UBLK_CMD_QUIESCE_DEV:
	case UBLK_CMD_TRY_STOP_DEV:
	case UBLK_CMD_REG_BUF:
	case UBLK_CMD_UNREG_BUF:
		mask = MAY_READ | MAY_WRITE;
		break;
	default:
		goto exit;
	}

	ret = ublk_char_dev_permission(ub, dev_path, mask);
	if (!ret) {
		header->len -= header->dev_path_len;
		header->addr += header->dev_path_len;
	}
	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
			__func__, ub->ub_number, cmd_op,
			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
			dev_path, ret);
exit:
	kfree(dev_path);
	return ret;
}

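/*
 * Read-only query commands can complete without blocking; everything else
 * may sleep and is retried outside IO_URING_F_NONBLOCK context.
 */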
static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
{
	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_GET_QUEUE_AFFINITY:
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
		return false;
	default:
		return true;
	}
}

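/*
 * Central dispatcher for /dev/ublk-control uring_cmds: copy the SQE
 * payload into a local header, validate the command, look up the target
 * device (except for ADD_DEV), check permission and route to the handler.
 */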
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	/* May point to userspace-mapped memory */
	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
			struct ublksrv_ctrl_cmd);
	struct ublksrv_ctrl_cmd header;
	struct ublk_device *ub = NULL;
	u32 cmd_op = cmd->cmd_op;
	int ret = -EINVAL;

	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
	    issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	if (!(issue_flags & IO_URING_F_SQE128))
		return -EINVAL;

	header.dev_id = READ_ONCE(ub_src->dev_id);
	header.queue_id = READ_ONCE(ub_src->queue_id);
	header.len = READ_ONCE(ub_src->len);
	header.addr = READ_ONCE(ub_src->addr);
	header.data[0] = READ_ONCE(ub_src->data[0]);
	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
	ublk_ctrl_cmd_dump(cmd_op, &header);

	ret = ublk_check_cmd_op(cmd_op);
	if (ret)
		goto out;

	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
		ret = ublk_ctrl_get_features(&header);
		goto out;
	}

	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
		ret = -ENODEV;
		ub = ublk_get_device_from_id(header.dev_id);
		if (!ub)
			goto out;

		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
		if (ret)
			goto put_dev;
	}

	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_START_DEV:
		ret = ublk_ctrl_start_dev(ub, &header);
		break;
	case UBLK_CMD_STOP_DEV:
		ublk_ctrl_stop_dev(ub);
		ret = 0;
		break;
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
		ret = ublk_ctrl_get_dev_info(ub, &header);
		break;
	case UBLK_CMD_ADD_DEV:
		ret = ublk_ctrl_add_dev(&header);
		break;
	case UBLK_CMD_DEL_DEV:
		ret = ublk_ctrl_del_dev(&ub, true);
		break;
	case UBLK_CMD_DEL_DEV_ASYNC:
		ret = ublk_ctrl_del_dev(&ub, false);
		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
		ret = ublk_ctrl_get_queue_affinity(ub, &header);
		break;
	case UBLK_CMD_GET_PARAMS:
		ret = ublk_ctrl_get_params(ub, &header);
		break;
	case UBLK_CMD_SET_PARAMS:
		ret = ublk_ctrl_set_params(ub, &header);
		break;
	case UBLK_CMD_START_USER_RECOVERY:
		ret = ublk_ctrl_start_recovery(ub);
		break;
	case UBLK_CMD_END_USER_RECOVERY:
		ret = ublk_ctrl_end_recovery(ub, &header);
		break;
	case UBLK_CMD_UPDATE_SIZE:
		ret = ublk_ctrl_set_size(ub, &header);
		break;
	case UBLK_CMD_QUIESCE_DEV:
		ret = ublk_ctrl_quiesce_dev(ub, &header);
		break;
	case UBLK_CMD_TRY_STOP_DEV:
		ret = ublk_ctrl_try_stop_dev(ub);
		break;
	case UBLK_CMD_REG_BUF:
		ret = ublk_ctrl_reg_buf(ub, &header);
		break;
	case UBLK_CMD_UNREG_BUF:
		ret = ublk_ctrl_unreg_buf(ub, &header);
		break;
	default:
		ret = -EOPNOTSUPP;
		break;
	}

put_dev:
	if (ub)
		ublk_put_device(ub);
out:
	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
			__func__, ret, cmd_op, header.dev_id, header.queue_id);
	return ret;
}

static const struct file_operations ublk_ctl_fops = {
	.open = nonseekable_open,
	.uring_cmd = ublk_ctrl_uring_cmd,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static struct miscdevice ublk_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "ublk-control",
	.fops = &ublk_ctl_fops,
};

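/*
 * Module init: register the /dev/ublk-control misc device, reserve the
 * ublk char device number range and register the ublk char device class.
 */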
static int __init ublk_init(void)
{
	int ret;

	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
	/*
	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
	 */
	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
			UBLKSRV_IO_INTEGRITY_FLAG);
	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);

	init_waitqueue_head(&ublk_idr_wq);

	ret = misc_register(&ublk_misc);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
	if (ret)
		goto unregister_mis;

	ret = class_register(&ublk_chr_class);
	if (ret)
		goto free_chrdev_region;

	return 0;

free_chrdev_region:
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
unregister_mis:
	misc_deregister(&ublk_misc);
	return ret;
}

static void __exit ublk_exit(void)
{
	struct ublk_device *ub;
	int id;

	idr_for_each_entry(&ublk_index_idr, ub, id)
		ublk_remove(ub);

	class_unregister(&ublk_chr_class);
	misc_deregister(&ublk_misc);

	idr_destroy(&ublk_index_idr);
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
}

module_init(ublk_init);
module_exit(ublk_exit);

static int ublk_set_max_unprivileged_ublks(const char *buf,
					   const struct kernel_param *kp)
{
	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
}

static int ublk_get_max_unprivileged_ublks(char *buf,
					   const struct kernel_param *kp)
{
	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
}

static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
	.set = ublk_set_max_unprivileged_ublks,
	.get = ublk_get_max_unprivileged_ublks,
};

module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
		&unprivileged_ublks_max, 0644);
MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to be added (default: 64)");

MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
MODULE_DESCRIPTION("Userspace block device");
MODULE_LICENSE("GPL");