xref: /linux/drivers/block/ublk_drv.c (revision f3e3dbcea15e20f7413afd8c791a496f0b80e80b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - a block device whose IO is handled in userspace
4  *
5  * Makes full use of the io_uring passthrough command for communicating with
6  * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <linux/maple_tree.h>
50 #include <linux/xarray.h>
51 #include <uapi/linux/fs.h>
52 #include <uapi/linux/ublk_cmd.h>
53 
54 #define UBLK_MINORS		(1U << MINORBITS)
55 
56 #define UBLK_INVALID_BUF_IDX 	((u16)-1)
57 
58 /* private ioctl command mirror */
59 #define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
60 #define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
61 #define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
62 #define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
63 #define UBLK_CMD_REG_BUF	_IOC_NR(UBLK_U_CMD_REG_BUF)
64 #define UBLK_CMD_UNREG_BUF	_IOC_NR(UBLK_U_CMD_UNREG_BUF)
65 
66 /* Default max shmem buffer size: 4GB (may be increased in future) */
67 #define UBLK_SHMEM_BUF_SIZE_MAX	(1ULL << 32)
68 
69 #define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
70 #define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
71 
72 /* All UBLK_F_* have to be included into UBLK_F_ALL */
73 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
74 		| UBLK_F_URING_CMD_COMP_IN_TASK \
75 		| UBLK_F_NEED_GET_DATA \
76 		| UBLK_F_USER_RECOVERY \
77 		| UBLK_F_USER_RECOVERY_REISSUE \
78 		| UBLK_F_UNPRIVILEGED_DEV \
79 		| UBLK_F_CMD_IOCTL_ENCODE \
80 		| UBLK_F_USER_COPY \
81 		| UBLK_F_ZONED \
82 		| UBLK_F_USER_RECOVERY_FAIL_IO \
83 		| UBLK_F_UPDATE_SIZE \
84 		| UBLK_F_AUTO_BUF_REG \
85 		| UBLK_F_QUIESCE \
86 		| UBLK_F_PER_IO_DAEMON \
87 		| UBLK_F_BUF_REG_OFF_DAEMON \
88 		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
89 		| UBLK_F_SAFE_STOP_DEV \
90 		| UBLK_F_BATCH_IO \
91 		| UBLK_F_NO_AUTO_PART_SCAN \
92 		| UBLK_F_SHMEM_ZC)
93 
94 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
95 		| UBLK_F_USER_RECOVERY_REISSUE \
96 		| UBLK_F_USER_RECOVERY_FAIL_IO)
97 
98 /* All UBLK_PARAM_TYPE_* should be included here */
99 #define UBLK_PARAM_TYPE_ALL                                \
100 	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
101 	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
102 	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
103 	 UBLK_PARAM_TYPE_INTEGRITY)
104 
105 #define UBLK_BATCH_F_ALL  \
106 	(UBLK_BATCH_F_HAS_ZONE_LBA | \
107 	 UBLK_BATCH_F_HAS_BUF_ADDR | \
108 	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
109 
110 /* ublk batch fetch uring_cmd */
111 struct ublk_batch_fetch_cmd {
112 	struct list_head node;
113 	struct io_uring_cmd *cmd;
114 	unsigned short buf_group;
115 };
116 
117 struct ublk_uring_cmd_pdu {
118 	/*
119 	 * Temporarily store requests of the same batch for queueing them to
120 	 * the daemon context.
121 	 *
122 	 * They could have been stored in the request payload, but we want
123 	 * to avoid extra pre-allocation, and the uring_cmd payload is always
124 	 * free for us.
125 	 */
126 	union {
127 		struct request *req;
128 		struct request *req_list;
129 	};
130 
131 	/*
132 	 * The following two fields are valid for this cmd's whole lifetime,
133 	 * and are set up in the ublk uring_cmd handler
134 	 */
135 	struct ublk_queue *ubq;
136 
137 	union {
138 		u16 tag;
139 		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
140 	};
141 };
142 
143 struct ublk_batch_io_data {
144 	struct ublk_device *ub;
145 	struct io_uring_cmd *cmd;
146 	struct ublk_batch_io header;
147 	unsigned int issue_flags;
148 	struct io_comp_batch *iob;
149 };
150 
151 /*
152  * io command is active: sqe cmd is received, and its cqe isn't done
153  *
154  * If the flag is set, the io command is owned by the ublk driver, and is
155  * waiting for an incoming blk-mq request from the ublk block device.
156  *
157  * If the flag is cleared, the io command will be completed, and is owned
158  * by the ublk server.
159  */
160 #define UBLK_IO_FLAG_ACTIVE	0x01
161 
162 /*
163  * IO command is completed via cqe, is being handled by ublksrv, and is
164  * not committed yet
165  *
166  * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
167  * for cross verification
168  */
169 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
170 
171 /*
172  * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires
173  * getting the data buffer address from ublksrv.
174  *
175  * Then, bio data could be copied into this data buffer for a WRITE request
176  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
177  */
178 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
179 
180 /*
181  * The request buffer is registered automatically, so we have to unregister
182  * it before completing this request.
183  *
184  * io_uring will unregister the buffer automatically for us on exit.
185  */
186 #define UBLK_IO_FLAG_AUTO_BUF_REG 	0x10
187 
188 /* atomic RW with ubq->cancel_lock */
189 #define UBLK_IO_FLAG_CANCELED	0x80000000
190 
191 /*
192  * Initialize refcount to a large number to include any registered buffers.
193  * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
194  * any buffers registered on the io daemon task.
195  */
196 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
197 
198 /* used for UBLK_F_BATCH_IO only */
199 #define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
200 
201 union ublk_io_buf {
202 	__u64	addr;
203 	struct ublk_auto_buf_reg auto_reg;
204 };
205 
206 struct ublk_io {
207 	union ublk_io_buf buf;
208 	unsigned int flags;
209 	int res;
210 
211 	union {
212 		/* valid if UBLK_IO_FLAG_ACTIVE is set */
213 		struct io_uring_cmd *cmd;
214 		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
215 		struct request *req;
216 	};
217 
218 	struct task_struct *task;
219 
220 	/*
221 	 * The number of uses of this I/O by the ublk server
222 	 * if user copy or zero copy are enabled:
223 	 * - UBLK_REFCOUNT_INIT from dispatch to the server
224 	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
225 	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
226 	 * - 1 for each io_uring registered buffer not registered on task
227 	 * The I/O can only be completed once all references are dropped.
228 	 * User copy and buffer registration operations are only permitted
229 	 * if the reference count is nonzero.
230 	 */
231 	refcount_t ref;
232 	/* Count of buffers registered on task and not yet unregistered */
233 	unsigned task_registered_buffers;
234 
235 	void *buf_ctx_handle;
236 	spinlock_t lock;
237 } ____cacheline_aligned_in_smp;
238 
239 struct ublk_queue {
240 	int q_id;
241 	int q_depth;
242 
243 	unsigned long flags;
244 	struct ublksrv_io_desc *io_cmd_buf;
245 
246 	bool force_abort;
247 	bool canceling;
248 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
249 	spinlock_t		cancel_lock;
250 	struct ublk_device *dev;
251 	u32 nr_io_ready;
252 
253 	/*
254 	 * For supporting UBLK_F_BATCH_IO only.
255 	 *
256 	 * Inflight ublk request tags are saved in this fifo.
257 	 *
258 	 * There are multiple writers from ublk_queue_rq() or ublk_queue_rqs(),
259 	 * so the lock is required for storing request tags into the fifo.
260 	 *
261 	 * Only one reader fetches requests from the task work function to the
262 	 * ublk server, so there is no need to grab the lock on the reader
263 	 * side (an illustrative writer/reader sketch follows this struct).
264 	 *
265 	 * Batch I/O State Management:
266 	 *
267 	 * The batch I/O system uses implicit state management based on the
268 	 * combination of three key variables below.
269 	 *
270 	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
271 	 *   No fetch commands available; events queue up in evts_fifo
272 	 *
273 	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
274 	 *   Fetch commands available but none processing events
275 	 *
276 	 * - ACTIVE: active_fcmd
277 	 *   One fetch command actively processing events from evts_fifo
278 	 *
279 	 * Key Invariants:
280 	 * - At most one active_fcmd at any time (single reader)
281 	 * - active_fcmd is always from fcmd_head list when non-NULL
282 	 * - evts_fifo can be read locklessly by the single active reader
283 	 * - All state transitions require evts_lock protection
284 	 * - Multiple writers to evts_fifo require lock protection
285 	 */
286 	struct {
287 		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
288 		spinlock_t evts_lock;
289 
290 		/* List of fetch commands available to process events */
291 		struct list_head fcmd_head;
292 
293 		/* Currently active fetch command (NULL = none active) */
294 		struct ublk_batch_fetch_cmd  *active_fcmd;
295 	} ____cacheline_aligned_in_smp;
296 
297 	struct ublk_io ios[] __counted_by(q_depth);
298 };
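
/*
 * Illustrative sketch (editorial, not part of the driver): how the locking
 * rules documented in the struct above translate into a writer/reader pair.
 * The example_*() names are hypothetical.
 */
static void example_batch_queue_tag(struct ublk_queue *q, unsigned short tag)
{
	/* multiple writers (ublk_queue_rq()/ublk_queue_rqs()): lock required */
	spin_lock(&q->evts_lock);
	kfifo_put(&q->evts_fifo, tag);
	spin_unlock(&q->evts_lock);
}

static bool example_batch_pop_tag(struct ublk_queue *q, unsigned short *tag)
{
	/* single reader (the active fetch command): lockless read is safe */
	return kfifo_get(&q->evts_fifo, tag);
}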
299 
300 /* Maple tree value: maps a PFN range to buffer location */
301 struct ublk_buf_range {
302 	unsigned short buf_index;
303 	unsigned short flags;
304 	unsigned int base_offset;	/* byte offset within buffer */
305 };
306 
307 struct ublk_device {
308 	struct gendisk		*ub_disk;
309 
310 	struct ublksrv_ctrl_dev_info	dev_info;
311 
312 	struct blk_mq_tag_set	tag_set;
313 
314 	struct cdev		cdev;
315 	struct device		cdev_dev;
316 
317 #define UB_STATE_OPEN		0
318 #define UB_STATE_USED		1
319 #define UB_STATE_DELETED	2
320 	unsigned long		state;
321 	int			ub_number;
322 
323 	struct mutex		mutex;
324 
325 	spinlock_t		lock;
326 	struct mm_struct	*mm;
327 
328 	struct ublk_params	params;
329 
330 	struct completion	completion;
331 	u32			nr_queue_ready;
332 	bool 			unprivileged_daemons;
333 	struct mutex cancel_mutex;
334 	bool canceling;
335 	pid_t 	ublksrv_tgid;
336 	struct delayed_work	exit_work;
337 	struct work_struct	partition_scan_work;
338 
339 	bool			block_open; /* protected by open_mutex */
340 
341 	/* shared memory zero copy */
342 	struct maple_tree	buf_tree;
343 	struct ida		buf_ida;
344 
345 	struct ublk_queue       *queues[];
346 };
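
/*
 * Illustrative sketch (editorial, not part of the driver): how a PFN range
 * could be recorded in and looked up from ub->buf_tree with struct
 * ublk_buf_range as the stored value, assuming the generic maple tree API
 * (mtree_store_range()/mtree_load()).  The example_*() names are
 * hypothetical.
 */
static int example_record_buf_range(struct ublk_device *ub,
				    unsigned long first_pfn,
				    unsigned long last_pfn,
				    struct ublk_buf_range *range)
{
	/* map [first_pfn, last_pfn] to the buffer described by 'range' */
	return mtree_store_range(&ub->buf_tree, first_pfn, last_pfn, range,
				 GFP_KERNEL);
}

static struct ublk_buf_range *example_lookup_buf_range(struct ublk_device *ub,
							unsigned long pfn)
{
	/* any PFN inside a stored range returns that range's descriptor */
	return mtree_load(&ub->buf_tree, pfn);
}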
347 
348 /* header of ublk_params */
349 struct ublk_params_header {
350 	__u32	len;
351 	__u32	types;
352 };
353 
354 static void ublk_io_release(void *priv);
355 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
356 static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
357 				  u32 *buf_idx, u32 *buf_off);
358 static void ublk_buf_cleanup(struct ublk_device *ub);
359 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
360 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
361 		u16 q_id, u16 tag, struct ublk_io *io);
362 static inline unsigned int ublk_req_build_flags(struct request *req);
363 static void ublk_batch_dispatch(struct ublk_queue *ubq,
364 				const struct ublk_batch_io_data *data,
365 				struct ublk_batch_fetch_cmd *fcmd);
366 
367 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
368 {
369 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
370 }
371 
372 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
373 {
374 	return ubq->flags & UBLK_F_BATCH_IO;
375 }
376 
377 static inline void ublk_io_lock(struct ublk_io *io)
378 {
379 	spin_lock(&io->lock);
380 }
381 
382 static inline void ublk_io_unlock(struct ublk_io *io)
383 {
384 	spin_unlock(&io->lock);
385 }
386 
387 /* Initialize the event queue */
388 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
389 				    int numa_node)
390 {
391 	spin_lock_init(&q->evts_lock);
392 	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
393 }
394 
395 /* Check if event queue is empty */
396 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
397 {
398 	return kfifo_is_empty(&q->evts_fifo);
399 }
400 
401 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
402 {
403 	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
404 	kfifo_free(&q->evts_fifo);
405 }
406 
407 static inline struct ublksrv_io_desc *
408 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
409 {
410 	return &ubq->io_cmd_buf[tag];
411 }
412 
413 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
414 {
415 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
416 }
417 
418 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
419 {
420 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
421 }
422 
423 static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
424 {
425 	return ubq->flags & UBLK_F_SHMEM_ZC;
426 }
427 
428 static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
429 					unsigned int tag)
430 {
431 	return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
432 }
433 
434 static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
435 {
436 	return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
437 }
438 
439 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
440 {
441 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
442 }
443 
444 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
445 {
446 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
447 }
448 
449 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
450 {
451 	return ubq->flags & UBLK_F_USER_COPY;
452 }
453 
454 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
455 {
456 	return ub->dev_info.flags & UBLK_F_USER_COPY;
457 }
458 
459 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
460 {
461 	return ub->dev_info.flags & UBLK_F_ZONED;
462 }
463 
464 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
465 {
466 	return ubq->flags & UBLK_F_ZONED;
467 }
468 
469 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
470 {
471 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
472 }
473 
474 #ifdef CONFIG_BLK_DEV_ZONED
475 
476 struct ublk_zoned_report_desc {
477 	__u64 sector;
478 	__u32 operation;
479 	__u32 nr_zones;
480 };
481 
482 static DEFINE_XARRAY(ublk_zoned_report_descs);
483 
484 static int ublk_zoned_insert_report_desc(const struct request *req,
485 		struct ublk_zoned_report_desc *desc)
486 {
487 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
488 			    desc, GFP_KERNEL);
489 }
490 
491 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
492 		const struct request *req)
493 {
494 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
495 }
496 
497 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
498 		const struct request *req)
499 {
500 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
501 }
502 
503 static int ublk_get_nr_zones(const struct ublk_device *ub)
504 {
505 	const struct ublk_param_basic *p = &ub->params.basic;
506 
507 	/* Zone size is a power of 2 */
508 	return p->dev_sectors >> ilog2(p->chunk_sectors);
509 }
510 
511 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
512 {
513 	return blk_revalidate_disk_zones(ub->ub_disk);
514 }
515 
516 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
517 {
518 	const struct ublk_param_zoned *p = &ub->params.zoned;
519 	int nr_zones;
520 
521 	if (!ublk_dev_is_zoned(ub))
522 		return -EINVAL;
523 
524 	if (!p->max_zone_append_sectors)
525 		return -EINVAL;
526 
527 	nr_zones = ublk_get_nr_zones(ub);
528 
529 	if (p->max_active_zones > nr_zones)
530 		return -EINVAL;
531 
532 	if (p->max_open_zones > nr_zones)
533 		return -EINVAL;
534 
535 	return 0;
536 }
537 
538 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
539 {
540 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
541 }
542 
543 /* Based on virtblk_alloc_report_buffer */
544 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
545 				      unsigned int nr_zones, size_t *buflen)
546 {
547 	struct request_queue *q = ublk->ub_disk->queue;
548 	size_t bufsize;
549 	void *buf;
550 
551 	nr_zones = min_t(unsigned int, nr_zones,
552 			 ublk->ub_disk->nr_zones);
553 
554 	bufsize = nr_zones * sizeof(struct blk_zone);
555 	bufsize =
556 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
557 
558 	while (bufsize >= sizeof(struct blk_zone)) {
559 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
560 		if (buf) {
561 			*buflen = bufsize;
562 			return buf;
563 		}
564 		bufsize >>= 1;
565 	}
566 
567 	*buflen = 0;
568 	return NULL;
569 }
570 
571 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
572 		      unsigned int nr_zones, struct blk_report_zones_args *args)
573 {
574 	struct ublk_device *ub = disk->private_data;
575 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
576 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
577 	unsigned int done_zones = 0;
578 	unsigned int max_zones_per_request;
579 	int ret;
580 	struct blk_zone *buffer;
581 	size_t buffer_length;
582 
583 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
584 			 nr_zones);
585 
586 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
587 	if (!buffer)
588 		return -ENOMEM;
589 
590 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
591 
592 	while (done_zones < nr_zones) {
593 		unsigned int remaining_zones = nr_zones - done_zones;
594 		unsigned int zones_in_request =
595 			min_t(unsigned int, remaining_zones, max_zones_per_request);
596 		struct request *req;
597 		struct ublk_zoned_report_desc desc;
598 		blk_status_t status;
599 
600 		memset(buffer, 0, buffer_length);
601 
602 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
603 		if (IS_ERR(req)) {
604 			ret = PTR_ERR(req);
605 			goto out;
606 		}
607 
608 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
609 		desc.sector = sector;
610 		desc.nr_zones = zones_in_request;
611 		ret = ublk_zoned_insert_report_desc(req, &desc);
612 		if (ret)
613 			goto free_req;
614 
615 		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
616 		if (ret)
617 			goto erase_desc;
618 
619 		status = blk_execute_rq(req, 0);
620 		ret = blk_status_to_errno(status);
621 erase_desc:
622 		ublk_zoned_erase_report_desc(req);
623 free_req:
624 		blk_mq_free_request(req);
625 		if (ret)
626 			goto out;
627 
628 		for (unsigned int i = 0; i < zones_in_request; i++) {
629 			struct blk_zone *zone = buffer + i;
630 
631 			/* A zero length zone means no more zones in this response */
632 			if (!zone->len)
633 				break;
634 
635 			ret = disk_report_zone(disk, zone, i, args);
636 			if (ret)
637 				goto out;
638 
639 			done_zones++;
640 			sector += zone_size_sectors;
641 
642 		}
643 	}
644 
645 	ret = done_zones;
646 
647 out:
648 	kvfree(buffer);
649 	return ret;
650 }
651 
652 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
653 					 struct request *req)
654 {
655 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
656 	struct ublk_io *io = &ubq->ios[req->tag];
657 	struct ublk_zoned_report_desc *desc;
658 	u32 ublk_op;
659 
660 	switch (req_op(req)) {
661 	case REQ_OP_ZONE_OPEN:
662 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
663 		break;
664 	case REQ_OP_ZONE_CLOSE:
665 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
666 		break;
667 	case REQ_OP_ZONE_FINISH:
668 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
669 		break;
670 	case REQ_OP_ZONE_RESET:
671 		ublk_op = UBLK_IO_OP_ZONE_RESET;
672 		break;
673 	case REQ_OP_ZONE_APPEND:
674 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
675 		break;
676 	case REQ_OP_ZONE_RESET_ALL:
677 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
678 		break;
679 	case REQ_OP_DRV_IN:
680 		desc = ublk_zoned_get_report_desc(req);
681 		if (!desc)
682 			return BLK_STS_IOERR;
683 		ublk_op = desc->operation;
684 		switch (ublk_op) {
685 		case UBLK_IO_OP_REPORT_ZONES:
686 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
687 			iod->nr_zones = desc->nr_zones;
688 			iod->start_sector = desc->sector;
689 			return BLK_STS_OK;
690 		default:
691 			return BLK_STS_IOERR;
692 		}
693 	case REQ_OP_DRV_OUT:
694 		/* We do not support drv_out */
695 		return BLK_STS_NOTSUPP;
696 	default:
697 		return BLK_STS_IOERR;
698 	}
699 
700 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
701 	iod->nr_sectors = blk_rq_sectors(req);
702 	iod->start_sector = blk_rq_pos(req);
703 	iod->addr = io->buf.addr;
704 
705 	return BLK_STS_OK;
706 }
707 
708 #else
709 
710 #define ublk_report_zones (NULL)
711 
712 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
713 {
714 	return -EOPNOTSUPP;
715 }
716 
717 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
718 {
719 }
720 
721 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
722 {
723 	return 0;
724 }
725 
726 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
727 					 struct request *req)
728 {
729 	return BLK_STS_NOTSUPP;
730 }
731 
732 #endif
733 
734 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
735 				      bool need_map, struct io_comp_batch *iob);
736 
737 static dev_t ublk_chr_devt;
738 static const struct class ublk_chr_class = {
739 	.name = "ublk-char",
740 };
741 
742 static DEFINE_IDR(ublk_index_idr);
743 static DEFINE_SPINLOCK(ublk_idr_lock);
744 static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
745 
746 static DEFINE_MUTEX(ublk_ctl_mutex);
747 
748 static struct ublk_batch_fetch_cmd *
749 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
750 {
751 	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
752 
753 	if (fcmd) {
754 		fcmd->cmd = cmd;
755 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
756 	}
757 	return fcmd;
758 }
759 
760 static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
761 {
762 	kfree(fcmd);
763 }
764 
765 static void __ublk_release_fcmd(struct ublk_queue *ubq)
766 {
767 	WRITE_ONCE(ubq->active_fcmd, NULL);
768 }
769 
770 /*
771  * Nothing can move on, so clear ->active_fcmd, and the caller should stop
772  * dispatching
773  */
774 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
775 					const struct ublk_batch_io_data *data,
776 					struct ublk_batch_fetch_cmd *fcmd,
777 					int res)
778 {
779 	spin_lock(&ubq->evts_lock);
780 	list_del_init(&fcmd->node);
781 	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
782 	__ublk_release_fcmd(ubq);
783 	spin_unlock(&ubq->evts_lock);
784 
785 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
786 	ublk_batch_free_fcmd(fcmd);
787 }
788 
789 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
790 				     struct io_br_sel *sel,
791 				     unsigned int issue_flags)
792 {
793 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
794 		return -ENOBUFS;
795 	return 0;
796 }
797 
798 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
799 				       void __user *buf, const u16 *tag_buf,
800 				       unsigned int len)
801 {
802 	if (copy_to_user(buf, tag_buf, len))
803 		return -EFAULT;
804 	return len;
805 }
806 
807 #define UBLK_MAX_UBLKS UBLK_MINORS
808 
809 /*
810  * Max number of unprivileged ublk devices allowed to be added
811  *
812  * It can be extended to a per-user limit in the future, or even controlled
813  * by cgroup.
814  */
815 static unsigned int unprivileged_ublks_max = 64;
816 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
817 
818 static struct miscdevice ublk_misc;
819 
820 static inline unsigned ublk_pos_to_hwq(loff_t pos)
821 {
822 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
823 		UBLK_QID_BITS_MASK;
824 }
825 
826 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
827 {
828 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
829 }
830 
831 static inline unsigned ublk_pos_to_tag(loff_t pos)
832 {
833 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
834 		UBLK_TAG_BITS_MASK;
835 }
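
/*
 * Illustrative sketch (editorial, not part of the driver): the inverse of
 * the three decode helpers above, i.e. how a ublk server would build the
 * pread()/pwrite() offset for user copy from (q_id, tag, in-request offset),
 * assuming the UBLKSRV_IO_BUF_OFFSET/UBLK_QID_OFF/UBLK_TAG_OFF definitions
 * used above.  example_ublk_user_copy_pos() is a hypothetical name.
 */
static inline loff_t example_ublk_user_copy_pos(unsigned q_id, unsigned tag,
						unsigned buf_off)
{
	return UBLKSRV_IO_BUF_OFFSET +
		((loff_t)q_id << UBLK_QID_OFF) +
		((loff_t)tag << UBLK_TAG_OFF) +
		buf_off;
}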
836 
837 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
838 {
839 	const struct ublk_param_basic *p = &ub->params.basic;
840 
841 	if (p->attrs & UBLK_ATTR_READ_ONLY)
842 		set_disk_ro(ub->ub_disk, true);
843 
844 	set_capacity(ub->ub_disk, p->dev_sectors);
845 }
846 
847 static int ublk_integrity_flags(u32 flags)
848 {
849 	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
850 
851 	if (flags & LBMD_PI_CAP_INTEGRITY) {
852 		flags &= ~LBMD_PI_CAP_INTEGRITY;
853 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
854 	}
855 	if (flags & LBMD_PI_CAP_REFTAG) {
856 		flags &= ~LBMD_PI_CAP_REFTAG;
857 		ret_flags |= BLK_INTEGRITY_REF_TAG;
858 	}
859 	return flags ? -EINVAL : ret_flags;
860 }
861 
862 static int ublk_integrity_pi_tuple_size(u8 csum_type)
863 {
864 	switch (csum_type) {
865 	case LBMD_PI_CSUM_NONE:
866 		return 0;
867 	case LBMD_PI_CSUM_IP:
868 	case LBMD_PI_CSUM_CRC16_T10DIF:
869 		return 8;
870 	case LBMD_PI_CSUM_CRC64_NVME:
871 		return 16;
872 	default:
873 		return -EINVAL;
874 	}
875 }
876 
877 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
878 {
879 	switch (csum_type) {
880 	case LBMD_PI_CSUM_NONE:
881 		return BLK_INTEGRITY_CSUM_NONE;
882 	case LBMD_PI_CSUM_IP:
883 		return BLK_INTEGRITY_CSUM_IP;
884 	case LBMD_PI_CSUM_CRC16_T10DIF:
885 		return BLK_INTEGRITY_CSUM_CRC;
886 	case LBMD_PI_CSUM_CRC64_NVME:
887 		return BLK_INTEGRITY_CSUM_CRC64;
888 	default:
889 		WARN_ON_ONCE(1);
890 		return BLK_INTEGRITY_CSUM_NONE;
891 	}
892 }
893 
894 static int ublk_validate_params(const struct ublk_device *ub)
895 {
896 	/* basic param is the only one which must be set */
897 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
898 		const struct ublk_param_basic *p = &ub->params.basic;
899 
900 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
901 			return -EINVAL;
902 
903 		if (p->logical_bs_shift > p->physical_bs_shift)
904 			return -EINVAL;
905 
906 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
907 			return -EINVAL;
908 
909 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
910 			return -EINVAL;
911 	} else
912 		return -EINVAL;
913 
914 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
915 		const struct ublk_param_discard *p = &ub->params.discard;
916 
917 		/* So far, only support single segment discard */
918 		if (p->max_discard_sectors && p->max_discard_segments != 1)
919 			return -EINVAL;
920 
921 		if (!p->discard_granularity)
922 			return -EINVAL;
923 	}
924 
925 	/* dev_t is read-only */
926 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
927 		return -EINVAL;
928 
929 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
930 		return ublk_dev_param_zoned_validate(ub);
931 	else if (ublk_dev_is_zoned(ub))
932 		return -EINVAL;
933 
934 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
935 		const struct ublk_param_dma_align *p = &ub->params.dma;
936 
937 		if (p->alignment >= PAGE_SIZE)
938 			return -EINVAL;
939 
940 		if (!is_power_of_2(p->alignment + 1))
941 			return -EINVAL;
942 	}
943 
944 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
945 		const struct ublk_param_segment *p = &ub->params.seg;
946 
947 		if (!is_power_of_2(p->seg_boundary_mask + 1))
948 			return -EINVAL;
949 
950 		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
951 			return -EINVAL;
952 		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
953 			return -EINVAL;
954 	}
955 
956 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
957 		const struct ublk_param_integrity *p = &ub->params.integrity;
958 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
959 		int flags = ublk_integrity_flags(p->flags);
960 
961 		if (!ublk_dev_support_integrity(ub))
962 			return -EINVAL;
963 		if (flags < 0)
964 			return flags;
965 		if (pi_tuple_size < 0)
966 			return pi_tuple_size;
967 		if (!p->metadata_size)
968 			return -EINVAL;
969 		if (p->csum_type == LBMD_PI_CSUM_NONE &&
970 		    p->flags & LBMD_PI_CAP_REFTAG)
971 			return -EINVAL;
972 		if (p->pi_offset + pi_tuple_size > p->metadata_size)
973 			return -EINVAL;
974 		if (p->interval_exp < SECTOR_SHIFT ||
975 		    p->interval_exp > ub->params.basic.logical_bs_shift)
976 			return -EINVAL;
977 	}
978 
979 	return 0;
980 }
981 
982 static void ublk_apply_params(struct ublk_device *ub)
983 {
984 	ublk_dev_param_basic_apply(ub);
985 
986 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
987 		ublk_dev_param_zoned_apply(ub);
988 }
989 
990 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
991 {
992 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
993 		!ublk_support_auto_buf_reg(ubq);
994 }
995 
996 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
997 {
998 	return !ublk_dev_support_user_copy(ub) &&
999 	       !ublk_dev_support_zero_copy(ub) &&
1000 	       !ublk_dev_support_auto_buf_reg(ub);
1001 }
1002 
1003 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
1004 {
1005 	/*
1006 	 * read()/write() is involved in user copy, so a request reference
1007 	 * has to be grabbed
1008 	 *
1009 	 * For zero copy, the request buffer needs to be registered to the
1010 	 * io_uring buffer table, so a reference is needed
1011 	 *
1012 	 * For auto buffer register, the ublk server may still issue
1013 	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
1014 	 * so a reference is required too.
1015 	 */
1016 	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
1017 		ublk_support_auto_buf_reg(ubq);
1018 }
1019 
1020 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
1021 {
1022 	return ublk_dev_support_user_copy(ub) ||
1023 	       ublk_dev_support_zero_copy(ub) ||
1024 	       ublk_dev_support_auto_buf_reg(ub);
1025 }
1026 
1027 /*
1028  * ublk IO Reference Counting Design
1029  * ==================================
1030  *
1031  * For user-copy and zero-copy modes, ublk uses a split reference model with
1032  * two counters that together track IO lifetime:
1033  *
1034  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
1035  *   - io->task_registered_buffers: count of buffers registered on the IO task
1036  *
1037  * Key Invariant:
1038  * --------------
1039  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1040  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1041  * when no active references exist. After IO completion, both counters become
1042  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1043  * task_registered_buffers are 0.
1044  *
1045  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1046  * exit to determine if all references have been released.
1047  *
1048  * Why Split Counters:
1049  * -------------------
1050  * Buffers registered on the IO daemon task can use the lightweight
1051  * task_registered_buffers counter (simple increment/decrement) instead of
1052  * atomic refcount operations. The ublk_io_release() callback checks if
1053  * current == io->task to decide which counter to update.
1054  *
1055  * This optimization only applies before IO completion. At completion,
1056  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1057  * After that, all subsequent buffer unregistrations must use the atomic ref
1058  * since they may be releasing the last reference.
1059  *
1060  * Reference Lifecycle:
1061  * --------------------
1062  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1063  *
1064  * 2. During IO processing:
1065  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1066  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1067  *    - Buffer unregister callback (ublk_io_release):
1068  *      * If on-task: task_registered_buffers--
1069  *      * If off-task: ref-- via ublk_put_req_ref()
1070  *
1071  * 3. ublk_sub_req_ref() at IO completion:
1072  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1073  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1074  *    - This effectively collapses task_registered_buffers into the atomic ref,
1075  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1076  *      buffers that were already counted
1077  *
1078  * Example (zero-copy, register on-task, unregister off-task):
1079  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1080  *   - Register buffer on-task: task_registered_buffers = 1
1081  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1082  *   - Completion via ublk_sub_req_ref():
1083  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1084  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1085  *
1086  * Example (auto buffer registration):
1087  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1088  *
1089  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1090  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1091  *   - Completion via ublk_sub_req_ref():
1092  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1093  *
1094  * Example (zero-copy, ublk server killed):
1095  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1096  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1097  *
1098  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1099  *   - Register buffer on-task: task_registered_buffers = 1
1100  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1101  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1102  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1103  *   - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by
1104  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1105  *     and abort pending requests
1106  *
1107  * Batch IO Special Case:
1108  * ----------------------
1109  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1110  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1111  * task_registered_buffers counter still tracks registered buffers for the
1112  * invariant check, even though the callback doesn't decrement it.
1113  *
1114  * Note: updating task_registered_buffers is protected by io->lock.
1115  */
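
/*
 * Illustrative sketch (editorial, not part of the driver): the quiescence
 * condition that the design notes above describe for daemon exit, i.e. what
 * ublk_check_and_reset_active_ref() is said to wait for.
 * example_io_refs_quiesced() is a hypothetical helper.
 */
static inline bool example_io_refs_quiesced(const struct ublk_io *io)
{
	unsigned int sum = refcount_read(&io->ref) +
			   io->task_registered_buffers;

	/*
	 * Dispatched I/O with no outstanding buffer/user-copy references
	 * sums to UBLK_REFCOUNT_INIT; undispatched I/O sums to 0.
	 */
	return sum == UBLK_REFCOUNT_INIT || sum == 0;
}
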
1116 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1117 		struct ublk_io *io)
1118 {
1119 	if (ublk_need_req_ref(ubq))
1120 		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1121 }
1122 
1123 static inline bool ublk_get_req_ref(struct ublk_io *io)
1124 {
1125 	return refcount_inc_not_zero(&io->ref);
1126 }
1127 
1128 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1129 {
1130 	if (!refcount_dec_and_test(&io->ref))
1131 		return;
1132 
1133 	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1134 	__ublk_complete_rq(req, io, false, NULL);
1135 }
1136 
1137 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1138 {
1139 	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1140 
1141 	io->task_registered_buffers = 0;
1142 	return refcount_sub_and_test(sub_refs, &io->ref);
1143 }
1144 
1145 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1146 {
1147 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1148 }
1149 
1150 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1151 {
1152 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1153 }
1154 
1155 /* Called in slow path only, keep it noinline for tracing purposes */
1156 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1157 {
1158 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1159 		return ub;
1160 	return NULL;
1161 }
1162 
1163 /* Called in slow path only, keep it noinline for tracing purposes */
1164 static noinline void ublk_put_device(struct ublk_device *ub)
1165 {
1166 	put_device(&ub->cdev_dev);
1167 }
1168 
1169 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1170 		int qid)
1171 {
1172 	return dev->queues[qid];
1173 }
1174 
1175 static inline bool ublk_rq_has_data(const struct request *rq)
1176 {
1177 	return bio_has_data(rq->bio);
1178 }
1179 
1180 static inline struct ublksrv_io_desc *
1181 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1182 {
1183 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
1184 }
1185 
1186 static inline int __ublk_queue_cmd_buf_size(int depth)
1187 {
1188 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1189 }
1190 
1191 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1192 {
1193 	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1194 }
1195 
1196 static int ublk_max_cmd_buf_size(void)
1197 {
1198 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1199 }
1200 
1201 /*
1202  * Should I/O outstanding to the ublk server be reissued when the server exits?
1203  * If not, outstanding I/O will get errors.
1204  */
1205 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1206 {
1207 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1208 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1209 }
1210 
1211 /*
1212  * Should I/O issued while there is no ublk server be queued? If not, I/O
1213  * issued while there is no ublk server will get errors.
1214  */
1215 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1216 {
1217 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1218 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1219 }
1220 
1221 /*
1222  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1223  * of the device flags for smaller cache footprint - better for fast
1224  * paths.
1225  */
1226 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1227 {
1228 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1229 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1230 }
1231 
1232 /*
1233  * Should ublk devices be stopped (i.e. no recovery possible) when the
1234  * ublk server exits? If not, devices can be used again by a future
1235  * incarnation of a ublk server via the start_recovery/end_recovery
1236  * commands.
1237  */
1238 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1239 {
1240 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1241 }
1242 
1243 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1244 {
1245 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1246 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1247 }
1248 
1249 static void ublk_free_disk(struct gendisk *disk)
1250 {
1251 	struct ublk_device *ub = disk->private_data;
1252 
1253 	clear_bit(UB_STATE_USED, &ub->state);
1254 	ublk_put_device(ub);
1255 }
1256 
1257 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1258 		unsigned int *owner_gid)
1259 {
1260 	kuid_t uid;
1261 	kgid_t gid;
1262 
1263 	current_uid_gid(&uid, &gid);
1264 
1265 	*owner_uid = from_kuid(&init_user_ns, uid);
1266 	*owner_gid = from_kgid(&init_user_ns, gid);
1267 }
1268 
1269 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1270 {
1271 	struct ublk_device *ub = disk->private_data;
1272 
1273 	if (capable(CAP_SYS_ADMIN))
1274 		return 0;
1275 
1276 	/*
1277 	 * If it is an unprivileged device, only the owner can open
1278 	 * the disk. Otherwise it could be a trap set by a malicious
1279 	 * user who deliberately grants this disk's privileges to
1280 	 * other users.
1281 	 *
1282 	 * This is also reasonable given that anyone can create an
1283 	 * unprivileged device without needing another user's grant.
1284 	 */
1285 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1286 		unsigned int curr_uid, curr_gid;
1287 
1288 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1289 
1290 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1291 				ub->dev_info.owner_gid)
1292 			return -EPERM;
1293 	}
1294 
1295 	if (ub->block_open)
1296 		return -ENXIO;
1297 
1298 	return 0;
1299 }
1300 
1301 static const struct block_device_operations ub_fops = {
1302 	.owner =	THIS_MODULE,
1303 	.open =		ublk_open,
1304 	.free_disk =	ublk_free_disk,
1305 	.report_zones =	ublk_report_zones,
1306 };
1307 
1308 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1309 				struct iov_iter *uiter, int dir, size_t *done)
1310 {
1311 	unsigned len;
1312 	void *bv_buf;
1313 	size_t copied;
1314 
1315 	if (*offset >= bv->bv_len) {
1316 		*offset -= bv->bv_len;
1317 		return true;
1318 	}
1319 
1320 	len = bv->bv_len - *offset;
1321 	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1322 	/*
1323 	 * Bio pages may originate from slab caches without a usercopy region
1324 	 * (e.g. jbd2 frozen metadata buffers).  This is the same data that
1325 	 * the loop driver writes to its backing file — no exposure risk.
1326 	 * The bvec length is always trusted, so the size check in
1327 	 * check_copy_size() is not needed either.  Use the unchecked
1328 	 * helpers to avoid false positives on slab pages.
1329 	 */
1330 	if (dir == ITER_DEST)
1331 		copied = _copy_to_iter(bv_buf, len, uiter);
1332 	else
1333 		copied = _copy_from_iter(bv_buf, len, uiter);
1334 
1335 	kunmap_local(bv_buf);
1336 
1337 	*done += copied;
1338 	if (copied < len)
1339 		return false;
1340 
1341 	*offset = 0;
1342 	return true;
1343 }
1344 
1345 /*
1346  * Copy data between request pages and the iov_iter; 'offset' is the
1347  * starting linear offset within the request.
1348  */
1349 static size_t ublk_copy_user_pages(const struct request *req,
1350 		unsigned offset, struct iov_iter *uiter, int dir)
1351 {
1352 	struct req_iterator iter;
1353 	struct bio_vec bv;
1354 	size_t done = 0;
1355 
1356 	rq_for_each_segment(bv, req, iter) {
1357 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1358 			break;
1359 	}
1360 	return done;
1361 }
1362 
1363 #ifdef CONFIG_BLK_DEV_INTEGRITY
1364 static size_t ublk_copy_user_integrity(const struct request *req,
1365 		unsigned offset, struct iov_iter *uiter, int dir)
1366 {
1367 	size_t done = 0;
1368 	struct bio *bio = req->bio;
1369 	struct bvec_iter iter;
1370 	struct bio_vec iv;
1371 
1372 	if (!blk_integrity_rq(req))
1373 		return 0;
1374 
1375 	bio_for_each_integrity_vec(iv, bio, iter) {
1376 		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1377 			break;
1378 	}
1379 
1380 	return done;
1381 }
1382 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1383 static size_t ublk_copy_user_integrity(const struct request *req,
1384 		unsigned offset, struct iov_iter *uiter, int dir)
1385 {
1386 	return 0;
1387 }
1388 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1389 
1390 static inline bool ublk_need_map_req(const struct request *req)
1391 {
1392 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1393 }
1394 
1395 static inline bool ublk_need_unmap_req(const struct request *req)
1396 {
1397 	return ublk_rq_has_data(req) &&
1398 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1399 }
1400 
1401 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1402 				const struct request *req,
1403 				const struct ublk_io *io)
1404 {
1405 	const unsigned int rq_bytes = blk_rq_bytes(req);
1406 
1407 	if (!ublk_need_map_io(ubq))
1408 		return rq_bytes;
1409 
1410 	/*
1411 	 * No zero copy: we delay copying WRITE request data into the ublksrv
1412 	 * context, and the big benefit is that pinning pages in the current
1413 	 * context is pretty fast, see ublk_pin_user_pages
1414 	 */
1415 	if (ublk_need_map_req(req)) {
1416 		struct iov_iter iter;
1417 		const int dir = ITER_DEST;
1418 
1419 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1420 		return ublk_copy_user_pages(req, 0, &iter, dir);
1421 	}
1422 	return rq_bytes;
1423 }
1424 
1425 static unsigned int ublk_unmap_io(bool need_map,
1426 		const struct request *req,
1427 		const struct ublk_io *io)
1428 {
1429 	const unsigned int rq_bytes = blk_rq_bytes(req);
1430 
1431 	if (!need_map)
1432 		return rq_bytes;
1433 
1434 	if (ublk_need_unmap_req(req)) {
1435 		struct iov_iter iter;
1436 		const int dir = ITER_SOURCE;
1437 
1438 		WARN_ON_ONCE(io->res > rq_bytes);
1439 
1440 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1441 		return ublk_copy_user_pages(req, 0, &iter, dir);
1442 	}
1443 	return rq_bytes;
1444 }
1445 
1446 static inline unsigned int ublk_req_build_flags(struct request *req)
1447 {
1448 	unsigned flags = 0;
1449 
1450 	if (req->cmd_flags & REQ_FAILFAST_DEV)
1451 		flags |= UBLK_IO_F_FAILFAST_DEV;
1452 
1453 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1454 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1455 
1456 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1457 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
1458 
1459 	if (req->cmd_flags & REQ_META)
1460 		flags |= UBLK_IO_F_META;
1461 
1462 	if (req->cmd_flags & REQ_FUA)
1463 		flags |= UBLK_IO_F_FUA;
1464 
1465 	if (req->cmd_flags & REQ_NOUNMAP)
1466 		flags |= UBLK_IO_F_NOUNMAP;
1467 
1468 	if (req->cmd_flags & REQ_SWAP)
1469 		flags |= UBLK_IO_F_SWAP;
1470 
1471 	if (blk_integrity_rq(req))
1472 		flags |= UBLK_IO_F_INTEGRITY;
1473 
1474 	return flags;
1475 }
1476 
1477 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1478 {
1479 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1480 	struct ublk_io *io = &ubq->ios[req->tag];
1481 	u32 ublk_op;
1482 
1483 	switch (req_op(req)) {
1484 	case REQ_OP_READ:
1485 		ublk_op = UBLK_IO_OP_READ;
1486 		break;
1487 	case REQ_OP_WRITE:
1488 		ublk_op = UBLK_IO_OP_WRITE;
1489 		break;
1490 	case REQ_OP_FLUSH:
1491 		ublk_op = UBLK_IO_OP_FLUSH;
1492 		break;
1493 	case REQ_OP_DISCARD:
1494 		ublk_op = UBLK_IO_OP_DISCARD;
1495 		break;
1496 	case REQ_OP_WRITE_ZEROES:
1497 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1498 		break;
1499 	default:
1500 		if (ublk_queue_is_zoned(ubq))
1501 			return ublk_setup_iod_zoned(ubq, req);
1502 		return BLK_STS_IOERR;
1503 	}
1504 
1505 	/* need to translate since in-kernel op/flag values may change */
1506 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
1507 	iod->nr_sectors = blk_rq_sectors(req);
1508 	iod->start_sector = blk_rq_pos(req);
1509 
1510 	/* Try shmem zero-copy match before setting addr */
1511 	if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
1512 		u32 buf_idx, buf_off;
1513 
1514 		if (ublk_try_buf_match(ubq->dev, req,
1515 					  &buf_idx, &buf_off)) {
1516 			iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
1517 			iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
1518 			return BLK_STS_OK;
1519 		}
1520 	}
1521 
1522 	iod->addr = io->buf.addr;
1523 
1524 	return BLK_STS_OK;
1525 }
1526 
1527 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1528 		struct io_uring_cmd *ioucmd)
1529 {
1530 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1531 }
1532 
1533 static void ublk_end_request(struct request *req, blk_status_t error)
1534 {
1535 	local_bh_disable();
1536 	blk_mq_end_request(req, error);
1537 	local_bh_enable();
1538 }
1539 
1540 /* todo: handle partial completion */
1541 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1542 				      bool need_map, struct io_comp_batch *iob)
1543 {
1544 	unsigned int unmapped_bytes;
1545 	blk_status_t res = BLK_STS_OK;
1546 	bool requeue;
1547 
1548 	/* fail the read IO if nothing was read */
1549 	if (!io->res && req_op(req) == REQ_OP_READ)
1550 		io->res = -EIO;
1551 
1552 	if (io->res < 0) {
1553 		res = errno_to_blk_status(io->res);
1554 		goto exit;
1555 	}
1556 
1557 	/*
1558 	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return a byte count, so
1559 	 * end them directly.
1560 	 *
1561 	 * None of them needs unmapping.
1562 	 */
1563 	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1564 	    req_op(req) != REQ_OP_DRV_IN)
1565 		goto exit;
1566 
1567 	/* shmem zero copy: no data to unmap, pages already shared */
1568 	if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
1569 		goto exit;
1570 
1571 	/* for READ request, writing data in iod->addr to rq buffers */
1572 	unmapped_bytes = ublk_unmap_io(need_map, req, io);
1573 
1574 	/*
1575 	 * Extremely unlikely since the data was filled in just before.
1576 	 *
1577 	 * Simply clamp io->res for this unlikely case.
1578 	 */
1579 	if (unlikely(unmapped_bytes < io->res))
1580 		io->res = unmapped_bytes;
1581 
1582 	/*
1583 	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1584 	 * happens off this path, then that will prevent ublk's blkdev_release()
1585 	 * from being called on current's task work, see fput() implementation.
1586 	 *
1587 	 * Otherwise, ublk server may not provide forward progress in case of
1588 	 * reading the partition table from bdev_open() with disk->open_mutex
1589 	 * held, and causes a deadlock as we could already be holding
1590 	 * disk->open_mutex here.
1591 	 *
1592 	 * Preferably we would not be doing IO with a mutex held that is also
1593 	 * used for release, but this work-around will suffice for now.
1594 	 */
1595 	local_bh_disable();
1596 	requeue = blk_update_request(req, BLK_STS_OK, io->res);
1597 	local_bh_enable();
1598 	if (requeue)
1599 		blk_mq_requeue_request(req, true);
1600 	else if (likely(!blk_should_fake_timeout(req->q))) {
1601 		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1602 			return;
1603 		__blk_mq_end_request(req, BLK_STS_OK);
1604 	}
1605 
1606 	return;
1607 exit:
1608 	ublk_end_request(req, res);
1609 }
1610 
1611 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1612 						     struct request *req)
1613 {
1614 	/* read cmd first because req will overwrite it */
1615 	struct io_uring_cmd *cmd = io->cmd;
1616 
1617 	/* mark this cmd owned by ublksrv */
1618 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1619 
1620 	/*
1621 	 * clear ACTIVE since we are done with this sqe/cmd slot.
1622 	 * We can only accept a new io cmd when not active.
1623 	 */
1624 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1625 
1626 	io->req = req;
1627 	return cmd;
1628 }
1629 
1630 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1631 				 int res, unsigned issue_flags)
1632 {
1633 	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1634 
1635 	/* tell ublksrv one io request is coming */
1636 	io_uring_cmd_done(cmd, res, issue_flags);
1637 }
1638 
1639 #define UBLK_REQUEUE_DELAY_MS	3
1640 
1641 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1642 		struct request *rq)
1643 {
1644 	/* We cannot process this rq: requeue it if I/O should be queued, else fail it */
1645 	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1646 		blk_mq_requeue_request(rq, false);
1647 	else
1648 		ublk_end_request(rq, BLK_STS_IOERR);
1649 }
1650 
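/*
 * Auto buffer registration failed and the server asked for a fallback:
 * flag the io descriptor so the server knows it has to register the
 * buffer itself before handling this request.
 */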
1651 static void
1652 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1653 {
1654 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1655 
1656 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1657 }
1658 
1659 enum auto_buf_reg_res {
1660 	AUTO_BUF_REG_FAIL,
1661 	AUTO_BUF_REG_FALLBACK,
1662 	AUTO_BUF_REG_OK,
1663 };
1664 
1665 /*
1666  * Setup io state after auto buffer registration.
1667  *
1668  * Must be called after ublk_auto_buf_register() is done.
1669  * Caller must hold io->lock in batch context.
1670  */
1671 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1672 				   struct request *req, struct ublk_io *io,
1673 				   struct io_uring_cmd *cmd,
1674 				   enum auto_buf_reg_res res)
1675 {
1676 	if (res == AUTO_BUF_REG_OK) {
1677 		io->task_registered_buffers = 1;
1678 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1679 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1680 	}
1681 	ublk_init_req_ref(ubq, io);
1682 	__ublk_prep_compl_io_cmd(io, req);
1683 }
1684 
1685 /* Register request bvec to io_uring for auto buffer registration. */
1686 static enum auto_buf_reg_res
1687 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1688 		       struct ublk_io *io, struct io_uring_cmd *cmd,
1689 		       unsigned int issue_flags)
1690 {
1691 	int ret;
1692 
1693 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1694 				      io->buf.auto_reg.index, issue_flags);
1695 	if (ret) {
1696 		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1697 			ublk_auto_buf_reg_fallback(ubq, req->tag);
1698 			return AUTO_BUF_REG_FALLBACK;
1699 		}
1700 		ublk_end_request(req, BLK_STS_IOERR);
1701 		return AUTO_BUF_REG_FAIL;
1702 	}
1703 
1704 	return AUTO_BUF_REG_OK;
1705 }
1706 
1707 /*
1708  * Dispatch IO to userspace with auto buffer registration.
1709  *
1710  * Only called in non-batch context from task work, io->lock not held.
1711  */
1712 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1713 				   struct request *req, struct ublk_io *io,
1714 				   struct io_uring_cmd *cmd,
1715 				   unsigned int issue_flags)
1716 {
1717 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1718 			issue_flags);
1719 
1720 	if (res != AUTO_BUF_REG_FAIL) {
1721 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1722 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1723 	}
1724 }
1725 
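/*
 * Prepare a request for handing off to the ublk server: where the device
 * needs mapping, copy the request's data into the daemon's buffer (shmem
 * zero copy skips this), and shrink iod->nr_sectors on a partial mapping.
 * Returns false if nothing could be mapped and the request was requeued.
 */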
1726 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1727 			  struct ublk_io *io)
1728 {
1729 	unsigned mapped_bytes;
1730 
1731 	/* shmem zero copy: skip data copy, pages already shared */
1732 	if (ublk_iod_is_shmem_zc(ubq, req->tag))
1733 		return true;
1734 
1735 	mapped_bytes = ublk_map_io(ubq, req, io);
1736 
1737 	/* partially mapped, update io descriptor */
1738 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1739 		/*
1740 		 * Nothing mapped, retry until we succeed.
1741 		 *
1742 		 * We may never succeed in mapping any bytes here because
1743 		 * of OOM. TODO: reserve one buffer with a single page pinned
1744 		 * to provide a forward-progress guarantee.
1745 		 */
1746 		if (unlikely(!mapped_bytes)) {
1747 			blk_mq_requeue_request(req, false);
1748 			blk_mq_delay_kick_requeue_list(req->q,
1749 					UBLK_REQUEUE_DELAY_MS);
1750 			return false;
1751 		}
1752 
1753 		ublk_get_iod(ubq, req->tag)->nr_sectors =
1754 			mapped_bytes >> 9;
1755 	}
1756 
1757 	return true;
1758 }
1759 
1760 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1761 {
1762 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1763 	int tag = req->tag;
1764 	struct ublk_io *io = &ubq->ios[tag];
1765 
1766 	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1767 			__func__, ubq->q_id, req->tag, io->flags,
1768 			ublk_get_iod(ubq, req->tag)->addr);
1769 
1770 	/*
1771 	 * Task is exiting if either:
1772 	 *
1773 	 * (1) current != io->task.
1774 	 * io_uring_cmd_complete_in_task() tries to run task_work
1775 	 * in a workqueue if cmd's task is PF_EXITING.
1776 	 *
1777 	 * (2) current->flags & PF_EXITING.
1778 	 */
1779 	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1780 		__ublk_abort_rq(ubq, req);
1781 		return;
1782 	}
1783 
1784 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1785 		/*
1786 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1787 		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1788 		 * and notify it.
1789 		 */
1790 		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1791 		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1792 				__func__, ubq->q_id, req->tag, io->flags);
1793 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1794 				     issue_flags);
1795 		return;
1796 	}
1797 
1798 	if (!ublk_start_io(ubq, req, io))
1799 		return;
1800 
1801 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1802 		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1803 	} else {
1804 		ublk_init_req_ref(ubq, io);
1805 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1806 	}
1807 }
1808 
1809 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1810 				       const struct ublk_batch_io_data *data,
1811 				       unsigned short tag)
1812 {
1813 	struct ublk_device *ub = data->ub;
1814 	struct ublk_io *io = &ubq->ios[tag];
1815 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1816 	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1817 	struct io_uring_cmd *cmd = data->cmd;
1818 
1819 	if (!ublk_start_io(ubq, req, io))
1820 		return false;
1821 
1822 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1823 		res = ublk_auto_buf_register(ubq, req, io, cmd,
1824 				data->issue_flags);
1825 
1826 		if (res == AUTO_BUF_REG_FAIL)
1827 			return false;
1828 	}
1829 
1830 	ublk_io_lock(io);
1831 	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1832 	ublk_io_unlock(io);
1833 
1834 	return true;
1835 }
1836 
1837 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1838 				     const struct ublk_batch_io_data *data,
1839 				     unsigned short *tag_buf,
1840 				     unsigned int len)
1841 {
1842 	bool has_unused = false;
1843 	unsigned int i;
1844 
1845 	for (i = 0; i < len; i++) {
1846 		unsigned short tag = tag_buf[i];
1847 
1848 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1849 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1850 			has_unused = true;
1851 		}
1852 	}
1853 
1854 	return has_unused;
1855 }
1856 
1857 /*
1858  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1859  * Returns the new length after filtering.
1860  */
1861 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1862 					    unsigned int len)
1863 {
1864 	unsigned int i, j;
1865 
1866 	for (i = 0, j = 0; i < len; i++) {
1867 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1868 			if (i != j)
1869 				tag_buf[j] = tag_buf[i];
1870 			j++;
1871 		}
1872 	}
1873 
1874 	return j;
1875 }
1876 
1877 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1878 		const struct ublk_batch_io_data *data,
1879 		unsigned short *tag_buf, size_t len, int ret)
1880 {
1881 	int i, res;
1882 
1883 	/*
1884 	 * Undo prep state for all IOs since userspace never received them.
1885 	 * This restores IOs to pre-prepared state so they can be cleanly
1886 	 * re-prepared when tags are pulled from FIFO again.
1887 	 */
1888 	for (i = 0; i < len; i++) {
1889 		struct ublk_io *io = &ubq->ios[tag_buf[i]];
1890 		int index = -1;
1891 
1892 		ublk_io_lock(io);
1893 		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1894 			index = io->buf.auto_reg.index;
1895 		io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1896 		io->flags |= UBLK_IO_FLAG_ACTIVE;
1897 		ublk_io_unlock(io);
1898 
1899 		if (index != -1)
1900 			io_buffer_unregister_bvec(data->cmd, index,
1901 					data->issue_flags);
1902 	}
1903 
1904 	res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1905 		tag_buf, len, &ubq->evts_lock);
1906 
1907 	pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1908 			"tags(%d %zu) ret %d\n", __func__, res, len,
1909 			ret);
1910 }
1911 
1912 #define MAX_NR_TAG 128
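/*
 * Dispatch one batch of queued tags to the server: select a provided
 * buffer from the fetch command's buffer group, pull up to MAX_NR_TAG tags
 * from the event fifo, prepare each io (mapping / auto buffer
 * registration), copy the surviving tags into the buffer and post a
 * completion on the fetch command. A positive return lets the caller keep
 * draining the fifo; zero or a negative error stops it.
 */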
1913 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1914 				 const struct ublk_batch_io_data *data,
1915 				 struct ublk_batch_fetch_cmd *fcmd)
1916 {
1917 	const unsigned int tag_sz = sizeof(unsigned short);
1918 	unsigned short tag_buf[MAX_NR_TAG];
1919 	struct io_br_sel sel;
1920 	size_t len = 0;
1921 	bool needs_filter;
1922 	int ret;
1923 
1924 	WARN_ON_ONCE(data->cmd != fcmd->cmd);
1925 
1926 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1927 					 data->issue_flags);
1928 	if (sel.val < 0)
1929 		return sel.val;
1930 	if (!sel.addr)
1931 		return -ENOBUFS;
1932 
1933 	/* a single reader needn't lock, and each kfifo element is 2 bytes */
1934 	len = min(len, sizeof(tag_buf)) / tag_sz;
1935 	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1936 
1937 	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1938 	/* Filter out unused tags before posting to userspace */
1939 	if (unlikely(needs_filter)) {
1940 		int new_len = ublk_filter_unused_tags(tag_buf, len);
1941 
1942 		/* return actual length if all are failed or requeued */
1943 		if (!new_len) {
1944 			/* release the selected buffer */
1945 			sel.val = 0;
1946 			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1947 						&sel, data->issue_flags));
1948 			return len;
1949 		}
1950 		len = new_len;
1951 	}
1952 
1953 	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1954 	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1955 	if (unlikely(ret < 0))
1956 		ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1957 	return ret;
1958 }
1959 
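/*
 * Pick the next fetch command to own dispatching for this queue. Returns
 * NULL if another fetch command is already active (that one will observe
 * the newly added tags); otherwise the first queued fetch command is
 * marked active and returned.
 */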
1960 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1961 		struct ublk_queue *ubq)
1962 {
1963 	struct ublk_batch_fetch_cmd *fcmd;
1964 
1965 	lockdep_assert_held(&ubq->evts_lock);
1966 
1967 	/*
1968 	 * Order the update of ubq->evts_fifo against checking ubq->active_fcmd.
1969 	 *
1970 	 * This pairs with the smp_mb() in ublk_batch_dispatch().
1971 	 *
1972 	 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
1973 	 * are guaranteed to be visible in ublk_batch_dispatch() by that pairing.
1974 	 */
1975 	smp_mb();
1976 	if (READ_ONCE(ubq->active_fcmd)) {
1977 		fcmd = NULL;
1978 	} else {
1979 		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1980 				struct ublk_batch_fetch_cmd, node);
1981 		WRITE_ONCE(ubq->active_fcmd, fcmd);
1982 	}
1983 	return fcmd;
1984 }
1985 
1986 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1987 {
1988 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1989 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1990 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1991 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
1992 	struct ublk_batch_io_data data = {
1993 		.ub = pdu->ubq->dev,
1994 		.cmd = fcmd->cmd,
1995 		.issue_flags = issue_flags,
1996 	};
1997 
1998 	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
1999 
2000 	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
2001 }
2002 
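/*
 * Drain the event fifo with the active fetch command, then release it.
 * If new tags raced in after the release, try to re-acquire a fetch
 * command: re-run the loop when the same command is acquired again
 * (bounded to avoid lockups), otherwise punt to task work on the newly
 * acquired command.
 */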
2003 static void
2004 ublk_batch_dispatch(struct ublk_queue *ubq,
2005 		    const struct ublk_batch_io_data *data,
2006 		    struct ublk_batch_fetch_cmd *fcmd)
2007 {
2008 	struct ublk_batch_fetch_cmd *new_fcmd;
2009 	unsigned tried = 0;
2010 	int ret = 0;
2011 
2012 again:
2013 	while (!ublk_io_evts_empty(ubq)) {
2014 		ret = __ublk_batch_dispatch(ubq, data, fcmd);
2015 		if (ret <= 0)
2016 			break;
2017 	}
2018 
2019 	if (ret < 0) {
2020 		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
2021 		return;
2022 	}
2023 
2024 	__ublk_release_fcmd(ubq);
2025 	/*
2026 	 * Order clearing ubq->active_fcmd in __ublk_release_fcmd() against
2027 	 * checking ubq->evts_fifo.
2028 	 *
2029 	 * This pairs with the smp_mb() in __ublk_acquire_fcmd().
2030 	 */
2031 	smp_mb();
2032 	if (likely(ublk_io_evts_empty(ubq)))
2033 		return;
2034 
2035 	spin_lock(&ubq->evts_lock);
2036 	new_fcmd = __ublk_acquire_fcmd(ubq);
2037 	spin_unlock(&ubq->evts_lock);
2038 
2039 	if (!new_fcmd)
2040 		return;
2041 
2042 	/* Avoid lockup by allowing to handle at most 32 batches */
2043 	if (new_fcmd == fcmd && tried++ < 32)
2044 		goto again;
2045 
2046 	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
2047 }
2048 
2049 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2050 {
2051 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2052 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2053 	struct ublk_queue *ubq = pdu->ubq;
2054 
2055 	ublk_dispatch_req(ubq, pdu->req);
2056 }
2057 
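/*
 * Queue one request tag into the per-queue event fifo. Only when @last is
 * set (end of the current dispatch batch) do we try to grab a fetch
 * command and schedule batch dispatching via task work; otherwise the
 * kick is left to ublk_commit_rqs().
 */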
2058 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2059 {
2060 	unsigned short tag = rq->tag;
2061 	struct ublk_batch_fetch_cmd *fcmd = NULL;
2062 
2063 	spin_lock(&ubq->evts_lock);
2064 	kfifo_put(&ubq->evts_fifo, tag);
2065 	if (last)
2066 		fcmd = __ublk_acquire_fcmd(ubq);
2067 	spin_unlock(&ubq->evts_lock);
2068 
2069 	if (fcmd)
2070 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2071 }
2072 
2073 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2074 {
2075 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2076 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2077 
2078 	pdu->req = rq;
2079 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2080 }
2081 
2082 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2083 {
2084 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2085 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2086 	struct request *rq = pdu->req_list;
2087 	struct request *next;
2088 
2089 	do {
2090 		next = rq->rq_next;
2091 		rq->rq_next = NULL;
2092 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2093 		rq = next;
2094 	} while (rq);
2095 }
2096 
2097 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2098 {
2099 	struct io_uring_cmd *cmd = io->cmd;
2100 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2101 
2102 	pdu->req_list = rq_list_peek(l);
2103 	rq_list_init(l);
2104 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2105 }
2106 
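/*
 * For unprivileged devices a hung server cannot be trusted forever: kill
 * the server process so the normal teardown path can reclaim the device.
 * Privileged devices simply keep resetting the timer.
 */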
2107 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2108 {
2109 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2110 	pid_t tgid = ubq->dev->ublksrv_tgid;
2111 	struct task_struct *p;
2112 	struct pid *pid;
2113 
2114 	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2115 		return BLK_EH_RESET_TIMER;
2116 
2117 	if (unlikely(!tgid))
2118 		return BLK_EH_RESET_TIMER;
2119 
2120 	rcu_read_lock();
2121 	pid = find_vpid(tgid);
2122 	p = pid_task(pid, PIDTYPE_PID);
2123 	if (p)
2124 		send_sig(SIGKILL, p, 0);
2125 	rcu_read_unlock();
2126 	return BLK_EH_DONE;
2127 }
2128 
2129 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2130 				  bool check_cancel)
2131 {
2132 	blk_status_t res;
2133 
2134 	if (unlikely(READ_ONCE(ubq->fail_io)))
2135 		return BLK_STS_TARGET;
2136 
2137 	/* With recovery feature enabled, force_abort is set in
2138 	 * ublk_stop_dev() before calling del_gendisk(). We have to
2139 	 * abort all requeued and new rqs here to let del_gendisk()
2140 	 * move on. Besides, we must not call io_uring_cmd_complete_in_task()
2141 	 * here, so as to avoid a UAF on the io_uring ctx.
2142 	 *
2143 	 * Note: force_abort is guaranteed to be seen because it is set
2144 	 * before the request queue is unquiesced.
2145 	 */
2146 	if (ublk_nosrv_should_queue_io(ubq) &&
2147 	    unlikely(READ_ONCE(ubq->force_abort)))
2148 		return BLK_STS_IOERR;
2149 
2150 	if (check_cancel && unlikely(ubq->canceling))
2151 		return BLK_STS_IOERR;
2152 
2153 	/* fill iod to slot in io cmd buffer */
2154 	res = ublk_setup_iod(ubq, rq);
2155 	if (unlikely(res != BLK_STS_OK))
2156 		return BLK_STS_IOERR;
2157 
2158 	blk_mq_start_request(rq);
2159 	return BLK_STS_OK;
2160 }
2161 
2162 /*
2163  * Common helper for queue_rq that handles request preparation and
2164  * cancellation checks. Returns status and sets should_queue to indicate
2165  * whether the caller should proceed with queuing the request.
2166  */
2167 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2168 						   struct request *rq,
2169 						   bool *should_queue)
2170 {
2171 	blk_status_t res;
2172 
2173 	res = ublk_prep_req(ubq, rq, false);
2174 	if (res != BLK_STS_OK) {
2175 		*should_queue = false;
2176 		return res;
2177 	}
2178 
2179 	/*
2180 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2181 	 * are dealt with, otherwise this request may not be failed during
2182 	 * recovery, causing a hang when deleting the disk
2183 	 */
2184 	if (unlikely(ubq->canceling)) {
2185 		*should_queue = false;
2186 		__ublk_abort_rq(ubq, rq);
2187 		return BLK_STS_OK;
2188 	}
2189 
2190 	*should_queue = true;
2191 	return BLK_STS_OK;
2192 }
2193 
2194 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2195 		const struct blk_mq_queue_data *bd)
2196 {
2197 	struct ublk_queue *ubq = hctx->driver_data;
2198 	struct request *rq = bd->rq;
2199 	bool should_queue;
2200 	blk_status_t res;
2201 
2202 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2203 	if (!should_queue)
2204 		return res;
2205 
2206 	ublk_queue_cmd(ubq, rq);
2207 	return BLK_STS_OK;
2208 }
2209 
2210 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2211 		const struct blk_mq_queue_data *bd)
2212 {
2213 	struct ublk_queue *ubq = hctx->driver_data;
2214 	struct request *rq = bd->rq;
2215 	bool should_queue;
2216 	blk_status_t res;
2217 
2218 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2219 	if (!should_queue)
2220 		return res;
2221 
2222 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2223 	return BLK_STS_OK;
2224 }
2225 
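/*
 * Two ios can share one task-work batch only if their commands belong to
 * the same io_uring context and the same daemon task.
 */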
2226 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2227 					     const struct ublk_io *io2)
2228 {
2229 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2230 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2231 		(io->task == io2->task);
2232 }
2233 
2234 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2235 {
2236 	struct ublk_queue *ubq = hctx->driver_data;
2237 	struct ublk_batch_fetch_cmd *fcmd;
2238 
2239 	spin_lock(&ubq->evts_lock);
2240 	fcmd = __ublk_acquire_fcmd(ubq);
2241 	spin_unlock(&ubq->evts_lock);
2242 
2243 	if (fcmd)
2244 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2245 }
2246 
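/*
 * Plug-based batch submission: group consecutive requests whose ios belong
 * to the same batch (see ublk_belong_to_same_batch()) and hand each group
 * to the daemon with a single task-work callback; requests that fail
 * preparation are handed back for requeueing.
 */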
2247 static void ublk_queue_rqs(struct rq_list *rqlist)
2248 {
2249 	struct rq_list requeue_list = { };
2250 	struct rq_list submit_list = { };
2251 	struct ublk_io *io = NULL;
2252 	struct request *req;
2253 
2254 	while ((req = rq_list_pop(rqlist))) {
2255 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2256 		struct ublk_io *this_io = &this_q->ios[req->tag];
2257 
2258 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2259 			rq_list_add_tail(&requeue_list, req);
2260 			continue;
2261 		}
2262 
2263 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
2264 				!rq_list_empty(&submit_list))
2265 			ublk_queue_cmd_list(io, &submit_list);
2266 		io = this_io;
2267 		rq_list_add_tail(&submit_list, req);
2268 	}
2269 
2270 	if (!rq_list_empty(&submit_list))
2271 		ublk_queue_cmd_list(io, &submit_list);
2272 	*rqlist = requeue_list;
2273 }
2274 
2275 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2276 {
2277 	unsigned short tags[MAX_NR_TAG];
2278 	struct ublk_batch_fetch_cmd *fcmd;
2279 	struct request *rq;
2280 	unsigned cnt = 0;
2281 
2282 	spin_lock(&ubq->evts_lock);
2283 	rq_list_for_each(l, rq) {
2284 		tags[cnt++] = (unsigned short)rq->tag;
2285 		if (cnt >= MAX_NR_TAG) {
2286 			kfifo_in(&ubq->evts_fifo, tags, cnt);
2287 			cnt = 0;
2288 		}
2289 	}
2290 	if (cnt)
2291 		kfifo_in(&ubq->evts_fifo, tags, cnt);
2292 	fcmd = __ublk_acquire_fcmd(ubq);
2293 	spin_unlock(&ubq->evts_lock);
2294 
2295 	rq_list_init(l);
2296 	if (fcmd)
2297 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2298 }
2299 
2300 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2301 {
2302 	struct rq_list requeue_list = { };
2303 	struct rq_list submit_list = { };
2304 	struct ublk_queue *ubq = NULL;
2305 	struct request *req;
2306 
2307 	while ((req = rq_list_pop(rqlist))) {
2308 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2309 
2310 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2311 			rq_list_add_tail(&requeue_list, req);
2312 			continue;
2313 		}
2314 
2315 		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2316 			ublk_batch_queue_cmd_list(ubq, &submit_list);
2317 		ubq = this_q;
2318 		rq_list_add_tail(&submit_list, req);
2319 	}
2320 
2321 	if (!rq_list_empty(&submit_list))
2322 		ublk_batch_queue_cmd_list(ubq, &submit_list);
2323 	*rqlist = requeue_list;
2324 }
2325 
2326 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2327 		unsigned int hctx_idx)
2328 {
2329 	struct ublk_device *ub = driver_data;
2330 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2331 
2332 	hctx->driver_data = ubq;
2333 	return 0;
2334 }
2335 
2336 static const struct blk_mq_ops ublk_mq_ops = {
2337 	.queue_rq       = ublk_queue_rq,
2338 	.queue_rqs      = ublk_queue_rqs,
2339 	.init_hctx	= ublk_init_hctx,
2340 	.timeout	= ublk_timeout,
2341 };
2342 
2343 static const struct blk_mq_ops ublk_batch_mq_ops = {
2344 	.commit_rqs	= ublk_commit_rqs,
2345 	.queue_rq       = ublk_batch_queue_rq,
2346 	.queue_rqs      = ublk_batch_queue_rqs,
2347 	.init_hctx	= ublk_init_hctx,
2348 	.timeout	= ublk_timeout,
2349 };
2350 
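/*
 * Reset per-queue io state after the server daemon has gone away, so that
 * a new daemon can fetch the ios again (e.g. user recovery): drop the old
 * task references and clear all io flags except UBLK_IO_FLAG_CANCELED.
 */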
2351 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2352 {
2353 	int i;
2354 
2355 	ubq->nr_io_ready = 0;
2356 
2357 	for (i = 0; i < ubq->q_depth; i++) {
2358 		struct ublk_io *io = &ubq->ios[i];
2359 
2360 		/*
2361 		 * UBLK_IO_FLAG_CANCELED is kept to avoid touching
2362 		 * io->cmd
2363 		 */
2364 		io->flags &= UBLK_IO_FLAG_CANCELED;
2365 		io->cmd = NULL;
2366 		io->buf.addr = 0;
2367 
2368 		/*
2369 		 * old task is PF_EXITING, put it now
2370 		 *
2371 		 * It could be NULL when closing a quiesced
2372 		 * device.
2373 		 */
2374 		if (io->task) {
2375 			put_task_struct(io->task);
2376 			io->task = NULL;
2377 		}
2378 
2379 		WARN_ON_ONCE(refcount_read(&io->ref));
2380 		WARN_ON_ONCE(io->task_registered_buffers);
2381 	}
2382 }
2383 
2384 static int ublk_ch_open(struct inode *inode, struct file *filp)
2385 {
2386 	struct ublk_device *ub = container_of(inode->i_cdev,
2387 			struct ublk_device, cdev);
2388 
2389 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2390 		return -EBUSY;
2391 	filp->private_data = ub;
2392 	ub->ublksrv_tgid = current->tgid;
2393 	return 0;
2394 }
2395 
2396 static void ublk_reset_ch_dev(struct ublk_device *ub)
2397 {
2398 	int i;
2399 
2400 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2401 		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2402 
2403 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2404 	ub->mm = NULL;
2405 	ub->nr_queue_ready = 0;
2406 	ub->unprivileged_daemons = false;
2407 	ub->ublksrv_tgid = -1;
2408 }
2409 
2410 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2411 {
2412 	struct gendisk *disk;
2413 
2414 	spin_lock(&ub->lock);
2415 	disk = ub->ub_disk;
2416 	if (disk)
2417 		get_device(disk_to_dev(disk));
2418 	spin_unlock(&ub->lock);
2419 
2420 	return disk;
2421 }
2422 
2423 static void ublk_put_disk(struct gendisk *disk)
2424 {
2425 	if (disk)
2426 		put_device(disk_to_dev(disk));
2427 }
2428 
2429 static void ublk_partition_scan_work(struct work_struct *work)
2430 {
2431 	struct ublk_device *ub =
2432 		container_of(work, struct ublk_device, partition_scan_work);
2433 	/* Hold disk reference to prevent UAF during concurrent teardown */
2434 	struct gendisk *disk = ublk_get_disk(ub);
2435 
2436 	if (!disk)
2437 		return;
2438 
2439 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2440 					     &disk->state)))
2441 		goto out;
2442 
2443 	mutex_lock(&disk->open_mutex);
2444 	bdev_disk_changed(disk, false);
2445 	mutex_unlock(&disk->open_mutex);
2446 out:
2447 	ublk_put_disk(disk);
2448 }
2449 
2450 /*
2451  * Use this function to ensure that ->canceling is consistently set for
2452  * the device and all queues. Do not set these flags directly.
2453  *
2454  * Caller must ensure that:
2455  * - cancel_mutex is held. This ensures that there is no concurrent
2456  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2457  * - there are no concurrent reads of ubq->canceling from the queue_rq
2458  *   path. This can be done by quiescing the queue, or through other
2459  *   means.
2460  */
2461 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2462 	__must_hold(&ub->cancel_mutex)
2463 {
2464 	int i;
2465 
2466 	ub->canceling = canceling;
2467 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2468 		ublk_get_queue(ub, i)->canceling = canceling;
2469 }
2470 
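/*
 * Returns true while any io still holds an active reference (e.g. a bvec
 * buffer still registered with some io_uring), in which case the release
 * work has to be retried later; otherwise the counters are reset to zero.
 * No-op for devices that don't use per-io references.
 */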
2471 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2472 {
2473 	int i, j;
2474 
2475 	if (!ublk_dev_need_req_ref(ub))
2476 		return false;
2477 
2478 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2479 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2480 
2481 		for (j = 0; j < ubq->q_depth; j++) {
2482 			struct ublk_io *io = &ubq->ios[j];
2483 			unsigned int refs = refcount_read(&io->ref) +
2484 				io->task_registered_buffers;
2485 
2486 			/*
2487 			 * UBLK_REFCOUNT_INIT or zero means no active
2488 			 * reference
2489 			 */
2490 			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2491 				return true;
2492 
2493 			/* reset to zero if the io has no active references */
2494 			refcount_set(&io->ref, 0);
2495 			io->task_registered_buffers = 0;
2496 		}
2497 	}
2498 	return false;
2499 }
2500 
2501 static void ublk_ch_release_work_fn(struct work_struct *work)
2502 {
2503 	struct ublk_device *ub =
2504 		container_of(work, struct ublk_device, exit_work.work);
2505 	struct gendisk *disk;
2506 	int i;
2507 
2508 	/*
2509 	 * For zero-copy and auto buffer register modes, I/O references
2510 	 * might not be dropped naturally when the daemon is killed, but
2511 	 * io_uring guarantees that registered bvec kernel buffers are
2512 	 * finally unregistered when the io_uring context is freed, and the
2513 	 * active references are dropped at that point.
2514 	 *
2515 	 * Wait until the active references are dropped, to avoid use-after-free.
2516 	 *
2517 	 * A registered buffer may be unregistered in io_uring's release handler,
2518 	 * so we have to wait by rescheduling this work function, to avoid a
2519 	 * dependency between the two file releases.
2520 	 */
2521 	if (ublk_check_and_reset_active_ref(ub)) {
2522 		schedule_delayed_work(&ub->exit_work, 1);
2523 		return;
2524 	}
2525 
2526 	/*
2527 	 * No disk is attached yet: either the device isn't live, or it has
2528 	 * already been removed, so there is nothing to do
2529 	 */
2530 	disk = ublk_get_disk(ub);
2531 	if (!disk)
2532 		goto out;
2533 
2534 	/*
2535 	 * All uring_cmd are done now, so abort any request outstanding to
2536 	 * the ublk server
2537 	 *
2538 	 * This can be done in a lockless way because the ublk server is
2539 	 * gone
2540 	 *
2541 	 * More importantly, we have to provide a forward-progress guarantee
2542 	 * without holding ub->mutex, otherwise a control task grabbing
2543 	 * ub->mutex triggers a deadlock
2544 	 *
2545 	 * All requests may be inflight, so ->canceling may not be set, set
2546 	 * it now.
2547 	 */
2548 	mutex_lock(&ub->cancel_mutex);
2549 	ublk_set_canceling(ub, true);
2550 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2551 		ublk_abort_queue(ub, ublk_get_queue(ub, i));
2552 	mutex_unlock(&ub->cancel_mutex);
2553 	blk_mq_kick_requeue_list(disk->queue);
2554 
2555 	/*
2556 	 * All inflight requests have been completed or requeued and any new
2557 	 * request will be failed or requeued via `->canceling` now, so it is
2558 	 * fine to grab ub->mutex now.
2559 	 */
2560 	mutex_lock(&ub->mutex);
2561 
2562 	/* double check after grabbing lock */
2563 	if (!ub->ub_disk)
2564 		goto unlock;
2565 
2566 	/*
2567 	 * Transition the device to the nosrv state. What exactly this
2568 	 * means depends on the recovery flags
2569 	 */
2570 	if (ublk_nosrv_should_stop_dev(ub)) {
2571 		/*
2572 		 * Allow any pending/future I/O to pass through quickly
2573 		 * with an error. This is needed because del_gendisk
2574 		 * waits for all pending I/O to complete
2575 		 */
2576 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2577 			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2578 
2579 		ublk_stop_dev_unlocked(ub);
2580 	} else {
2581 		if (ublk_nosrv_dev_should_queue_io(ub)) {
2582 			/* ->canceling is set and all requests are aborted */
2583 			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2584 		} else {
2585 			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2586 			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2587 				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2588 		}
2589 	}
2590 unlock:
2591 	mutex_unlock(&ub->mutex);
2592 	ublk_put_disk(disk);
2593 
2594 	/* all uring_cmd has been done now, reset device & ubq */
2595 	ublk_reset_ch_dev(ub);
2596 out:
2597 	clear_bit(UB_STATE_OPEN, &ub->state);
2598 
2599 	/* put the reference grabbed in ublk_ch_release() */
2600 	ublk_put_device(ub);
2601 }
2602 
2603 static int ublk_ch_release(struct inode *inode, struct file *filp)
2604 {
2605 	struct ublk_device *ub = filp->private_data;
2606 
2607 	/*
2608 	 * Grab a ublk device reference, so it won't go away until the
2609 	 * release work function has really finished.
2610 	 */
2611 	ublk_get_device(ub);
2612 
2613 	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2614 	schedule_delayed_work(&ub->exit_work, 0);
2615 	return 0;
2616 }
2617 
2618 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
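/*
 * Illustrative sketch of the daemon side (exact helpers are up to the
 * userspace implementation): each queue's iod array is mapped read-only
 * at a fixed per-queue offset, roughly
 *
 *	off  = UBLKSRV_CMD_BUF_OFFSET + q_id * max_cmd_buf_size;
 *	iods = mmap(NULL, cmd_buf_size, PROT_READ, MAP_SHARED, cdev_fd, off);
 *
 * where the sizes must match what ublk_queue_cmd_buf_size() reports below.
 */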
2619 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2620 {
2621 	struct ublk_device *ub = filp->private_data;
2622 	size_t sz = vma->vm_end - vma->vm_start;
2623 	unsigned max_sz = ublk_max_cmd_buf_size();
2624 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2625 	int q_id, ret = 0;
2626 
2627 	spin_lock(&ub->lock);
2628 	if (!ub->mm)
2629 		ub->mm = current->mm;
2630 	if (current->mm != ub->mm)
2631 		ret = -EINVAL;
2632 	spin_unlock(&ub->lock);
2633 
2634 	if (ret)
2635 		return ret;
2636 
2637 	if (vma->vm_flags & VM_WRITE)
2638 		return -EPERM;
2639 
2640 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2641 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2642 		return -EINVAL;
2643 
2644 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2645 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2646 			__func__, q_id, current->pid, vma->vm_start,
2647 			phys_off, (unsigned long)sz);
2648 
2649 	if (sz != ublk_queue_cmd_buf_size(ub))
2650 		return -EINVAL;
2651 
2652 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2653 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2654 }
2655 
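/*
 * Fail one request whose server has gone: requeue it when the recovery
 * policy asks for reissue, otherwise complete it with -EIO.
 */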
2656 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2657 		struct request *req)
2658 {
2659 	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2660 			io->flags & UBLK_IO_FLAG_ACTIVE);
2661 
2662 	if (ublk_nosrv_should_reissue_outstanding(ub))
2663 		blk_mq_requeue_request(req, false);
2664 	else {
2665 		io->res = -EIO;
2666 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2667 	}
2668 }
2669 
2670 /*
2671  * Request tags may have just been added to the event kfifo without getting
2672  * a chance to be dispatched; abort these requests too
2673  */
2674 static void ublk_abort_batch_queue(struct ublk_device *ub,
2675 				   struct ublk_queue *ubq)
2676 {
2677 	unsigned short tag;
2678 
2679 	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2680 		struct request *req = blk_mq_tag_to_rq(
2681 				ub->tag_set.tags[ubq->q_id], tag);
2682 
2683 		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2684 			__ublk_fail_req(ub, &ubq->ios[tag], req);
2685 	}
2686 }
2687 
2688 /*
2689  * Called from the ublk char device release handler, after every uring_cmd
2690  * is done; meanwhile the request queue is effectively "quiesced" since all
2691  * inflight requests can't be completed because the ublk server is dead.
2692  *
2693  * So no one can hold our request IO reference any more; simply ignore the
2694  * reference and complete the request immediately
2695  */
2696 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2697 {
2698 	int i;
2699 
2700 	for (i = 0; i < ubq->q_depth; i++) {
2701 		struct ublk_io *io = &ubq->ios[i];
2702 
2703 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2704 			__ublk_fail_req(ub, io, io->req);
2705 	}
2706 
2707 	if (ublk_support_batch_io(ubq))
2708 		ublk_abort_batch_queue(ub, ubq);
2709 }
2710 
2711 static void ublk_start_cancel(struct ublk_device *ub)
2712 {
2713 	struct gendisk *disk = ublk_get_disk(ub);
2714 
2715 	/* Our disk is already gone */
2716 	if (!disk)
2717 		return;
2718 
2719 	mutex_lock(&ub->cancel_mutex);
2720 	if (ub->canceling)
2721 		goto out;
2722 	/*
2723 	 * Now we are serialized with ublk_queue_rq()
2724 	 *
2725 	 * Make sure that ubq->canceling is set while the queue is quiesced,
2726 	 * because ublk_queue_rq() has to rely on this flag to avoid touching
2727 	 * a completed uring_cmd
2728 	 */
2729 	blk_mq_quiesce_queue(disk->queue);
2730 	ublk_set_canceling(ub, true);
2731 	blk_mq_unquiesce_queue(disk->queue);
2732 out:
2733 	mutex_unlock(&ub->cancel_mutex);
2734 	ublk_put_disk(disk);
2735 }
2736 
2737 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2738 		unsigned int issue_flags)
2739 {
2740 	struct ublk_io *io = &ubq->ios[tag];
2741 	struct ublk_device *ub = ubq->dev;
2742 	struct request *req;
2743 	bool done;
2744 
2745 	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2746 		return;
2747 
2748 	/*
2749 	 * Don't try to cancel this command if the request is started, to
2750 	 * avoid a race between io_uring_cmd_done() and
2751 	 * io_uring_cmd_complete_in_task().
2752 	 *
2753 	 * Either the started request will be aborted via __ublk_abort_rq(),
2754 	 * then this uring_cmd is canceled next time, or it will be done in
2755 	 * task work function ublk_dispatch_req() because io_uring guarantees
2756 	 * that ublk_dispatch_req() is always called
2757 	 */
2758 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2759 	if (req && blk_mq_request_started(req) && req->tag == tag)
2760 		return;
2761 
2762 	spin_lock(&ubq->cancel_lock);
2763 	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2764 	if (!done)
2765 		io->flags |= UBLK_IO_FLAG_CANCELED;
2766 	spin_unlock(&ubq->cancel_lock);
2767 
2768 	if (!done)
2769 		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
2770 }
2771 
2772 /*
2773  * Cancel a batch fetch command if it hasn't been claimed by another path.
2774  *
2775  * An fcmd can only be cancelled if:
2776  * 1. It's not the active_fcmd (which is currently being processed)
2777  * 2. It's still on the list (!list_empty check) - once removed from the list,
2778  *    the fcmd is considered claimed and will be freed by whoever removed it
2779  *
2780  * Use list_del_init() so subsequent list_empty() checks work correctly.
2781  */
2782 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2783 				  struct ublk_batch_fetch_cmd *fcmd,
2784 				  unsigned int issue_flags)
2785 {
2786 	bool done;
2787 
2788 	spin_lock(&ubq->evts_lock);
2789 	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2790 	if (done)
2791 		list_del_init(&fcmd->node);
2792 	spin_unlock(&ubq->evts_lock);
2793 
2794 	if (done) {
2795 		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2796 		ublk_batch_free_fcmd(fcmd);
2797 	}
2798 }
2799 
2800 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2801 {
2802 	struct ublk_batch_fetch_cmd *fcmd;
2803 	LIST_HEAD(fcmd_list);
2804 
2805 	spin_lock(&ubq->evts_lock);
2806 	ubq->force_abort = true;
2807 	list_splice_init(&ubq->fcmd_head, &fcmd_list);
2808 	fcmd = READ_ONCE(ubq->active_fcmd);
2809 	if (fcmd)
2810 		list_move(&fcmd->node, &ubq->fcmd_head);
2811 	spin_unlock(&ubq->evts_lock);
2812 
2813 	while (!list_empty(&fcmd_list)) {
2814 		fcmd = list_first_entry(&fcmd_list,
2815 				struct ublk_batch_fetch_cmd, node);
2816 		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2817 	}
2818 }
2819 
2820 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2821 				 unsigned int issue_flags)
2822 {
2823 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2824 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2825 	struct ublk_queue *ubq = pdu->ubq;
2826 
2827 	ublk_start_cancel(ubq->dev);
2828 
2829 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2830 }
2831 
2832 /*
2833  * The ublk char device won't be closed when calling cancel fn, so both
2834  * ublk device and queue are guaranteed to be live
2835  *
2836  * Two-stage cancel:
2837  *
2838  * - make every active uring_cmd done in ->cancel_fn()
2839  *
2840  * - aborting inflight ublk IO requests in ublk char device release handler,
2841  *   which depends on the 1st stage because the device can only be closed
2842  *   once all uring_cmd are done
2843  *
2844  * Do _not_ try to acquire ub->mutex before all inflight requests are
2845  * aborted, otherwise deadlock may be caused.
2846  */
2847 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2848 		unsigned int issue_flags)
2849 {
2850 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2851 	struct ublk_queue *ubq = pdu->ubq;
2852 	struct task_struct *task;
2853 	struct ublk_io *io;
2854 
2855 	if (WARN_ON_ONCE(!ubq))
2856 		return;
2857 
2858 	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2859 		return;
2860 
2861 	task = io_uring_cmd_get_task(cmd);
2862 	io = &ubq->ios[pdu->tag];
2863 	if (WARN_ON_ONCE(task && task != io->task))
2864 		return;
2865 
2866 	ublk_start_cancel(ubq->dev);
2867 
2868 	WARN_ON_ONCE(io->cmd != cmd);
2869 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2870 }
2871 
2872 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2873 {
2874 	return ubq->nr_io_ready == ubq->q_depth;
2875 }
2876 
2877 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2878 {
2879 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2880 }
2881 
2882 static void ublk_cancel_queue(struct ublk_queue *ubq)
2883 {
2884 	int i;
2885 
2886 	if (ublk_support_batch_io(ubq)) {
2887 		ublk_batch_cancel_queue(ubq);
2888 		return;
2889 	}
2890 
2891 	for (i = 0; i < ubq->q_depth; i++)
2892 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2893 }
2894 
2895 /* Cancel all pending commands, must be called after del_gendisk() returns */
2896 static void ublk_cancel_dev(struct ublk_device *ub)
2897 {
2898 	int i;
2899 
2900 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2901 		ublk_cancel_queue(ublk_get_queue(ub, i));
2902 }
2903 
2904 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2905 {
2906 	bool *idle = data;
2907 
2908 	if (blk_mq_request_started(rq)) {
2909 		*idle = false;
2910 		return false;
2911 	}
2912 	return true;
2913 }
2914 
2915 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2916 {
2917 	bool idle;
2918 
2919 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2920 	while (true) {
2921 		idle = true;
2922 		blk_mq_tagset_busy_iter(&ub->tag_set,
2923 				ublk_check_inflight_rq, &idle);
2924 		if (idle)
2925 			break;
2926 		msleep(UBLK_REQUEUE_DELAY_MS);
2927 	}
2928 }
2929 
2930 static void ublk_force_abort_dev(struct ublk_device *ub)
2931 {
2932 	int i;
2933 
2934 	pr_devel("%s: force abort ub: dev_id %d state %s\n",
2935 			__func__, ub->dev_info.dev_id,
2936 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
2937 			"LIVE" : "QUIESCED");
2938 	blk_mq_quiesce_queue(ub->ub_disk->queue);
2939 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2940 		ublk_wait_tagset_rqs_idle(ub);
2941 
2942 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2943 		ublk_get_queue(ub, i)->force_abort = true;
2944 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2945 	/* We may have requeued some rqs in ublk_quiesce_queue() */
2946 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2947 }
2948 
2949 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2950 {
2951 	struct gendisk *disk;
2952 
2953 	/* Sync with ublk_abort_queue() by holding the lock */
2954 	spin_lock(&ub->lock);
2955 	disk = ub->ub_disk;
2956 	ub->dev_info.state = UBLK_S_DEV_DEAD;
2957 	ub->dev_info.ublksrv_pid = -1;
2958 	ub->ub_disk = NULL;
2959 	spin_unlock(&ub->lock);
2960 
2961 	return disk;
2962 }
2963 
2964 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2965 	__must_hold(&ub->mutex)
2966 {
2967 	struct gendisk *disk;
2968 
2969 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2970 		return;
2971 
2972 	if (ublk_nosrv_dev_should_queue_io(ub))
2973 		ublk_force_abort_dev(ub);
2974 	del_gendisk(ub->ub_disk);
2975 	disk = ublk_detach_disk(ub);
2976 	put_disk(disk);
2977 }
2978 
2979 static void ublk_stop_dev(struct ublk_device *ub)
2980 {
2981 	mutex_lock(&ub->mutex);
2982 	ublk_stop_dev_unlocked(ub);
2983 	mutex_unlock(&ub->mutex);
2984 	cancel_work_sync(&ub->partition_scan_work);
2985 	ublk_cancel_dev(ub);
2986 }
2987 
2988 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
2989 {
2990 	/* UBLK_IO_FLAG_CANCELED can be cleared now */
2991 	spin_lock(&ubq->cancel_lock);
2992 	io->flags &= ~UBLK_IO_FLAG_CANCELED;
2993 	spin_unlock(&ubq->cancel_lock);
2994 }
2995 
2996 /* reset per-queue io flags */
2997 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
2998 {
2999 	spin_lock(&ubq->cancel_lock);
3000 	ubq->canceling = false;
3001 	spin_unlock(&ubq->cancel_lock);
3002 	ubq->fail_io = false;
3003 }
3004 
3005 /* device can only be started after all IOs are ready */
3006 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
3007 	struct ublk_io *io)
3008 	__must_hold(&ub->mutex)
3009 {
3010 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
3011 
3012 	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
3013 		ub->unprivileged_daemons = true;
3014 
3015 	ubq->nr_io_ready++;
3016 	ublk_reset_io_flags(ubq, io);
3017 
3018 	/* Check if this specific queue is now fully ready */
3019 	if (ublk_queue_ready(ubq)) {
3020 		ub->nr_queue_ready++;
3021 
3022 		/*
3023 		 * Reset queue flags as soon as this queue is ready.
3024 		 * This clears the canceling flag, allowing batch FETCH commands
3025 		 * to succeed during recovery without waiting for all queues.
3026 		 */
3027 		ublk_queue_reset_io_flags(ubq);
3028 	}
3029 
3030 	/* Check if all queues are ready */
3031 	if (ublk_dev_ready(ub)) {
3032 		/*
3033 		 * All queues ready - clear device-level canceling flag
3034 		 * and complete the recovery/initialization.
3035 		 */
3036 		mutex_lock(&ub->cancel_mutex);
3037 		ub->canceling = false;
3038 		mutex_unlock(&ub->cancel_mutex);
3039 		complete_all(&ub->completion);
3040 	}
3041 }
3042 
3043 static inline int ublk_check_cmd_op(u32 cmd_op)
3044 {
3045 	u32 ioc_type = _IOC_TYPE(cmd_op);
3046 
3047 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3048 		return -EOPNOTSUPP;
3049 
3050 	if (ioc_type != 'u' && ioc_type != 0)
3051 		return -EOPNOTSUPP;
3052 
3053 	return 0;
3054 }
3055 
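/*
 * Decode the auto buffer registration descriptor that the server passed in
 * the SQE addr field, rejecting reserved fields and unknown flags, and
 * stash it in the io for the later registration.
 */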
3056 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3057 {
3058 	struct ublk_auto_buf_reg buf;
3059 
3060 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3061 
3062 	if (buf.reserved0 || buf.reserved1)
3063 		return -EINVAL;
3064 
3065 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3066 		return -EINVAL;
3067 	io->buf.auto_reg = buf;
3068 	return 0;
3069 }
3070 
3071 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3072 				    struct io_uring_cmd *cmd,
3073 				    u16 *buf_idx)
3074 {
3075 	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3076 		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3077 
3078 		/*
3079 		 * `UBLK_F_AUTO_BUF_REG` only works when `UBLK_IO_FETCH_REQ`
3080 		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
3081 		 * `io_ring_ctx`.
3082 		 *
3083 		 * If this uring_cmd's io_ring_ctx isn't the same as the one
3084 		 * used for registering the buffer, it is the ublk server's
3085 		 * responsibility to unregister the buffer, otherwise this
3086 		 * ublk request gets stuck.
3087 		 */
3088 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3089 			*buf_idx = io->buf.auto_reg.index;
3090 	}
3091 }
3092 
3093 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3094 				    struct io_uring_cmd *cmd,
3095 				    u16 *buf_idx)
3096 {
3097 	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3098 	return ublk_set_auto_buf_reg(io, cmd);
3099 }
3100 
3101 /* Once we return, `io->req` can't be used any more */
3102 static inline struct request *
3103 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3104 {
3105 	struct request *req = io->req;
3106 
3107 	io->cmd = cmd;
3108 	io->flags |= UBLK_IO_FLAG_ACTIVE;
3109 	/* now this cmd slot is owned by ublk driver */
3110 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3111 
3112 	return req;
3113 }
3114 
3115 static inline int
3116 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3117 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3118 		   u16 *buf_idx)
3119 {
3120 	if (ublk_dev_support_auto_buf_reg(ub))
3121 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3122 
3123 	io->buf.addr = buf_addr;
3124 	return 0;
3125 }
3126 
3127 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3128 				    unsigned int issue_flags,
3129 				    struct ublk_queue *ubq, unsigned int tag)
3130 {
3131 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3132 
3133 	/*
3134 	 * Safe to refer to @ubq since the ublk_queue won't go away until
3135 	 * its commands are completed
3136 	 */
3137 	pdu->ubq = ubq;
3138 	pdu->tag = tag;
3139 	io_uring_cmd_mark_cancelable(cmd, issue_flags);
3140 }
3141 
3142 static void ublk_io_release(void *priv)
3143 {
3144 	struct request *rq = priv;
3145 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3146 	struct ublk_io *io = &ubq->ios[rq->tag];
3147 
3148 	/*
3149 	 * task_registered_buffers may be 0 if buffers were registered off task
3150 	 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3151 	 */
3152 	if (current == io->task && io->task_registered_buffers)
3153 		io->task_registered_buffers--;
3154 	else
3155 		ublk_put_req_ref(io, rq);
3156 }
3157 
3158 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3159 				struct ublk_device *ub,
3160 				u16 q_id, u16 tag,
3161 				struct ublk_io *io,
3162 				unsigned int index, unsigned int issue_flags)
3163 {
3164 	struct request *req;
3165 	int ret;
3166 
3167 	if (!ublk_dev_support_zero_copy(ub))
3168 		return -EINVAL;
3169 
3170 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3171 	if (!req)
3172 		return -EINVAL;
3173 
3174 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3175 				      issue_flags);
3176 	if (ret) {
3177 		ublk_put_req_ref(io, req);
3178 		return ret;
3179 	}
3180 
3181 	return 0;
3182 }
3183 
3184 static int
3185 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3186 			    struct ublk_device *ub,
3187 			    u16 q_id, u16 tag, struct ublk_io *io,
3188 			    unsigned index, unsigned issue_flags)
3189 {
3190 	unsigned new_registered_buffers;
3191 	struct request *req = io->req;
3192 	int ret;
3193 
3194 	/*
3195 	 * Ensure there are still references for ublk_sub_req_ref() to release.
3196 	 * If not, fall back on the thread-safe buffer registration.
3197 	 */
3198 	new_registered_buffers = io->task_registered_buffers + 1;
3199 	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3200 		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3201 					    issue_flags);
3202 
3203 	if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3204 		return -EINVAL;
3205 
3206 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3207 				      issue_flags);
3208 	if (ret)
3209 		return ret;
3210 
3211 	io->task_registered_buffers = new_registered_buffers;
3212 	return 0;
3213 }
3214 
3215 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3216 				  const struct ublk_device *ub,
3217 				  unsigned int index, unsigned int issue_flags)
3218 {
3219 	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3220 		return -EINVAL;
3221 
3222 	return io_buffer_unregister_bvec(cmd, index, issue_flags);
3223 }
3224 
3225 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3226 {
3227 	if (ublk_dev_need_map_io(ub)) {
3228 		/*
3229 		 * FETCH_RQ has to provide IO buffer if NEED GET
3230 		 * DATA is not enabled
3231 		 */
3232 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3233 			return -EINVAL;
3234 	} else if (buf_addr) {
3235 		/* User copy requires addr to be unset */
3236 		return -EINVAL;
3237 	}
3238 	return 0;
3239 }
3240 
3241 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3242 			struct ublk_io *io, u16 q_id)
3243 {
3244 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3245 	if (ublk_dev_ready(ub))
3246 		return -EBUSY;
3247 
3248 	/* allow each command to be FETCHed at most once */
3249 	if (io->flags & UBLK_IO_FLAG_ACTIVE)
3250 		return -EINVAL;
3251 
3252 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3253 
3254 	ublk_fill_io_cmd(io, cmd);
3255 
3256 	if (ublk_dev_support_batch_io(ub))
3257 		WRITE_ONCE(io->task, NULL);
3258 	else
3259 		WRITE_ONCE(io->task, get_task_struct(current));
3260 
3261 	return 0;
3262 }
3263 
3264 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3265 		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
3266 {
3267 	int ret;
3268 
3269 	/*
3270 	 * When handling FETCH command for setting up ublk uring queue,
3271 	 * ub->mutex is the innermost lock, and we won't block for handling
3272 	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3273 	 */
3274 	mutex_lock(&ub->mutex);
3275 	ret = __ublk_fetch(cmd, ub, io, q_id);
3276 	if (!ret)
3277 		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3278 	if (!ret)
3279 		ublk_mark_io_ready(ub, q_id, io);
3280 	mutex_unlock(&ub->mutex);
3281 	return ret;
3282 }
3283 
3284 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3285 				       struct ublk_io *io, __u64 buf_addr)
3286 {
3287 	struct request *req = io->req;
3288 
3289 	if (ublk_dev_need_map_io(ub)) {
3290 		/*
3291 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3292 		 * NEED GET DATA is not enabled or it is Read IO.
3293 		 */
3294 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3295 					req_op(req) == REQ_OP_READ))
3296 			return -EINVAL;
3297 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3298 		/*
3299 		 * User copy requires addr to be unset when command is
3300 		 * not zone append
3301 		 */
3302 		return -EINVAL;
3303 	}
3304 
3305 	return 0;
3306 }
3307 
3308 static bool ublk_need_complete_req(const struct ublk_device *ub,
3309 				   struct ublk_io *io)
3310 {
3311 	if (ublk_dev_need_req_ref(ub))
3312 		return ublk_sub_req_ref(io);
3313 	return true;
3314 }
3315 
3316 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3317 			  struct request *req)
3318 {
3319 	/*
3320 	 * We have handled UBLK_IO_NEED_GET_DATA command,
3321 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3322 	 * do the copy work.
3323 	 */
3324 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3325 	/* update iod->addr because ublksrv may have passed a new io buffer */
3326 	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3327 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3328 			__func__, ubq->q_id, req->tag, io->flags,
3329 			ublk_get_iod(ubq, req->tag)->addr);
3330 
3331 	return ublk_start_io(ubq, req, io);
3332 }
3333 
3334 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3335 		unsigned int issue_flags)
3336 {
3337 	/* May point to userspace-mapped memory */
3338 	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3339 							       struct ublksrv_io_cmd);
3340 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3341 	struct ublk_device *ub = cmd->file->private_data;
3342 	struct ublk_queue *ubq;
3343 	struct ublk_io *io = NULL;
3344 	u32 cmd_op = cmd->cmd_op;
3345 	u16 q_id = READ_ONCE(ub_src->q_id);
3346 	u16 tag = READ_ONCE(ub_src->tag);
3347 	s32 result = READ_ONCE(ub_src->result);
3348 	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3349 	struct request *req;
3350 	int ret;
3351 	bool compl;
3352 
3353 	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3354 
3355 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3356 			__func__, cmd->cmd_op, q_id, tag, result);
3357 
3358 	ret = ublk_check_cmd_op(cmd_op);
3359 	if (ret)
3360 		goto out;
3361 
3362 	/*
3363 	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3364 	 * so no need to validate the q_id, tag, or task
3365 	 */
3366 	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3367 		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3368 
3369 	ret = -EINVAL;
3370 	if (q_id >= ub->dev_info.nr_hw_queues)
3371 		goto out;
3372 
3373 	ubq = ublk_get_queue(ub, q_id);
3374 
3375 	if (tag >= ub->dev_info.queue_depth)
3376 		goto out;
3377 
3378 	io = &ubq->ios[tag];
3379 	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3380 	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3381 		ret = ublk_check_fetch_buf(ub, addr);
3382 		if (ret)
3383 			goto out;
3384 		ret = ublk_fetch(cmd, ub, io, addr, q_id);
3385 		if (ret)
3386 			goto out;
3387 
3388 		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3389 		return -EIOCBQUEUED;
3390 	}
3391 
3392 	if (READ_ONCE(io->task) != current) {
3393 		/*
3394 		 * ublk_register_io_buf() accesses only the io's refcount,
3395 		 * so it can be handled on any task
3396 		 */
3397 		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3398 			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3399 						    addr, issue_flags);
3400 
3401 		goto out;
3402 	}
3403 
3404 	/* there is a pending io cmd, something must be wrong */
3405 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3406 		ret = -EBUSY;
3407 		goto out;
3408 	}
3409 
3410 	/*
3411 	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3412 	 * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
3413 	 */
3414 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3415 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3416 		goto out;
3417 
3418 	switch (_IOC_NR(cmd_op)) {
3419 	case UBLK_IO_REGISTER_IO_BUF:
3420 		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3421 						   issue_flags);
3422 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
3423 		ret = ublk_check_commit_and_fetch(ub, io, addr);
3424 		if (ret)
3425 			goto out;
3426 		io->res = result;
3427 		req = ublk_fill_io_cmd(io, cmd);
3428 		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3429 		if (buf_idx != UBLK_INVALID_BUF_IDX)
3430 			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3431 		compl = ublk_need_complete_req(ub, io);
3432 
3433 		if (req_op(req) == REQ_OP_ZONE_APPEND)
3434 			req->__sector = addr;
3435 		if (compl)
3436 			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3437 
3438 		if (ret)
3439 			goto out;
3440 		break;
3441 	case UBLK_IO_NEED_GET_DATA:
3442 		/*
3443 		 * ublk_get_data() may fail and fall back to requeue, so keep
3444 		 * uring_cmd active first and prepare for handling new requeued
3445 		 * request
3446 		 */
3447 		req = ublk_fill_io_cmd(io, cmd);
3448 		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3449 		WARN_ON_ONCE(ret);
3450 		if (likely(ublk_get_data(ubq, io, req))) {
3451 			__ublk_prep_compl_io_cmd(io, req);
3452 			return UBLK_IO_RES_OK;
3453 		}
3454 		break;
3455 	default:
3456 		goto out;
3457 	}
3458 	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3459 	return -EIOCBQUEUED;
3460 
3461  out:
3462 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3463 			__func__, cmd_op, tag, ret, io ? io->flags : 0);
3464 	return ret;
3465 }
3466 
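/*
 * Look up the request for @tag and take an io reference so the request
 * can be used off the daemon task; the caller must drop the reference
 * with ublk_put_req_ref() once it is done with the request.
 */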
3467 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3468 		u16 q_id, u16 tag, struct ublk_io *io)
3469 {
3470 	struct request *req;
3471 
3472 	/*
3473 	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3474 	 * which would overwrite it with io->cmd
3475 	 */
3476 	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3477 	if (!req)
3478 		return NULL;
3479 
3480 	if (!ublk_get_req_ref(io))
3481 		return NULL;
3482 
3483 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3484 		goto fail_put;
3485 
3486 	if (!ublk_rq_has_data(req))
3487 		goto fail_put;
3488 
3489 	return req;
3490 fail_put:
3491 	ublk_put_req_ref(io, req);
3492 	return NULL;
3493 }
3494 
3495 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3496 {
3497 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3498 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3499 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3500 
3501 	if (ret != -EIOCBQUEUED)
3502 		io_uring_cmd_done(cmd, ret, issue_flags);
3503 }
3504 
3505 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3506 {
3507 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3508 		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3509 		return 0;
3510 	}
3511 
3512 	/* a well-implemented server won't hit the unlocked path */
3513 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3514 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3515 		return -EIOCBQUEUED;
3516 	}
3517 
3518 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
3519 }
3520 
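/*
 * Batch command buffer elements are variable sized: a fixed struct
 * ublk_elem_header, optionally followed by a __u64 buffer address
 * (UBLK_BATCH_F_HAS_BUF_ADDR) and then a __u64 zone LBA
 * (UBLK_BATCH_F_HAS_ZONE_LBA), in that order. The helpers below pull
 * the optional fields out of a flat element.
 */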
3521 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3522 					const struct ublk_elem_header *elem)
3523 {
3524 	const void *buf = elem;
3525 
3526 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3527 		return *(const __u64 *)(buf + sizeof(*elem));
3528 	return 0;
3529 }
3530 
3531 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3532 					const struct ublk_elem_header *elem)
3533 {
3534 	const void *buf = elem;
3535 
3536 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3537 		return *(const __u64 *)(buf + sizeof(*elem) +
3538 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3539 	return -1;
3540 }
3541 
3542 static struct ublk_auto_buf_reg
3543 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3544 			const struct ublk_elem_header *elem)
3545 {
3546 	struct ublk_auto_buf_reg reg = {
3547 		.index = elem->buf_index,
3548 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3549 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3550 	};
3551 
3552 	return reg;
3553 }
3554 
3555 /*
3556  * 48 can hold any type of buffer element (8, 16 and 24 bytes) because
3557  * it is the least common multiple (LCM) of 8, 16 and 24
3558  */
3559 #define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
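/*
 * A 480-byte bounce buffer holds a whole number of elements for every
 * supported layout (60, 30 or 20 elements of 8, 16 or 24 bytes), so
 * each copy_from_user() chunk in ublk_walk_cmd_buf() covers complete
 * elements only.
 */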
3560 struct ublk_batch_io_iter {
3561 	void __user *uaddr;
3562 	unsigned done, total;
3563 	unsigned char elem_bytes;
3564 	/* copy to this buffer from user space */
3565 	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3566 };
3567 
3568 static inline int
3569 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3570 		    struct ublk_batch_io_iter *iter,
3571 		    const struct ublk_batch_io_data *data,
3572 		    unsigned bytes,
3573 		    int (*cb)(struct ublk_queue *q,
3574 			    const struct ublk_batch_io_data *data,
3575 			    const struct ublk_elem_header *elem))
3576 {
3577 	unsigned int i;
3578 	int ret = 0;
3579 
3580 	for (i = 0; i < bytes; i += iter->elem_bytes) {
3581 		const struct ublk_elem_header *elem =
3582 			(const struct ublk_elem_header *)&iter->buf[i];
3583 
3584 		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3585 			ret = -EINVAL;
3586 			break;
3587 		}
3588 
3589 		ret = cb(ubq, data, elem);
3590 		if (unlikely(ret))
3591 			break;
3592 	}
3593 
3594 	iter->done += i;
3595 	return ret;
3596 }
3597 
3598 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3599 			     const struct ublk_batch_io_data *data,
3600 			     int (*cb)(struct ublk_queue *q,
3601 				     const struct ublk_batch_io_data *data,
3602 				     const struct ublk_elem_header *elem))
3603 {
3604 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3605 	int ret = 0;
3606 
3607 	while (iter->done < iter->total) {
3608 		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3609 
3610 		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3611 			pr_warn("ublk%d: read batch cmd buffer failed\n",
3612 					data->ub->dev_info.dev_id);
3613 			return -EFAULT;
3614 		}
3615 
3616 		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3617 		if (ret)
3618 			return ret;
3619 	}
3620 	return 0;
3621 }
3622 
3623 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3624 				const struct ublk_batch_io_data *data,
3625 				const struct ublk_elem_header *elem)
3626 {
3627 	struct ublk_io *io = &ubq->ios[elem->tag];
3628 
3629 	/*
3630 	 * If queue was ready before this decrement, it won't be anymore,
3631 	 * so we need to decrement the queue ready count and restore the
3632 	 * canceling flag to prevent new requests from being queued.
3633 	 */
3634 	if (ublk_queue_ready(ubq)) {
3635 		data->ub->nr_queue_ready--;
3636 		spin_lock(&ubq->cancel_lock);
3637 		ubq->canceling = true;
3638 		spin_unlock(&ubq->cancel_lock);
3639 	}
3640 	ubq->nr_io_ready--;
3641 
3642 	ublk_io_lock(io);
3643 	io->flags = 0;
3644 	ublk_io_unlock(io);
3645 	return 0;
3646 }
3647 
3648 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3649 				       const struct ublk_batch_io_data *data)
3650 {
3651 	int ret;
3652 
3653 	/* Re-process only what we've already processed, starting from beginning */
3654 	iter->total = iter->done;
3655 	iter->done = 0;
3656 
3657 	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3658 	WARN_ON_ONCE(ret);
3659 }
3660 
3661 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3662 			      const struct ublk_batch_io_data *data,
3663 			      const struct ublk_elem_header *elem)
3664 {
3665 	struct ublk_io *io = &ubq->ios[elem->tag];
3666 	const struct ublk_batch_io *uc = &data->header;
3667 	union ublk_io_buf buf = { 0 };
3668 	int ret;
3669 
3670 	if (ublk_dev_support_auto_buf_reg(data->ub))
3671 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3672 	else if (ublk_dev_need_map_io(data->ub)) {
3673 		buf.addr = ublk_batch_buf_addr(uc, elem);
3674 
3675 		ret = ublk_check_fetch_buf(data->ub, buf.addr);
3676 		if (ret)
3677 			return ret;
3678 	}
3679 
3680 	ublk_io_lock(io);
3681 	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3682 	if (!ret)
3683 		io->buf = buf;
3684 	ublk_io_unlock(io);
3685 
3686 	if (!ret)
3687 		ublk_mark_io_ready(data->ub, ubq->q_id, io);
3688 
3689 	return ret;
3690 }
3691 
3692 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3693 {
3694 	const struct ublk_batch_io *uc = &data->header;
3695 	struct io_uring_cmd *cmd = data->cmd;
3696 	struct ublk_batch_io_iter iter = {
3697 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3698 		.total = uc->nr_elem * uc->elem_bytes,
3699 		.elem_bytes = uc->elem_bytes,
3700 	};
3701 	int ret;
3702 
3703 	mutex_lock(&data->ub->mutex);
3704 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3705 
3706 	if (ret && iter.done)
3707 		ublk_batch_revert_prep_cmd(&iter, data);
3708 	mutex_unlock(&data->ub->mutex);
3709 	return ret;
3710 }
3711 
3712 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3713 				      struct ublk_io *io,
3714 				      union ublk_io_buf *buf)
3715 {
3716 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3717 		return -EBUSY;
3718 
3719 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3720 	if (ublk_need_map_io(ubq) && !buf->addr)
3721 		return -EINVAL;
3722 	return 0;
3723 }
3724 
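/*
 * Commit one completed IO described by a batch element: record the
 * result and buffer under the per-io lock, then handle buffer
 * unregistration and request completion outside of it.
 */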
3725 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3726 				const struct ublk_batch_io_data *data,
3727 				const struct ublk_elem_header *elem)
3728 {
3729 	struct ublk_io *io = &ubq->ios[elem->tag];
3730 	const struct ublk_batch_io *uc = &data->header;
3731 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3732 	union ublk_io_buf buf = { 0 };
3733 	struct request *req = NULL;
3734 	bool auto_reg = false;
3735 	bool compl = false;
3736 	int ret;
3737 
3738 	if (ublk_dev_support_auto_buf_reg(data->ub)) {
3739 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3740 		auto_reg = true;
3741 	} else if (ublk_dev_need_map_io(data->ub))
3742 		buf.addr = ublk_batch_buf_addr(uc, elem);
3743 
3744 	ublk_io_lock(io);
3745 	ret = ublk_batch_commit_io_check(ubq, io, &buf);
3746 	if (!ret) {
3747 		io->res = elem->result;
3748 		io->buf = buf;
3749 		req = ublk_fill_io_cmd(io, data->cmd);
3750 
3751 		if (auto_reg)
3752 			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3753 		compl = ublk_need_complete_req(data->ub, io);
3754 	}
3755 	ublk_io_unlock(io);
3756 
3757 	if (unlikely(ret)) {
3758 		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3759 			__func__, data->ub->dev_info.dev_id, ubq->q_id,
3760 			elem->tag, ret);
3761 		return ret;
3762 	}
3763 
3764 	if (buf_idx != UBLK_INVALID_BUF_IDX)
3765 		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3766 	if (req_op(req) == REQ_OP_ZONE_APPEND)
3767 		req->__sector = ublk_batch_zone_lba(uc, elem);
3768 	if (compl)
3769 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3770 	return 0;
3771 }
3772 
3773 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3774 {
3775 	const struct ublk_batch_io *uc = &data->header;
3776 	struct io_uring_cmd *cmd = data->cmd;
3777 	struct ublk_batch_io_iter iter = {
3778 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3779 		.total = uc->nr_elem * uc->elem_bytes,
3780 		.elem_bytes = uc->elem_bytes,
3781 	};
3782 	DEFINE_IO_COMP_BATCH(iob);
3783 	int ret;
3784 
3785 	data->iob = &iob;
3786 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3787 
3788 	if (iob.complete)
3789 		iob.complete(&iob);
3790 
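	/*
	 * Report how many bytes of the element buffer were consumed; only
	 * surface an error when nothing was committed at all.
	 */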
3791 	return iter.done == 0 ? ret : iter.done;
3792 }
3793 
3794 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3795 {
3796 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
3797 
3798 	if (uc->flags & ~UBLK_BATCH_F_ALL)
3799 		return -EINVAL;
3800 
3801 	/* AUTO_BUF_REG_FALLBACK uses the element buffer index, so HAS_BUF_ADDR is not allowed */
3802 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3803 			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3804 		return -EINVAL;
3805 
3806 	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3807 		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3808 	if (uc->elem_bytes != elem_bytes)
3809 		return -EINVAL;
3810 	return 0;
3811 }
3812 
3813 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3814 {
3815 	const struct ublk_batch_io *uc = &data->header;
3816 
3817 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3818 		return -EINVAL;
3819 
3820 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
3821 		return -E2BIG;
3822 
3823 	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3824 			!ublk_dev_is_zoned(data->ub))
3825 		return -EINVAL;
3826 
3827 	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3828 			!ublk_dev_need_map_io(data->ub))
3829 		return -EINVAL;
3830 
3831 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3832 			!ublk_dev_support_auto_buf_reg(data->ub))
3833 		return -EINVAL;
3834 
3835 	return ublk_check_batch_cmd_flags(uc);
3836 }
3837 
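/*
 * Add a batch FETCH command to the queue's fcmd list, or free it and
 * return -ENODEV if the queue is aborting or canceling. If
 * __ublk_acquire_fcmd() hands back a command to run, dispatch it inline
 * when it shares an io_ring_ctx with the new command, otherwise via
 * task work; the attached command itself stays queued (-EIOCBQUEUED).
 */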
3838 static int ublk_batch_attach(struct ublk_queue *ubq,
3839 			     struct ublk_batch_io_data *data,
3840 			     struct ublk_batch_fetch_cmd *fcmd)
3841 {
3842 	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3843 	bool free = false;
3844 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3845 
3846 	spin_lock(&ubq->evts_lock);
3847 	if (unlikely(ubq->force_abort || ubq->canceling)) {
3848 		free = true;
3849 	} else {
3850 		list_add_tail(&fcmd->node, &ubq->fcmd_head);
3851 		new_fcmd = __ublk_acquire_fcmd(ubq);
3852 	}
3853 	spin_unlock(&ubq->evts_lock);
3854 
3855 	if (unlikely(free)) {
3856 		ublk_batch_free_fcmd(fcmd);
3857 		return -ENODEV;
3858 	}
3859 
3860 	pdu->ubq = ubq;
3861 	pdu->fcmd = fcmd;
3862 	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3863 
3864 	if (!new_fcmd)
3865 		goto out;
3866 
3867 	/*
3868 	 * If the two fetch commands are originated from same io_ring_ctx,
3869 	 * run batch dispatch directly. Otherwise, schedule task work for
3870 	 * doing it.
3871 	 */
3872 	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3873 			io_uring_cmd_ctx_handle(fcmd->cmd)) {
3874 		data->cmd = new_fcmd->cmd;
3875 		ublk_batch_dispatch(ubq, data, new_fcmd);
3876 	} else {
3877 		io_uring_cmd_complete_in_task(new_fcmd->cmd,
3878 				ublk_batch_tw_cb);
3879 	}
3880 out:
3881 	return -EIOCBQUEUED;
3882 }
3883 
3884 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3885 {
3886 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3887 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3888 
3889 	if (!fcmd)
3890 		return -ENOMEM;
3891 
3892 	return ublk_batch_attach(ubq, data, fcmd);
3893 }
3894 
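/*
 * UBLK_U_IO_FETCH_IO_CMDS must be issued as a multishot uring_cmd and
 * carries plain __u16 tag elements, with no batch flags allowed.
 */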
3895 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3896 {
3897 	const struct ublk_batch_io *uc = &data->header;
3898 
3899 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3900 		return -EINVAL;
3901 
3902 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3903 		return -EINVAL;
3904 
3905 	if (uc->elem_bytes != sizeof(__u16))
3906 		return -EINVAL;
3907 
3908 	if (uc->flags != 0)
3909 		return -EINVAL;
3910 
3911 	return 0;
3912 }
3913 
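/*
 * On a BATCH_IO char device, the only per-io opcodes still accepted are
 * the zero-copy buffer register/unregister commands; everything else is
 * rejected with -EOPNOTSUPP.
 */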
3914 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3915 				     unsigned int issue_flags)
3916 {
3917 	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3918 							       struct ublksrv_io_cmd);
3919 	struct ublk_device *ub = cmd->file->private_data;
3920 	unsigned tag = READ_ONCE(ub_cmd->tag);
3921 	unsigned q_id = READ_ONCE(ub_cmd->q_id);
3922 	unsigned index = READ_ONCE(ub_cmd->addr);
3923 	struct ublk_queue *ubq;
3924 	struct ublk_io *io;
3925 
3926 	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3927 		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3928 
3929 	if (q_id >= ub->dev_info.nr_hw_queues)
3930 		return -EINVAL;
3931 
3932 	if (tag >= ub->dev_info.queue_depth)
3933 		return -EINVAL;
3934 
3935 	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3936 		return -EOPNOTSUPP;
3937 
3938 	ubq = ublk_get_queue(ub, q_id);
3939 	io = &ubq->ios[tag];
3940 	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3941 			issue_flags);
3942 }
3943 
3944 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3945 				       unsigned int issue_flags)
3946 {
3947 	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3948 							  struct ublk_batch_io);
3949 	struct ublk_device *ub = cmd->file->private_data;
3950 	struct ublk_batch_io_data data = {
3951 		.ub  = ub,
3952 		.cmd = cmd,
3953 		.header = (struct ublk_batch_io) {
3954 			.q_id = READ_ONCE(uc->q_id),
3955 			.flags = READ_ONCE(uc->flags),
3956 			.nr_elem = READ_ONCE(uc->nr_elem),
3957 			.elem_bytes = READ_ONCE(uc->elem_bytes),
3958 		},
3959 		.issue_flags = issue_flags,
3960 	};
3961 	u32 cmd_op = cmd->cmd_op;
3962 	int ret = -EINVAL;
3963 
3964 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3965 		ublk_batch_cancel_fn(cmd, issue_flags);
3966 		return 0;
3967 	}
3968 
3969 	switch (cmd_op) {
3970 	case UBLK_U_IO_PREP_IO_CMDS:
3971 		ret = ublk_check_batch_cmd(&data);
3972 		if (ret)
3973 			goto out;
3974 		ret = ublk_handle_batch_prep_cmd(&data);
3975 		break;
3976 	case UBLK_U_IO_COMMIT_IO_CMDS:
3977 		ret = ublk_check_batch_cmd(&data);
3978 		if (ret)
3979 			goto out;
3980 		ret = ublk_handle_batch_commit_cmd(&data);
3981 		break;
3982 	case UBLK_U_IO_FETCH_IO_CMDS:
3983 		ret = ublk_validate_batch_fetch_cmd(&data);
3984 		if (ret)
3985 			goto out;
3986 		ret = ublk_handle_batch_fetch_cmd(&data);
3987 		break;
3988 	default:
3989 		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
3990 		break;
3991 	}
3992 out:
3993 	return ret;
3994 }
3995 
3996 static inline bool ublk_check_ubuf_dir(const struct request *req,
3997 		int ubuf_dir)
3998 {
3999 	/* copy ubuf to request pages */
4000 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
4001 	    ubuf_dir == ITER_SOURCE)
4002 		return true;
4003 
4004 	/* copy request pages to ubuf */
4005 	if ((req_op(req) == REQ_OP_WRITE ||
4006 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
4007 	    ubuf_dir == ITER_DEST)
4008 		return true;
4009 
4010 	return false;
4011 }
4012 
4013 static ssize_t
4014 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
4015 {
4016 	struct ublk_device *ub = iocb->ki_filp->private_data;
4017 	struct ublk_queue *ubq;
4018 	struct request *req;
4019 	struct ublk_io *io;
4020 	unsigned data_len;
4021 	bool is_integrity;
4022 	bool on_daemon;
4023 	size_t buf_off;
4024 	u16 tag, q_id;
4025 	ssize_t ret;
4026 
4027 	if (!user_backed_iter(iter))
4028 		return -EACCES;
4029 
4030 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
4031 		return -EACCES;
4032 
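	/*
	 * The pread()/pwrite() offset encodes the target: hw queue id, tag,
	 * offset within the request buffer, and an integrity-buffer flag
	 * (see the ublk_pos_to_*() helpers and UBLKSRV_IO_INTEGRITY_FLAG).
	 */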
4033 	tag = ublk_pos_to_tag(iocb->ki_pos);
4034 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
4035 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
4036 	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
4037 
4038 	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
4039 		return -EINVAL;
4040 
4041 	if (q_id >= ub->dev_info.nr_hw_queues)
4042 		return -EINVAL;
4043 
4044 	ubq = ublk_get_queue(ub, q_id);
4045 	if (!ublk_dev_support_user_copy(ub))
4046 		return -EACCES;
4047 
4048 	if (tag >= ub->dev_info.queue_depth)
4049 		return -EINVAL;
4050 
4051 	io = &ubq->ios[tag];
4052 	on_daemon = current == READ_ONCE(io->task);
4053 	if (on_daemon) {
4054 		/* On daemon, io can't be completed concurrently, so skip ref */
4055 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
4056 			return -EINVAL;
4057 
4058 		req = io->req;
4059 		if (!ublk_rq_has_data(req))
4060 			return -EINVAL;
4061 	} else {
4062 		req = __ublk_check_and_get_req(ub, q_id, tag, io);
4063 		if (!req)
4064 			return -EINVAL;
4065 	}
4066 
4067 	if (is_integrity) {
4068 		struct blk_integrity *bi = &req->q->limits.integrity;
4069 
4070 		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
4071 	} else {
4072 		data_len = blk_rq_bytes(req);
4073 	}
4074 	if (buf_off > data_len) {
4075 		ret = -EINVAL;
4076 		goto out;
4077 	}
4078 
4079 	if (!ublk_check_ubuf_dir(req, dir)) {
4080 		ret = -EACCES;
4081 		goto out;
4082 	}
4083 
4084 	if (is_integrity)
4085 		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4086 	else
4087 		ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4088 
4089 out:
4090 	if (!on_daemon)
4091 		ublk_put_req_ref(io, req);
4092 	return ret;
4093 }
4094 
4095 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4096 {
4097 	return ublk_user_copy(iocb, to, ITER_DEST);
4098 }
4099 
4100 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4101 {
4102 	return ublk_user_copy(iocb, from, ITER_SOURCE);
4103 }
4104 
4105 static const struct file_operations ublk_ch_fops = {
4106 	.owner = THIS_MODULE,
4107 	.open = ublk_ch_open,
4108 	.release = ublk_ch_release,
4109 	.read_iter = ublk_ch_read_iter,
4110 	.write_iter = ublk_ch_write_iter,
4111 	.uring_cmd = ublk_ch_uring_cmd,
4112 	.mmap = ublk_ch_mmap,
4113 };
4114 
4115 static const struct file_operations ublk_ch_batch_io_fops = {
4116 	.owner = THIS_MODULE,
4117 	.open = ublk_ch_open,
4118 	.release = ublk_ch_release,
4119 	.read_iter = ublk_ch_read_iter,
4120 	.write_iter = ublk_ch_write_iter,
4121 	.uring_cmd = ublk_ch_batch_io_uring_cmd,
4122 	.mmap = ublk_ch_mmap,
4123 };
4124 
4125 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4126 {
4127 	int size, i;
4128 
4129 	size = ublk_queue_cmd_buf_size(ub);
4130 
4131 	for (i = 0; i < ubq->q_depth; i++) {
4132 		struct ublk_io *io = &ubq->ios[i];
4133 		if (io->task)
4134 			put_task_struct(io->task);
4135 		WARN_ON_ONCE(refcount_read(&io->ref));
4136 		WARN_ON_ONCE(io->task_registered_buffers);
4137 	}
4138 
4139 	if (ubq->io_cmd_buf)
4140 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4141 
4142 	if (ublk_dev_support_batch_io(ub))
4143 		ublk_io_evts_deinit(ubq);
4144 
4145 	kvfree(ubq);
4146 }
4147 
4148 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4149 {
4150 	struct ublk_queue *ubq = ub->queues[q_id];
4151 
4152 	if (!ubq)
4153 		return;
4154 
4155 	__ublk_deinit_queue(ub, ubq);
4156 	ub->queues[q_id] = NULL;
4157 }
4158 
4159 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4160 {
4161 	unsigned int cpu;
4162 
4163 	/* Find first CPU mapped to this queue */
4164 	for_each_possible_cpu(cpu) {
4165 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4166 			return cpu_to_node(cpu);
4167 	}
4168 
4169 	return NUMA_NO_NODE;
4170 }
4171 
4172 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4173 {
4174 	int depth = ub->dev_info.queue_depth;
4175 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4176 	struct ublk_queue *ubq;
4177 	struct page *page;
4178 	int numa_node;
4179 	int size, i, ret;
4180 
4181 	/* Determine NUMA node based on queue's CPU affinity */
4182 	numa_node = ublk_get_queue_numa_node(ub, q_id);
4183 
4184 	/* Allocate queue structure on local NUMA node */
4185 	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4186 			    numa_node);
4187 	if (!ubq)
4188 		return -ENOMEM;
4189 
4190 	spin_lock_init(&ubq->cancel_lock);
4191 	ubq->flags = ub->dev_info.flags;
4192 	ubq->q_id = q_id;
4193 	ubq->q_depth = depth;
4194 	size = ublk_queue_cmd_buf_size(ub);
4195 
4196 	/* Allocate I/O command buffer on local NUMA node */
4197 	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4198 	if (!page) {
4199 		kvfree(ubq);
4200 		return -ENOMEM;
4201 	}
4202 	ubq->io_cmd_buf = page_address(page);
4203 
4204 	for (i = 0; i < ubq->q_depth; i++)
4205 		spin_lock_init(&ubq->ios[i].lock);
4206 
4207 	if (ublk_dev_support_batch_io(ub)) {
4208 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4209 		if (ret)
4210 			goto fail;
4211 		INIT_LIST_HEAD(&ubq->fcmd_head);
4212 	}
4213 	ub->queues[q_id] = ubq;
4214 	ubq->dev = ub;
4215 
4216 	return 0;
4217 fail:
4218 	__ublk_deinit_queue(ub, ubq);
4219 	return ret;
4220 }
4221 
4222 static void ublk_deinit_queues(struct ublk_device *ub)
4223 {
4224 	int i;
4225 
4226 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4227 		ublk_deinit_queue(ub, i);
4228 }
4229 
4230 static int ublk_init_queues(struct ublk_device *ub)
4231 {
4232 	int i, ret;
4233 
4234 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4235 		ret = ublk_init_queue(ub, i);
4236 		if (ret)
4237 			goto fail;
4238 	}
4239 
4240 	init_completion(&ub->completion);
4241 	return 0;
4242 
4243  fail:
4244 	ublk_deinit_queues(ub);
4245 	return ret;
4246 }
4247 
4248 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4249 {
4250 	int i = idx;
4251 	int err;
4252 
4253 	spin_lock(&ublk_idr_lock);
4254 	/* allocate id, if @id >= 0, we're requesting that specific id */
4255 	if (i >= 0) {
4256 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4257 		if (err == -ENOSPC)
4258 			err = -EEXIST;
4259 	} else {
4260 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4261 				GFP_NOWAIT);
4262 	}
4263 	spin_unlock(&ublk_idr_lock);
4264 
4265 	if (err >= 0)
4266 		ub->ub_number = err;
4267 
4268 	return err;
4269 }
4270 
4271 static void ublk_free_dev_number(struct ublk_device *ub)
4272 {
4273 	spin_lock(&ublk_idr_lock);
4274 	idr_remove(&ublk_index_idr, ub->ub_number);
4275 	wake_up_all(&ublk_idr_wq);
4276 	spin_unlock(&ublk_idr_lock);
4277 }
4278 
4279 static void ublk_cdev_rel(struct device *dev)
4280 {
4281 	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4282 
4283 	ublk_buf_cleanup(ub);
4284 	blk_mq_free_tag_set(&ub->tag_set);
4285 	ublk_deinit_queues(ub);
4286 	ublk_free_dev_number(ub);
4287 	mutex_destroy(&ub->mutex);
4288 	mutex_destroy(&ub->cancel_mutex);
4289 	kfree(ub);
4290 }
4291 
4292 static int ublk_add_chdev(struct ublk_device *ub)
4293 {
4294 	struct device *dev = &ub->cdev_dev;
4295 	int minor = ub->ub_number;
4296 	int ret;
4297 
4298 	dev->parent = ublk_misc.this_device;
4299 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4300 	dev->class = &ublk_chr_class;
4301 	dev->release = ublk_cdev_rel;
4302 	device_initialize(dev);
4303 
4304 	ret = dev_set_name(dev, "ublkc%d", minor);
4305 	if (ret)
4306 		goto fail;
4307 
4308 	if (ublk_dev_support_batch_io(ub))
4309 		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4310 	else
4311 		cdev_init(&ub->cdev, &ublk_ch_fops);
4312 	ret = cdev_device_add(&ub->cdev, dev);
4313 	if (ret)
4314 		goto fail;
4315 
4316 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4317 		unprivileged_ublks_added++;
4318 	return 0;
4319  fail:
4320 	put_device(dev);
4321 	return ret;
4322 }
4323 
4324 /* align max io buffer size with PAGE_SIZE */
4325 static void ublk_align_max_io_size(struct ublk_device *ub)
4326 {
4327 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4328 
4329 	ub->dev_info.max_io_buf_bytes =
4330 		round_down(max_io_bytes, PAGE_SIZE);
4331 }
4332 
4333 static int ublk_add_tag_set(struct ublk_device *ub)
4334 {
4335 	if (ublk_dev_support_batch_io(ub))
4336 		ub->tag_set.ops = &ublk_batch_mq_ops;
4337 	else
4338 		ub->tag_set.ops = &ublk_mq_ops;
4339 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4340 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4341 	ub->tag_set.numa_node = NUMA_NO_NODE;
4342 	ub->tag_set.driver_data = ub;
4343 	return blk_mq_alloc_tag_set(&ub->tag_set);
4344 }
4345 
4346 static void ublk_remove(struct ublk_device *ub)
4347 {
4348 	bool unprivileged;
4349 
4350 	ublk_stop_dev(ub);
4351 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
4352 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4353 	ublk_put_device(ub);
4354 
4355 	if (unprivileged)
4356 		unprivileged_ublks_added--;
4357 }
4358 
4359 static struct ublk_device *ublk_get_device_from_id(int idx)
4360 {
4361 	struct ublk_device *ub = NULL;
4362 
4363 	if (idx < 0)
4364 		return NULL;
4365 
4366 	spin_lock(&ublk_idr_lock);
4367 	ub = idr_find(&ublk_index_idr, idx);
4368 	if (ub)
4369 		ub = ublk_get_device(ub);
4370 	spin_unlock(&ublk_idr_lock);
4371 
4372 	return ub;
4373 }
4374 
4375 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4376 {
4377 	rcu_read_lock();
4378 	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4379 	rcu_read_unlock();
4380 
4381 	return ub->ublksrv_tgid == ublksrv_pid;
4382 }
4383 
4384 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4385 		const struct ublksrv_ctrl_cmd *header)
4386 {
4387 	const struct ublk_param_basic *p = &ub->params.basic;
4388 	int ublksrv_pid = (int)header->data[0];
4389 	struct queue_limits lim = {
4390 		.logical_block_size	= 1 << p->logical_bs_shift,
4391 		.physical_block_size	= 1 << p->physical_bs_shift,
4392 		.io_min			= 1 << p->io_min_shift,
4393 		.io_opt			= 1 << p->io_opt_shift,
4394 		.max_hw_sectors		= p->max_sectors,
4395 		.chunk_sectors		= p->chunk_sectors,
4396 		.virt_boundary_mask	= p->virt_boundary_mask,
4397 		.max_segments		= USHRT_MAX,
4398 		.max_segment_size	= UINT_MAX,
4399 		.dma_alignment		= 3,
4400 	};
4401 	struct gendisk *disk;
4402 	int ret = -EINVAL;
4403 
4404 	if (ublksrv_pid <= 0)
4405 		return -EINVAL;
4406 	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4407 		return -EINVAL;
4408 
4409 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4410 		const struct ublk_param_discard *pd = &ub->params.discard;
4411 
4412 		lim.discard_alignment = pd->discard_alignment;
4413 		lim.discard_granularity = pd->discard_granularity;
4414 		lim.max_hw_discard_sectors = pd->max_discard_sectors;
4415 		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4416 		lim.max_discard_segments = pd->max_discard_segments;
4417 	}
4418 
4419 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4420 		const struct ublk_param_zoned *p = &ub->params.zoned;
4421 
4422 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4423 			return -EOPNOTSUPP;
4424 
4425 		lim.features |= BLK_FEAT_ZONED;
4426 		lim.max_active_zones = p->max_active_zones;
4427 		lim.max_open_zones =  p->max_open_zones;
4428 		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4429 	}
4430 
4431 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4432 		lim.features |= BLK_FEAT_WRITE_CACHE;
4433 		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4434 			lim.features |= BLK_FEAT_FUA;
4435 	}
4436 
4437 	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4438 		lim.features |= BLK_FEAT_ROTATIONAL;
4439 
4440 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4441 		lim.dma_alignment = ub->params.dma.alignment;
4442 
4443 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4444 		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4445 		lim.max_segment_size = ub->params.seg.max_segment_size;
4446 		lim.max_segments = ub->params.seg.max_segments;
4447 	}
4448 
4449 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4450 		const struct ublk_param_integrity *p = &ub->params.integrity;
4451 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4452 
4453 		lim.max_integrity_segments =
4454 			p->max_integrity_segments ?: USHRT_MAX;
4455 		lim.integrity = (struct blk_integrity) {
4456 			.flags = ublk_integrity_flags(p->flags),
4457 			.csum_type = ublk_integrity_csum_type(p->csum_type),
4458 			.metadata_size = p->metadata_size,
4459 			.pi_offset = p->pi_offset,
4460 			.interval_exp = p->interval_exp,
4461 			.tag_size = p->tag_size,
4462 			.pi_tuple_size = pi_tuple_size,
4463 		};
4464 	}
4465 
4466 	if (wait_for_completion_interruptible(&ub->completion) != 0)
4467 		return -EINTR;
4468 
4469 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4470 		return -EINVAL;
4471 
4472 	mutex_lock(&ub->mutex);
4473 	/* the device may no longer be ready in case of F_BATCH */
4474 	if (!ublk_dev_ready(ub)) {
4475 		ret = -EINVAL;
4476 		goto out_unlock;
4477 	}
4478 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4479 	    test_bit(UB_STATE_USED, &ub->state)) {
4480 		ret = -EEXIST;
4481 		goto out_unlock;
4482 	}
4483 
4484 	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4485 	if (IS_ERR(disk)) {
4486 		ret = PTR_ERR(disk);
4487 		goto out_unlock;
4488 	}
4489 	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4490 	disk->fops = &ub_fops;
4491 	disk->private_data = ub;
4492 
4493 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4494 	ub->ub_disk = disk;
4495 
4496 	ublk_apply_params(ub);
4497 
4498 	/*
4499 	 * Suppress partition scan to avoid potential IO hang.
4500 	 *
4501 	 * If ublk server error occurs during partition scan, the IO may
4502 	 * wait while holding ub->mutex, which can deadlock with other
4503 	 * operations that need the mutex. Defer partition scan to async
4504 	 * work.
4505 	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4506 	 * permanently.
4507 	 */
4508 	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4509 
4510 	ublk_get_device(ub);
4511 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4512 
4513 	if (ublk_dev_is_zoned(ub)) {
4514 		ret = ublk_revalidate_disk_zones(ub);
4515 		if (ret)
4516 			goto out_put_cdev;
4517 	}
4518 
4519 	ret = add_disk(disk);
4520 	if (ret)
4521 		goto out_put_cdev;
4522 
4523 	set_bit(UB_STATE_USED, &ub->state);
4524 
4525 	/* Skip partition scan if disabled by user */
4526 	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4527 		/* don't clear it for unprivileged daemons, see comment above */
4528 		if (!ub->unprivileged_daemons)
4529 			clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4530 	} else {
4531 		/* Schedule async partition scan for trusted daemons */
4532 		if (!ub->unprivileged_daemons)
4533 			schedule_work(&ub->partition_scan_work);
4534 	}
4535 
4536 out_put_cdev:
4537 	if (ret) {
4538 		ublk_detach_disk(ub);
4539 		ublk_put_device(ub);
4540 	}
4541 	if (ret)
4542 		put_disk(disk);
4543 out_unlock:
4544 	mutex_unlock(&ub->mutex);
4545 	return ret;
4546 }
4547 
4548 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4549 		const struct ublksrv_ctrl_cmd *header)
4550 {
4551 	void __user *argp = (void __user *)(unsigned long)header->addr;
4552 	cpumask_var_t cpumask;
4553 	unsigned long queue;
4554 	unsigned int retlen;
4555 	unsigned int i;
4556 	int ret;
4557 
4558 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4559 		return -EINVAL;
4560 	if (header->len & (sizeof(unsigned long)-1))
4561 		return -EINVAL;
4562 	if (!header->addr)
4563 		return -EINVAL;
4564 
4565 	queue = header->data[0];
4566 	if (queue >= ub->dev_info.nr_hw_queues)
4567 		return -EINVAL;
4568 
4569 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4570 		return -ENOMEM;
4571 
4572 	for_each_possible_cpu(i) {
4573 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4574 			cpumask_set_cpu(i, cpumask);
4575 	}
4576 
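	/*
	 * Copy as much of the cpumask as the kernel tracks and zero out any
	 * remaining bytes of the (possibly larger) user buffer.
	 */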
4577 	ret = -EFAULT;
4578 	retlen = min_t(unsigned short, header->len, cpumask_size());
4579 	if (copy_to_user(argp, cpumask, retlen))
4580 		goto out_free_cpumask;
4581 	if (retlen != header->len &&
4582 	    clear_user(argp + retlen, header->len - retlen))
4583 		goto out_free_cpumask;
4584 
4585 	ret = 0;
4586 out_free_cpumask:
4587 	free_cpumask_var(cpumask);
4588 	return ret;
4589 }
4590 
4591 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4592 {
4593 	pr_devel("%s: dev id %d flags %llx\n", __func__,
4594 			info->dev_id, info->flags);
4595 	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4596 			info->nr_hw_queues, info->queue_depth);
4597 }
4598 
4599 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
4600 {
4601 	void __user *argp = (void __user *)(unsigned long)header->addr;
4602 	struct ublksrv_ctrl_dev_info info;
4603 	struct ublk_device *ub;
4604 	int ret = -EINVAL;
4605 
4606 	if (header->len < sizeof(info) || !header->addr)
4607 		return -EINVAL;
4608 	if (header->queue_id != (u16)-1) {
4609 		pr_warn("%s: queue_id is wrong %x\n",
4610 			__func__, header->queue_id);
4611 		return -EINVAL;
4612 	}
4613 
4614 	if (copy_from_user(&info, argp, sizeof(info)))
4615 		return -EFAULT;
4616 
4617 	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
4618 	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
4619 		return -EINVAL;
4620 
4621 	if (capable(CAP_SYS_ADMIN))
4622 		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
4623 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
4624 		return -EPERM;
4625 
4626 	/* forbid nonsense combinations of recovery flags */
4627 	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
4628 	case 0:
4629 	case UBLK_F_USER_RECOVERY:
4630 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
4631 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
4632 		break;
4633 	default:
4634 		pr_warn("%s: invalid recovery flags %llx\n", __func__,
4635 			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
4636 		return -EINVAL;
4637 	}
4638 
4639 	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
4640 		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
4641 		return -EINVAL;
4642 	}
4643 
4644 	/*
4645 	 * unprivileged device can't be trusted, but RECOVERY and
4646 	 * RECOVERY_REISSUE still may hang error handling, so can't
4647 	 * support recovery features for unprivileged ublk now
4648 	 *
4649 	 * TODO: provide forward progress for RECOVERY handler, so that
4650 	 * unprivileged device can benefit from it
4651 	 */
4652 	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
4653 		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
4654 				UBLK_F_USER_RECOVERY);
4655 
4656 		/*
4657 		 * For USER_COPY, we depend on userspace to fill the request
4658 		 * buffer by pwrite() to ublk char device, which can't be
4659 		 * used for unprivileged device
4660 		 *
4661 		 * Same with zero copy or auto buffer register.
4662 		 */
4663 		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4664 					UBLK_F_AUTO_BUF_REG))
4665 			return -EINVAL;
4666 	}
4667 
4668 	/* User copy is required to access integrity buffer */
4669 	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
4670 		return -EINVAL;
4671 
4672 	/* the created device is always owned by current user */
4673 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
4674 
4675 	if (header->dev_id != info.dev_id) {
4676 		pr_warn("%s: dev id not match %u %u\n",
4677 			__func__, header->dev_id, info.dev_id);
4678 		return -EINVAL;
4679 	}
4680 
4681 	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
4682 		pr_warn("%s: dev id is too large. Max supported is %d\n",
4683 			__func__, UBLK_MAX_UBLKS - 1);
4684 		return -EINVAL;
4685 	}
4686 
4687 	ublk_dump_dev_info(&info);
4688 
4689 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4690 	if (ret)
4691 		return ret;
4692 
4693 	ret = -EACCES;
4694 	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
4695 	    unprivileged_ublks_added >= unprivileged_ublks_max)
4696 		goto out_unlock;
4697 
4698 	ret = -ENOMEM;
4699 	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
4700 	if (!ub)
4701 		goto out_unlock;
4702 	mutex_init(&ub->mutex);
4703 	spin_lock_init(&ub->lock);
4704 	mutex_init(&ub->cancel_mutex);
4705 	mt_init(&ub->buf_tree);
4706 	ida_init(&ub->buf_ida);
4707 	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
4708 
4709 	ret = ublk_alloc_dev_number(ub, header->dev_id);
4710 	if (ret < 0)
4711 		goto out_free_ub;
4712 
4713 	memcpy(&ub->dev_info, &info, sizeof(info));
4714 
4715 	/* update device id */
4716 	ub->dev_info.dev_id = ub->ub_number;
4717 
4718 	/*
4719 	 * 64bit flags will be copied back to userspace as feature
4720 	 * negotiation result, so have to clear flags which driver
4721 	 * doesn't support yet, then userspace can get correct flags
4722 	 * (features) to handle.
4723 	 */
4724 	ub->dev_info.flags &= UBLK_F_ALL;
4725 
4726 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
4727 		UBLK_F_URING_CMD_COMP_IN_TASK |
4728 		UBLK_F_PER_IO_DAEMON |
4729 		UBLK_F_BUF_REG_OFF_DAEMON |
4730 		UBLK_F_SAFE_STOP_DEV;
4731 
4732 	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
4733 	if (ublk_dev_support_batch_io(ub))
4734 		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
4735 
4736 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
4737 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4738 				UBLK_F_AUTO_BUF_REG))
4739 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4740 
4741 	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
4742 	if (ublk_dev_support_batch_io(ub))
4743 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4744 
4745 	/*
4746 	 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
4747 	 * returning write_append_lba, which is only allowed in case of
4748 	 * user copy or zero copy
4749 	 */
4750 	if (ublk_dev_is_zoned(ub) &&
4751 	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
4752 	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
4753 		ret = -EINVAL;
4754 		goto out_free_dev_number;
4755 	}
4756 
4757 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
4758 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
4759 	ublk_align_max_io_size(ub);
4760 
4761 	ret = ublk_add_tag_set(ub);
4762 	if (ret)
4763 		goto out_free_dev_number;
4764 
4765 	ret = ublk_init_queues(ub);
4766 	if (ret)
4767 		goto out_free_tag_set;
4768 
4769 	ret = -EFAULT;
4770 	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
4771 		goto out_deinit_queues;
4772 
4773 	/*
4774 	 * Add the char dev so that ublksrv daemon can be setup.
4775 	 * ublk_add_chdev() will cleanup everything if it fails.
4776 	 */
4777 	ret = ublk_add_chdev(ub);
4778 	goto out_unlock;
4779 
4780 out_deinit_queues:
4781 	ublk_deinit_queues(ub);
4782 out_free_tag_set:
4783 	blk_mq_free_tag_set(&ub->tag_set);
4784 out_free_dev_number:
4785 	ublk_free_dev_number(ub);
4786 out_free_ub:
4787 	mutex_destroy(&ub->mutex);
4788 	mutex_destroy(&ub->cancel_mutex);
4789 	kfree(ub);
4790 out_unlock:
4791 	mutex_unlock(&ublk_ctl_mutex);
4792 	return ret;
4793 }
4794 
4795 static inline bool ublk_idr_freed(int id)
4796 {
4797 	void *ptr;
4798 
4799 	spin_lock(&ublk_idr_lock);
4800 	ptr = idr_find(&ublk_index_idr, id);
4801 	spin_unlock(&ublk_idr_lock);
4802 
4803 	return ptr == NULL;
4804 }
4805 
4806 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4807 {
4808 	struct ublk_device *ub = *p_ub;
4809 	int idx = ub->ub_number;
4810 	int ret;
4811 
4812 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4813 	if (ret)
4814 		return ret;
4815 
4816 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4817 		ublk_remove(ub);
4818 		set_bit(UB_STATE_DELETED, &ub->state);
4819 	}
4820 
4821 	/* Mark the reference as consumed */
4822 	*p_ub = NULL;
4823 	ublk_put_device(ub);
4824 	mutex_unlock(&ublk_ctl_mutex);
4825 
4826 	/*
4827 	 * Wait until the idr is removed, then it can be reused after
4828 	 * DEL_DEV command is returned.
4829 	 *
4830 	 * If we return because of a user interrupt, a future delete command
4831 	 * may come:
4832 	 *
4833 	 * - the device number isn't freed, this device won't or needn't
4834 	 *   be deleted again, since UB_STATE_DELETED is set, and device
4835 	 *   will be released after the last reference is dropped
4836 	 *
4837 	 * - the device number is freed already, we will not find this
4838 	 *   device via ublk_get_device_from_id()
4839 	 */
4840 	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4841 		return -EINTR;
4842 	return 0;
4843 }
4844 
4845 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4846 				      const struct ublksrv_ctrl_cmd *header)
4847 {
4848 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4849 			__func__, cmd_op, header->dev_id, header->queue_id,
4850 			header->data[0], header->addr, header->len);
4851 }
4852 
4853 static void ublk_ctrl_stop_dev(struct ublk_device *ub)
4854 {
4855 	ublk_stop_dev(ub);
4856 }
4857 
4858 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4859 {
4860 	struct gendisk *disk;
4861 	int ret = 0;
4862 
4863 	disk = ublk_get_disk(ub);
4864 	if (!disk)
4865 		return -ENODEV;
4866 
4867 	mutex_lock(&disk->open_mutex);
4868 	if (disk_openers(disk) > 0) {
4869 		ret = -EBUSY;
4870 		goto unlock;
4871 	}
4872 	ub->block_open = true;
4873 	/* release open_mutex as del_gendisk() will reacquire it */
4874 	mutex_unlock(&disk->open_mutex);
4875 
4876 	ublk_ctrl_stop_dev(ub);
4877 	goto out;
4878 
4879 unlock:
4880 	mutex_unlock(&disk->open_mutex);
4881 out:
4882 	ublk_put_disk(disk);
4883 	return ret;
4884 }
4885 
4886 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4887 		const struct ublksrv_ctrl_cmd *header)
4888 {
4889 	struct task_struct *p;
4890 	struct pid *pid;
4891 	struct ublksrv_ctrl_dev_info dev_info;
4892 	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4893 	void __user *argp = (void __user *)(unsigned long)header->addr;
4894 
4895 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4896 		return -EINVAL;
4897 
4898 	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4899 	dev_info.ublksrv_pid = -1;
4900 
4901 	if (init_ublksrv_tgid > 0) {
4902 		rcu_read_lock();
4903 		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4904 		p = pid_task(pid, PIDTYPE_TGID);
4905 		if (p) {
4906 			int vnr = task_tgid_vnr(p);
4907 
4908 			if (vnr)
4909 				dev_info.ublksrv_pid = vnr;
4910 		}
4911 		rcu_read_unlock();
4912 	}
4913 
4914 	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4915 		return -EFAULT;
4916 
4917 	return 0;
4918 }
4919 
4920 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4921 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4922 {
4923 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4924 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4925 
4926 	if (ub->ub_disk) {
4927 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4928 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4929 	} else {
4930 		ub->params.devt.disk_major = 0;
4931 		ub->params.devt.disk_minor = 0;
4932 	}
4933 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4934 }
4935 
4936 static int ublk_ctrl_get_params(struct ublk_device *ub,
4937 		const struct ublksrv_ctrl_cmd *header)
4938 {
4939 	void __user *argp = (void __user *)(unsigned long)header->addr;
4940 	struct ublk_params_header ph;
4941 	int ret;
4942 
4943 	if (header->len <= sizeof(ph) || !header->addr)
4944 		return -EINVAL;
4945 
4946 	if (copy_from_user(&ph, argp, sizeof(ph)))
4947 		return -EFAULT;
4948 
4949 	if (ph.len > header->len || !ph.len)
4950 		return -EINVAL;
4951 
4952 	if (ph.len > sizeof(struct ublk_params))
4953 		ph.len = sizeof(struct ublk_params);
4954 
4955 	mutex_lock(&ub->mutex);
4956 	ublk_ctrl_fill_params_devt(ub);
4957 	if (copy_to_user(argp, &ub->params, ph.len))
4958 		ret = -EFAULT;
4959 	else
4960 		ret = 0;
4961 	mutex_unlock(&ub->mutex);
4962 
4963 	return ret;
4964 }
4965 
4966 static int ublk_ctrl_set_params(struct ublk_device *ub,
4967 		const struct ublksrv_ctrl_cmd *header)
4968 {
4969 	void __user *argp = (void __user *)(unsigned long)header->addr;
4970 	struct ublk_params_header ph;
4971 	int ret = -EFAULT;
4972 
4973 	if (header->len <= sizeof(ph) || !header->addr)
4974 		return -EINVAL;
4975 
4976 	if (copy_from_user(&ph, argp, sizeof(ph)))
4977 		return -EFAULT;
4978 
4979 	if (ph.len > header->len || !ph.len || !ph.types)
4980 		return -EINVAL;
4981 
4982 	if (ph.len > sizeof(struct ublk_params))
4983 		ph.len = sizeof(struct ublk_params);
4984 
4985 	mutex_lock(&ub->mutex);
4986 	if (test_bit(UB_STATE_USED, &ub->state)) {
4987 		/*
4988 		 * Parameters can only be changed when device hasn't
4989 		 * been started yet
4990 		 */
4991 		ret = -EACCES;
4992 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
4993 		ret = -EFAULT;
4994 	} else {
4995 		/* clear all we don't support yet */
4996 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
4997 		ret = ublk_validate_params(ub);
4998 		if (ret)
4999 			ub->params.types = 0;
5000 	}
5001 	mutex_unlock(&ub->mutex);
5002 
5003 	return ret;
5004 }
5005 
5006 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
5007 {
5008 	int ret = -EINVAL;
5009 
5010 	mutex_lock(&ub->mutex);
5011 	if (ublk_nosrv_should_stop_dev(ub))
5012 		goto out_unlock;
5013 	/*
5014 	 * START_RECOVERY is only allowed after:
5015 	 *
5016 	 * (1) UB_STATE_OPEN is not set, which means the dying process has exited
5017 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
5018 	 *     released.
5019 	 *
5020 	 * and one of the following holds
5021 	 *
5022 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
5023 	 *     (a) has quiesced the request queue
5024 	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
5025 	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
5026 	 *     (d) has completed/canceled all ioucmds owned by the dying process
5027 	 *
5028 	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
5029 	 *     quiesced, but all I/O is being immediately errored
5030 	 */
5031 	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
5032 		ret = -EBUSY;
5033 		goto out_unlock;
5034 	}
5035 	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
5036 	init_completion(&ub->completion);
5037 	ret = 0;
5038  out_unlock:
5039 	mutex_unlock(&ub->mutex);
5040 	return ret;
5041 }
5042 
5043 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
5044 		const struct ublksrv_ctrl_cmd *header)
5045 {
5046 	int ublksrv_pid = (int)header->data[0];
5047 	int ret = -EINVAL;
5048 
5049 	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
5050 		 header->dev_id);
5051 
5052 	if (wait_for_completion_interruptible(&ub->completion))
5053 		return -EINTR;
5054 
5055 	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
5056 		 header->dev_id);
5057 
5058 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
5059 		return -EINVAL;
5060 
5061 	mutex_lock(&ub->mutex);
5062 	if (ublk_nosrv_should_stop_dev(ub))
5063 		goto out_unlock;
5064 
5065 	if (!ublk_dev_in_recoverable_state(ub)) {
5066 		ret = -EBUSY;
5067 		goto out_unlock;
5068 	}
5069 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
5070 	ub->dev_info.state = UBLK_S_DEV_LIVE;
5071 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
5072 			__func__, ublksrv_pid, header->dev_id);
5073 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
5074 	ret = 0;
5075  out_unlock:
5076 	mutex_unlock(&ub->mutex);
5077 	return ret;
5078 }
5079 
5080 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5081 {
5082 	void __user *argp = (void __user *)(unsigned long)header->addr;
5083 	u64 features = UBLK_F_ALL;
5084 
5085 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
5086 		return -EINVAL;
5087 
5088 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5089 		return -EFAULT;
5090 
5091 	return 0;
5092 }
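
/*
 * Illustrative sketch (not part of this driver): querying the feature mask
 * that ublk_ctrl_get_features() above reports.  Control commands are sent
 * as IORING_OP_URING_CMD on /dev/ublk-control with 128-byte SQEs, and the
 * struct ublksrv_ctrl_cmd payload lives in the SQE cmd area.  This assumes
 * liburing plus the uapi layouts in <linux/io_uring.h> and
 * <linux/ublk_cmd.h>; error handling is trimmed.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>

static int example_get_features(uint64_t *features)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct ublksrv_ctrl_cmd *cmd;
	int fd, ret;

	fd = open("/dev/ublk-control", O_RDWR);
	if (fd < 0)
		return -1;

	ret = io_uring_queue_init(4, &ring, IORING_SETUP_SQE128);
	if (ret)
		goto out_close;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0);
	sqe->cmd_op = UBLK_U_CMD_GET_FEATURES;

	cmd = (struct ublksrv_ctrl_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(*cmd));
	cmd->addr = (uint64_t)(uintptr_t)features;
	cmd->len = UBLK_FEATURES_LEN;		/* must match exactly, see above */

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		ret = cqe->res;			/* 0 on success */
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
out_close:
	close(fd);
	return ret;
}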
5093 
5094 static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5095 {
5096 	struct ublk_param_basic *p = &ub->params.basic;
5097 	u64 new_size = header->data[0];
5098 	int ret = 0;
5099 
5100 	mutex_lock(&ub->mutex);
5101 	if (!ub->ub_disk) {
5102 		ret = -ENODEV;
5103 		goto out;
5104 	}
5105 	p->dev_sectors = new_size;
5106 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5107 out:
5108 	mutex_unlock(&ub->mutex);
5109 	return ret;
5110 }
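
/*
 * Illustrative sketch (not part of this driver): UBLK_U_CMD_UPDATE_SIZE
 * needs no payload buffer; ublk_ctrl_set_size() above takes the new
 * capacity, in 512-byte sectors, straight from header->data[0].  Reuses
 * <linux/ublk_cmd.h> from the sketches above; the helper name is
 * hypothetical.
 */
static void example_fill_update_size(struct ublksrv_ctrl_cmd *hdr,
				     __u64 new_dev_sectors)
{
	hdr->data[0] = new_dev_sectors;	/* becomes params.basic.dev_sectors */
}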
5111 
5112 struct count_busy {
5113 	const struct ublk_queue *ubq;
5114 	unsigned int nr_busy;
5115 };
5116 
5117 static bool ublk_count_busy_req(struct request *rq, void *data)
5118 {
5119 	struct count_busy *idle = data;
5120 
5121 	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5122 		idle->nr_busy += 1;
5123 	return true;
5124 }
5125 
5126 /* uring_cmd is guaranteed to be active if the associated request is idle */
5127 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5128 {
5129 	struct count_busy data = {
5130 		.ubq = ubq,
5131 	};
5132 
5133 	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5134 	return data.nr_busy < ubq->q_depth;
5135 }
5136 
5137 /* Wait until each hw queue has at least one idle IO */
5138 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5139 				 unsigned int timeout_ms)
5140 {
5141 	unsigned int elapsed = 0;
5142 	int ret;
5143 
5144 	/*
5145 	 * With UBLK_F_BATCH_IO the ublk server can be notified via an existing
5146 	 * or a new fetch command, so there is no need to wait here
5147 	 */
5148 	if (ublk_dev_support_batch_io(ub))
5149 		return 0;
5150 
5151 	while (elapsed < timeout_ms && !signal_pending(current)) {
5152 		unsigned int queues_cancelable = 0;
5153 		int i;
5154 
5155 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5156 			struct ublk_queue *ubq = ublk_get_queue(ub, i);
5157 
5158 			queues_cancelable += !!ubq_has_idle_io(ubq);
5159 		}
5160 
5161 		/*
5162 		 * Each queue needs at least one active command for
5163 		 * notifying ublk server
5164 		 */
5165 		if (queues_cancelable == ub->dev_info.nr_hw_queues)
5166 			break;
5167 
5168 		msleep(UBLK_REQUEUE_DELAY_MS);
5169 		elapsed += UBLK_REQUEUE_DELAY_MS;
5170 	}
5171 
5172 	if (signal_pending(current))
5173 		ret = -EINTR;
5174 	else if (elapsed >= timeout_ms)
5175 		ret = -EBUSY;
5176 	else
5177 		ret = 0;
5178 
5179 	return ret;
5180 }
5181 
5182 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5183 				 const struct ublksrv_ctrl_cmd *header)
5184 {
5185 	/* zero means wait forever */
5186 	u64 timeout_ms = header->data[0];
5187 	struct gendisk *disk;
5188 	int ret = -ENODEV;
5189 
5190 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5191 		return -EOPNOTSUPP;
5192 
5193 	mutex_lock(&ub->mutex);
5194 	disk = ublk_get_disk(ub);
5195 	if (!disk)
5196 		goto unlock;
5197 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5198 		goto put_disk;
5199 
5200 	ret = 0;
5201 	/* already in expected state */
5202 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5203 		goto put_disk;
5204 
5205 	/* Mark the device as canceling */
5206 	mutex_lock(&ub->cancel_mutex);
5207 	blk_mq_quiesce_queue(disk->queue);
5208 	ublk_set_canceling(ub, true);
5209 	blk_mq_unquiesce_queue(disk->queue);
5210 	mutex_unlock(&ub->cancel_mutex);
5211 
5212 	if (!timeout_ms)
5213 		timeout_ms = UINT_MAX;
5214 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5215 
5216 put_disk:
5217 	ublk_put_disk(disk);
5218 unlock:
5219 	mutex_unlock(&ub->mutex);
5220 
5221 	/* Cancel pending uring_cmd */
5222 	if (!ret)
5223 		ublk_cancel_dev(ub);
5224 	return ret;
5225 }
5226 
5227 /*
5228  * All control commands are sent via /dev/ublk-control, so we have to check
5229  * the destination device's permission
5230  */
5231 static int ublk_char_dev_permission(struct ublk_device *ub,
5232 		const char *dev_path, int mask)
5233 {
5234 	int err;
5235 	struct path path;
5236 	struct kstat stat;
5237 
5238 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5239 	if (err)
5240 		return err;
5241 
5242 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5243 	if (err)
5244 		goto exit;
5245 
5246 	err = -EPERM;
5247 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5248 		goto exit;
5249 
5250 	err = inode_permission(&nop_mnt_idmap,
5251 			d_backing_inode(path.dentry), mask);
5252 exit:
5253 	path_put(&path);
5254 	return err;
5255 }
5256 
5257 /*
5258  * Lock for maple tree modification: acquire ub->mutex, then freeze queue
5259  * if device is started. If device is not yet started, only mutex is
5260  * needed since no I/O path can access the tree.
5261  *
5262  * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5263  * already holds ub->mutex when calling del_gendisk() which freezes the queue.
5264  */
5265 static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
5266 {
5267 	unsigned int memflags = 0;
5268 
5269 	mutex_lock(&ub->mutex);
5270 	if (ub->ub_disk)
5271 		memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
5272 
5273 	return memflags;
5274 }
5275 
5276 static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
5277 {
5278 	if (ub->ub_disk)
5279 		blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
5280 	mutex_unlock(&ub->mutex);
5281 }
5282 
5283 /* Erase coalesced PFN ranges from the maple tree matching buf_index */
5284 static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
5285 {
5286 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5287 	struct ublk_buf_range *range;
5288 
5289 	mas_lock(&mas);
5290 	mas_for_each(&mas, range, ULONG_MAX) {
5291 		if (range->buf_index == buf_index) {
5292 			mas_erase(&mas);
5293 			kfree(range);
5294 		}
5295 	}
5296 	mas_unlock(&mas);
5297 }
5298 
5299 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
5300 			       struct page **pages, unsigned long nr_pages,
5301 			       int index, unsigned short flags)
5302 {
5303 	unsigned long i;
5304 	int ret;
5305 
5306 	for (i = 0; i < nr_pages; i++) {
5307 		unsigned long pfn = page_to_pfn(pages[i]);
5308 		unsigned long start = i;
5309 		struct ublk_buf_range *range;
5310 
5311 		/* Find run of consecutive PFNs */
5312 		while (i + 1 < nr_pages &&
5313 		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
5314 			i++;
5315 
5316 		range = kzalloc(sizeof(*range), GFP_KERNEL);
5317 		if (!range) {
5318 			ret = -ENOMEM;
5319 			goto unwind;
5320 		}
5321 		range->buf_index = index;
5322 		range->flags = flags;
5323 		range->base_offset = start << PAGE_SHIFT;
5324 
5325 		ret = mtree_insert_range(&ub->buf_tree, pfn,
5326 					 pfn + (i - start),
5327 					 range, GFP_KERNEL);
5328 		if (ret) {
5329 			kfree(range);
5330 			goto unwind;
5331 		}
5332 	}
5333 	return 0;
5334 
5335 unwind:
5336 	ublk_buf_erase_ranges(ub, index);
5337 	return ret;
5338 }
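
/*
 * Illustrative sketch (not part of this driver): the PFN-coalescing idea of
 * __ublk_ctrl_reg_buf() above, shown on a plain array.  Physically
 * contiguous pages collapse into one range keyed by [first_pfn, last_pfn],
 * and base_offset remembers where that run starts inside the registered
 * buffer.  All names are hypothetical and PAGE_SHIFT is assumed to be 12.
 */
struct example_range {
	unsigned long first_pfn;
	unsigned long last_pfn;
	unsigned long base_offset;	/* byte offset of first_pfn in the buffer */
};

static unsigned int example_coalesce_pfns(const unsigned long *pfns,
					  unsigned long nr_pages,
					  struct example_range *out)
{
	unsigned int nr_ranges = 0;
	unsigned long i;

	for (i = 0; i < nr_pages; i++) {
		unsigned long start = i;

		/* extend the run while the PFNs stay consecutive */
		while (i + 1 < nr_pages && pfns[i + 1] == pfns[i] + 1)
			i++;

		out[nr_ranges].first_pfn = pfns[start];
		out[nr_ranges].last_pfn = pfns[i];
		out[nr_ranges].base_offset = start << 12;
		nr_ranges++;
	}
	return nr_ranges;
}

/*
 * e.g. pfns {100, 101, 102, 200} yields two ranges:
 * [100, 102] at base_offset 0 and [200, 200] at base_offset 12288.
 */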
5339 
5340 /*
5341  * Register a shared memory buffer for zero-copy I/O.
5342  * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
5343  * internally. Returns buffer index (>= 0) on success.
5344  */
5345 static int ublk_ctrl_reg_buf(struct ublk_device *ub,
5346 			     struct ublksrv_ctrl_cmd *header)
5347 {
5348 	void __user *argp = (void __user *)(unsigned long)header->addr;
5349 	struct ublk_shmem_buf_reg buf_reg;
5350 	unsigned long nr_pages;
5351 	struct page **pages = NULL;
5352 	unsigned int gup_flags;
5353 	unsigned int memflags;
5354 	long pinned;
5355 	int index;
5356 	int ret;
5357 
5358 	if (!ublk_dev_support_shmem_zc(ub))
5359 		return -EOPNOTSUPP;
5360 
5361 	memset(&buf_reg, 0, sizeof(buf_reg));
5362 	if (copy_from_user(&buf_reg, argp,
5363 			   min_t(size_t, header->len, sizeof(buf_reg))))
5364 		return -EFAULT;
5365 
5366 	if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
5367 		return -EINVAL;
5368 
5369 	if (buf_reg.reserved)
5370 		return -EINVAL;
5371 
5372 	if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
5373 	    !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
5374 		return -EINVAL;
5375 
5376 	nr_pages = buf_reg.len >> PAGE_SHIFT;
5377 
5378 	/* Pin pages before any locks (may sleep) */
5379 	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
5380 	if (!pages)
5381 		return -ENOMEM;
5382 
5383 	gup_flags = FOLL_LONGTERM;
5384 	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
5385 		gup_flags |= FOLL_WRITE;
5386 
5387 	pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
5388 	if (pinned < 0) {
5389 		ret = pinned;
5390 		goto err_free_pages;
5391 	}
5392 	if (pinned != nr_pages) {
5393 		ret = -EFAULT;
5394 		goto err_unpin;
5395 	}
5396 
5397 	memflags = ublk_lock_buf_tree(ub);
5398 
5399 	index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
5400 	if (index < 0) {
5401 		ret = index;
5402 		goto err_unlock;
5403 	}
5404 
5405 	ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
5406 	if (ret) {
5407 		ida_free(&ub->buf_ida, index);
5408 		goto err_unlock;
5409 	}
5410 
5411 	ublk_unlock_buf_tree(ub, memflags);
5412 	kvfree(pages);
5413 	return index;
5414 
5415 err_unlock:
5416 	ublk_unlock_buf_tree(ub, memflags);
5417 err_unpin:
5418 	unpin_user_pages(pages, pinned);
5419 err_free_pages:
5420 	kvfree(pages);
5421 	return ret;
5422 }
5423 
5424 static void ublk_unpin_range_pages(unsigned long base_pfn,
5425 				   unsigned long nr_pages)
5426 {
5427 #define UBLK_UNPIN_BATCH	32
5428 	struct page *pages[UBLK_UNPIN_BATCH];
5429 	unsigned long off;
5430 
5431 	for (off = 0; off < nr_pages; ) {
5432 		unsigned int batch = min_t(unsigned long,
5433 					   nr_pages - off, UBLK_UNPIN_BATCH);
5434 		unsigned int j;
5435 
5436 		for (j = 0; j < batch; j++)
5437 			pages[j] = pfn_to_page(base_pfn + off + j);
5438 		unpin_user_pages(pages, batch);
5439 		off += batch;
5440 	}
5441 }
5442 
5443 /*
5444  * Inner loop: erase up to UBLK_REMOVE_BATCH matching ranges under
5445  * mas_lock, collecting them into an xarray. Then drop the lock and
5446  * unpin pages + free ranges outside spinlock context.
5447  *
5448  * Returns true if the tree walk completed, false if more ranges remain.
5449  * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value().
5450  */
5451 #define UBLK_REMOVE_BATCH	64
5452 
5453 static bool __ublk_shmem_remove_ranges(struct ublk_device *ub,
5454 					int buf_index, int *ret)
5455 {
5456 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5457 	struct ublk_buf_range *range;
5458 	struct xarray to_unpin;
5459 	unsigned long idx;
5460 	unsigned int count = 0;
5461 	bool done = false;
5462 	void *entry;
5463 
5464 	xa_init(&to_unpin);
5465 
5466 	mas_lock(&mas);
5467 	mas_for_each(&mas, range, ULONG_MAX) {
5468 		unsigned long nr;
5469 
5470 		if (buf_index >= 0 && range->buf_index != buf_index)
5471 			continue;
5472 
5473 		*ret = 0;
5474 		nr = mas.last - mas.index + 1;
5475 		if (xa_err(xa_store(&to_unpin, mas.index,
5476 				    xa_mk_value(nr), GFP_ATOMIC)))
5477 			goto unlock;
5478 		mas_erase(&mas);
5479 		kfree(range);
5480 		if (++count >= UBLK_REMOVE_BATCH)
5481 			goto unlock;
5482 	}
5483 	done = true;
5484 unlock:
5485 	mas_unlock(&mas);
5486 
5487 	xa_for_each(&to_unpin, idx, entry)
5488 		ublk_unpin_range_pages(idx, xa_to_value(entry));
5489 	xa_destroy(&to_unpin);
5490 
5491 	return done;
5492 }
5493 
5494 /*
5495  * Remove ranges from the maple tree matching buf_index, unpin pages
5496  * and free range structs. If buf_index < 0, remove all ranges.
5497  * Processes ranges in batches to avoid holding the maple tree spinlock
5498  * across potentially expensive page unpinning.
5499  */
5500 static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index)
5501 {
5502 	int ret = -ENOENT;
5503 
5504 	while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret))
5505 		cond_resched();
5506 	return ret;
5507 }
5508 
5509 static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
5510 			       struct ublksrv_ctrl_cmd *header)
5511 {
5512 	int index = (int)header->data[0];
5513 	unsigned int memflags;
5514 	int ret;
5515 
5516 	if (!ublk_dev_support_shmem_zc(ub))
5517 		return -EOPNOTSUPP;
5518 
5519 	if (index < 0 || index > USHRT_MAX)
5520 		return -EINVAL;
5521 
5522 	memflags = ublk_lock_buf_tree(ub);
5523 
5524 	ret = ublk_shmem_remove_ranges(ub, index);
5525 	if (!ret)
5526 		ida_free(&ub->buf_ida, index);
5527 
5528 	ublk_unlock_buf_tree(ub, memflags);
5529 	return ret;
5530 }
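
/*
 * Illustrative sketch (not part of this driver): how a ublk server might
 * fill the REG_BUF / UNREG_BUF control payloads handled above.  The
 * addr/len/flags/reserved fields of struct ublk_shmem_buf_reg are inferred
 * from ublk_ctrl_reg_buf(); the exact uapi layout is an assumption, and the
 * helper names are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <linux/ublk_cmd.h>

static void example_fill_reg_buf(struct ublksrv_ctrl_cmd *hdr,
				 struct ublk_shmem_buf_reg *reg,
				 void *buf, __u64 len, bool read_only)
{
	memset(reg, 0, sizeof(*reg));
	reg->addr = (__u64)(uintptr_t)buf;	/* must be page aligned */
	reg->len = len;				/* page aligned, at most 4GB */
	if (read_only)
		reg->flags = UBLK_SHMEM_BUF_READ_ONLY;

	hdr->addr = (__u64)(uintptr_t)reg;
	hdr->len = sizeof(*reg);
	/* on success cqe->res carries the new buffer index (>= 0) */
}

static void example_fill_unreg_buf(struct ublksrv_ctrl_cmd *hdr, int index)
{
	hdr->data[0] = index;	/* index previously returned by REG_BUF */
}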
5531 
5532 static void ublk_buf_cleanup(struct ublk_device *ub)
5533 {
5534 	ublk_shmem_remove_ranges(ub, -1);
5535 	mtree_destroy(&ub->buf_tree);
5536 	ida_destroy(&ub->buf_ida);
5537 }
5538 
5539 /* Check if request pages match a registered shared memory buffer */
5540 static bool ublk_try_buf_match(struct ublk_device *ub,
5541 				   struct request *rq,
5542 				   u32 *buf_idx, u32 *buf_off)
5543 {
5544 	struct req_iterator iter;
5545 	struct bio_vec bv;
5546 	int index = -1;
5547 	unsigned long expected_offset = 0;
5548 	bool first = true;
5549 
5550 	rq_for_each_bvec(bv, rq, iter) {
5551 		unsigned long pfn = page_to_pfn(bv.bv_page);
5552 		unsigned long end_pfn = pfn +
5553 			((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
5554 		struct ublk_buf_range *range;
5555 		unsigned long off;
5556 		MA_STATE(mas, &ub->buf_tree, pfn, pfn);
5557 
5558 		range = mas_walk(&mas);
5559 		if (!range)
5560 			return false;
5561 
5562 		/* verify all pages in this bvec fall within the range */
5563 		if (end_pfn > mas.last)
5564 			return false;
5565 
5566 		off = range->base_offset +
5567 			(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
5568 
5569 		if (first) {
5570 			/* Read-only buffer can't serve READ (kernel writes) */
5571 			/* Read-only buffer can only serve WRITE (any other op makes the kernel write into it) */
5572 			    req_op(rq) != REQ_OP_WRITE)
5573 				return false;
5574 			index = range->buf_index;
5575 			expected_offset = off;
5576 			*buf_off = off;
5577 			first = false;
5578 		} else {
5579 			if (range->buf_index != index)
5580 				return false;
5581 			if (off != expected_offset)
5582 				return false;
5583 		}
5584 		expected_offset += bv.bv_len;
5585 	}
5586 
5587 	if (first)
5588 		return false;
5589 
5590 	*buf_idx = index;
5591 	return true;
5592 }
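
/*
 * Illustrative sketch (not part of this driver): the contiguity rule that
 * ublk_try_buf_match() above enforces, reduced to abstract segments.  Each
 * bio_vec resolves to (buf_index, byte offset inside the registered buffer);
 * a request only matches when every segment maps to the same buffer and the
 * offsets form one unbroken run.  Names are hypothetical.
 */
#include <stdbool.h>

struct example_seg {
	int buf_index;		/* registered buffer this page belongs to */
	unsigned long off;	/* resolved byte offset inside that buffer */
	unsigned int len;
};

static bool example_segs_form_one_run(const struct example_seg *segs,
				      unsigned int nr_segs,
				      int *buf_index, unsigned long *buf_off)
{
	unsigned long expected;
	unsigned int i;

	if (!nr_segs)
		return false;

	*buf_index = segs[0].buf_index;
	*buf_off = segs[0].off;
	expected = segs[0].off;

	for (i = 0; i < nr_segs; i++) {
		if (segs[i].buf_index != *buf_index || segs[i].off != expected)
			return false;
		expected += segs[i].len;	/* next segment must start here */
	}
	return true;
}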
5593 
5594 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5595 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5596 {
5597 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5598 	void __user *argp = (void __user *)(unsigned long)header->addr;
5599 	char *dev_path = NULL;
5600 	int ret = 0;
5601 	int mask;
5602 
5603 	if (!unprivileged) {
5604 		if (!capable(CAP_SYS_ADMIN))
5605 			return -EPERM;
5606 		/*
5607 		 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
5608 		 * char_dev_path in its payload too, since userspace may not
5609 		 * know whether the specified device was created in
5610 		 * unprivileged mode.
5611 		 */
5612 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5613 			return 0;
5614 	}
5615 
5616 	/*
5617 	 * User has to provide the char device path for unprivileged ublk
5618 	 *
5619 	 * header->addr always points to the dev path buffer, and
5620 	 * header->dev_path_len records length of dev path buffer.
5621 	 */
5622 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5623 		return -EINVAL;
5624 
5625 	if (header->len < header->dev_path_len)
5626 		return -EINVAL;
5627 
5628 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5629 	if (IS_ERR(dev_path))
5630 		return PTR_ERR(dev_path);
5631 
5632 	ret = -EINVAL;
5633 	switch (_IOC_NR(cmd_op)) {
5634 	case UBLK_CMD_GET_DEV_INFO:
5635 	case UBLK_CMD_GET_DEV_INFO2:
5636 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5637 	case UBLK_CMD_GET_PARAMS:
5638 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5639 		mask = MAY_READ;
5640 		break;
5641 	case UBLK_CMD_START_DEV:
5642 	case UBLK_CMD_STOP_DEV:
5643 	case UBLK_CMD_ADD_DEV:
5644 	case UBLK_CMD_DEL_DEV:
5645 	case UBLK_CMD_SET_PARAMS:
5646 	case UBLK_CMD_START_USER_RECOVERY:
5647 	case UBLK_CMD_END_USER_RECOVERY:
5648 	case UBLK_CMD_UPDATE_SIZE:
5649 	case UBLK_CMD_QUIESCE_DEV:
5650 	case UBLK_CMD_TRY_STOP_DEV:
5651 	case UBLK_CMD_REG_BUF:
5652 	case UBLK_CMD_UNREG_BUF:
5653 		mask = MAY_READ | MAY_WRITE;
5654 		break;
5655 	default:
5656 		goto exit;
5657 	}
5658 
5659 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5660 	if (!ret) {
5661 		header->len -= header->dev_path_len;
5662 		header->addr += header->dev_path_len;
5663 	}
5664 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5665 			__func__, ub->ub_number, cmd_op,
5666 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5667 			dev_path, ret);
5668 exit:
5669 	kfree(dev_path);
5670 	return ret;
5671 }
5672 
5673 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5674 {
5675 	switch (_IOC_NR(cmd_op)) {
5676 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5677 	case UBLK_CMD_GET_DEV_INFO:
5678 	case UBLK_CMD_GET_DEV_INFO2:
5679 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5680 		return false;
5681 	default:
5682 		return true;
5683 	}
5684 }
5685 
5686 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5687 		unsigned int issue_flags)
5688 {
5689 	/* May point to userspace-mapped memory */
5690 	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5691 								    struct ublksrv_ctrl_cmd);
5692 	struct ublksrv_ctrl_cmd header;
5693 	struct ublk_device *ub = NULL;
5694 	u32 cmd_op = cmd->cmd_op;
5695 	int ret = -EINVAL;
5696 
5697 	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5698 	    issue_flags & IO_URING_F_NONBLOCK)
5699 		return -EAGAIN;
5700 
5701 	if (!(issue_flags & IO_URING_F_SQE128))
5702 		return -EINVAL;
5703 
5704 	header.dev_id = READ_ONCE(ub_src->dev_id);
5705 	header.queue_id = READ_ONCE(ub_src->queue_id);
5706 	header.len = READ_ONCE(ub_src->len);
5707 	header.addr = READ_ONCE(ub_src->addr);
5708 	header.data[0] = READ_ONCE(ub_src->data[0]);
5709 	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5710 	ublk_ctrl_cmd_dump(cmd_op, &header);
5711 
5712 	ret = ublk_check_cmd_op(cmd_op);
5713 	if (ret)
5714 		goto out;
5715 
5716 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5717 		ret = ublk_ctrl_get_features(&header);
5718 		goto out;
5719 	}
5720 
5721 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5722 		ret = -ENODEV;
5723 		ub = ublk_get_device_from_id(header.dev_id);
5724 		if (!ub)
5725 			goto out;
5726 
5727 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5728 		if (ret)
5729 			goto put_dev;
5730 	}
5731 
5732 	switch (_IOC_NR(cmd_op)) {
5733 	case UBLK_CMD_START_DEV:
5734 		ret = ublk_ctrl_start_dev(ub, &header);
5735 		break;
5736 	case UBLK_CMD_STOP_DEV:
5737 		ublk_ctrl_stop_dev(ub);
5738 		ret = 0;
5739 		break;
5740 	case UBLK_CMD_GET_DEV_INFO:
5741 	case UBLK_CMD_GET_DEV_INFO2:
5742 		ret = ublk_ctrl_get_dev_info(ub, &header);
5743 		break;
5744 	case UBLK_CMD_ADD_DEV:
5745 		ret = ublk_ctrl_add_dev(&header);
5746 		break;
5747 	case UBLK_CMD_DEL_DEV:
5748 		ret = ublk_ctrl_del_dev(&ub, true);
5749 		break;
5750 	case UBLK_CMD_DEL_DEV_ASYNC:
5751 		ret = ublk_ctrl_del_dev(&ub, false);
5752 		break;
5753 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5754 		ret = ublk_ctrl_get_queue_affinity(ub, &header);
5755 		break;
5756 	case UBLK_CMD_GET_PARAMS:
5757 		ret = ublk_ctrl_get_params(ub, &header);
5758 		break;
5759 	case UBLK_CMD_SET_PARAMS:
5760 		ret = ublk_ctrl_set_params(ub, &header);
5761 		break;
5762 	case UBLK_CMD_START_USER_RECOVERY:
5763 		ret = ublk_ctrl_start_recovery(ub);
5764 		break;
5765 	case UBLK_CMD_END_USER_RECOVERY:
5766 		ret = ublk_ctrl_end_recovery(ub, &header);
5767 		break;
5768 	case UBLK_CMD_UPDATE_SIZE:
5769 		ret = ublk_ctrl_set_size(ub, &header);
5770 		break;
5771 	case UBLK_CMD_QUIESCE_DEV:
5772 		ret = ublk_ctrl_quiesce_dev(ub, &header);
5773 		break;
5774 	case UBLK_CMD_TRY_STOP_DEV:
5775 		ret = ublk_ctrl_try_stop_dev(ub);
5776 		break;
5777 	case UBLK_CMD_REG_BUF:
5778 		ret = ublk_ctrl_reg_buf(ub, &header);
5779 		break;
5780 	case UBLK_CMD_UNREG_BUF:
5781 		ret = ublk_ctrl_unreg_buf(ub, &header);
5782 		break;
5783 	default:
5784 		ret = -EOPNOTSUPP;
5785 		break;
5786 	}
5787 
5788  put_dev:
5789 	if (ub)
5790 		ublk_put_device(ub);
5791  out:
5792 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5793 			__func__, ret, cmd_op, header.dev_id, header.queue_id);
5794 	return ret;
5795 }
5796 
5797 static const struct file_operations ublk_ctl_fops = {
5798 	.open		= nonseekable_open,
5799 	.uring_cmd      = ublk_ctrl_uring_cmd,
5800 	.owner		= THIS_MODULE,
5801 	.llseek		= noop_llseek,
5802 };
5803 
5804 static struct miscdevice ublk_misc = {
5805 	.minor		= MISC_DYNAMIC_MINOR,
5806 	.name		= "ublk-control",
5807 	.fops		= &ublk_ctl_fops,
5808 };
5809 
5810 static int __init ublk_init(void)
5811 {
5812 	int ret;
5813 
5814 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5815 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5816 	/*
5817 	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5818 	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5819 	 */
5820 	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5821 		     UBLKSRV_IO_INTEGRITY_FLAG);
5822 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5823 
5824 	init_waitqueue_head(&ublk_idr_wq);
5825 
5826 	ret = misc_register(&ublk_misc);
5827 	if (ret)
5828 		return ret;
5829 
5830 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5831 	if (ret)
5832 		goto unregister_mis;
5833 
5834 	ret = class_register(&ublk_chr_class);
5835 	if (ret)
5836 		goto free_chrdev_region;
5837 
5838 	return 0;
5839 
5840 free_chrdev_region:
5841 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5842 unregister_mis:
5843 	misc_deregister(&ublk_misc);
5844 	return ret;
5845 }
5846 
5847 static void __exit ublk_exit(void)
5848 {
5849 	struct ublk_device *ub;
5850 	int id;
5851 
5852 	idr_for_each_entry(&ublk_index_idr, ub, id)
5853 		ublk_remove(ub);
5854 
5855 	class_unregister(&ublk_chr_class);
5856 	misc_deregister(&ublk_misc);
5857 
5858 	idr_destroy(&ublk_index_idr);
5859 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5860 }
5861 
5862 module_init(ublk_init);
5863 module_exit(ublk_exit);
5864 
5865 static int ublk_set_max_unprivileged_ublks(const char *buf,
5866 					   const struct kernel_param *kp)
5867 {
5868 	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5869 }
5870 
5871 static int ublk_get_max_unprivileged_ublks(char *buf,
5872 					   const struct kernel_param *kp)
5873 {
5874 	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5875 }
5876 
5877 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5878 	.set = ublk_set_max_unprivileged_ublks,
5879 	.get = ublk_get_max_unprivileged_ublks,
5880 };
5881 
5882 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5883 		&unprivileged_ublks_max, 0644);
5884 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to be added (default: 64)");
5885 
5886 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5887 MODULE_DESCRIPTION("Userspace block device");
5888 MODULE_LICENSE("GPL");
5889