xref: /linux/drivers/block/ublk_drv.c (revision d458a240344c4369bf6f3da203f2779515177738)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - block device whose IO is handled from userspace
4  *
5  * Takes full advantage of the io_uring passthrough command for communicating
6  * with the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <linux/maple_tree.h>
50 #include <linux/xarray.h>
51 #include <uapi/linux/fs.h>
52 #include <uapi/linux/ublk_cmd.h>
53 
54 #define UBLK_MINORS		(1U << MINORBITS)
55 
56 #define UBLK_INVALID_BUF_IDX 	((u16)-1)
57 
58 /* private ioctl command mirror */
59 #define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
60 #define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
61 #define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
62 #define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
63 #define UBLK_CMD_REG_BUF	_IOC_NR(UBLK_U_CMD_REG_BUF)
64 #define UBLK_CMD_UNREG_BUF	_IOC_NR(UBLK_U_CMD_UNREG_BUF)
65 
66 /* Default max shmem buffer size: 4GB (may be increased in future) */
67 #define UBLK_SHMEM_BUF_SIZE_MAX	(1ULL << 32)
68 
69 #define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
70 #define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
71 
72 /* All UBLK_F_* have to be included into UBLK_F_ALL */
73 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
74 		| UBLK_F_URING_CMD_COMP_IN_TASK \
75 		| UBLK_F_NEED_GET_DATA \
76 		| UBLK_F_USER_RECOVERY \
77 		| UBLK_F_USER_RECOVERY_REISSUE \
78 		| UBLK_F_UNPRIVILEGED_DEV \
79 		| UBLK_F_CMD_IOCTL_ENCODE \
80 		| UBLK_F_USER_COPY \
81 		| UBLK_F_ZONED \
82 		| UBLK_F_USER_RECOVERY_FAIL_IO \
83 		| UBLK_F_UPDATE_SIZE \
84 		| UBLK_F_AUTO_BUF_REG \
85 		| UBLK_F_QUIESCE \
86 		| UBLK_F_PER_IO_DAEMON \
87 		| UBLK_F_BUF_REG_OFF_DAEMON \
88 		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
89 		| UBLK_F_SAFE_STOP_DEV \
90 		| UBLK_F_BATCH_IO \
91 		| UBLK_F_NO_AUTO_PART_SCAN \
92 		| UBLK_F_SHMEM_ZC)
93 
94 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
95 		| UBLK_F_USER_RECOVERY_REISSUE \
96 		| UBLK_F_USER_RECOVERY_FAIL_IO)
97 
98 /* All UBLK_PARAM_TYPE_* should be included here */
99 #define UBLK_PARAM_TYPE_ALL                                \
100 	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
101 	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
102 	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
103 	 UBLK_PARAM_TYPE_INTEGRITY)
104 
105 #define UBLK_BATCH_F_ALL  \
106 	(UBLK_BATCH_F_HAS_ZONE_LBA | \
107 	 UBLK_BATCH_F_HAS_BUF_ADDR | \
108 	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
109 
110 /* ublk batch fetch uring_cmd */
111 struct ublk_batch_fetch_cmd {
112 	struct list_head node;
113 	struct io_uring_cmd *cmd;
114 	unsigned short buf_group;
115 };
116 
117 struct ublk_uring_cmd_pdu {
118 	/*
119 	 * Temporarily store requests belonging to the same batch for queuing
120 	 * them to the daemon context.
121 	 *
122 	 * They could have been stored in the request payload, but we want to
123 	 * avoid the extra pre-allocation, and the uring_cmd payload is always
124 	 * free for us.
125 	 */
126 	union {
127 		struct request *req;
128 		struct request *req_list;
129 	};
130 
131 	/*
132 	 * The following two fields are valid for this cmd's whole lifetime,
133 	 * and are set up in the ublk uring_cmd handler.
134 	 */
135 	struct ublk_queue *ubq;
136 
137 	union {
138 		u16 tag;
139 		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
140 	};
141 };
142 
143 struct ublk_batch_io_data {
144 	struct ublk_device *ub;
145 	struct io_uring_cmd *cmd;
146 	struct ublk_batch_io header;
147 	unsigned int issue_flags;
148 	struct io_comp_batch *iob;
149 };
150 
151 /*
152  * io command is active: sqe cmd is received, and its cqe isn't done
153  *
154  * If the flag is set, the io command is owned by the ublk driver and is
155  * waiting for an incoming blk-mq request from the ublk block device.
156  *
157  * If the flag is cleared, the io command has been completed and is owned
158  * by the ublk server.
159  */
160 #define UBLK_IO_FLAG_ACTIVE	0x01
161 
162 /*
163  * IO command has been completed via cqe and is being handled by ublksrv,
164  * but not committed yet
165  *
166  * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
167  * for cross verification
168  */
169 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
170 
171 /*
172  * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires the
173  * data buffer address to be fetched from ublksrv.
174  *
175  * Then, bio data could be copied into this data buffer for a WRITE request
176  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
177  */
178 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
179 
180 /*
181  * The request buffer is registered automatically, so we have to unregister
182  * it before completing this request.
183  *
184  * io_uring will unregister the buffer automatically for us when the ring exits.
185  */
186 #define UBLK_IO_FLAG_AUTO_BUF_REG 	0x10
187 
188 /* atomic RW with ubq->cancel_lock */
189 #define UBLK_IO_FLAG_CANCELED	0x80000000
190 
191 /*
192  * Initialize refcount to a large number to include any registered buffers.
193  * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
194  * any buffers registered on the io daemon task.
195  */
196 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
197 
198 /* used for UBLK_F_BATCH_IO only */
199 #define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
200 
201 union ublk_io_buf {
202 	__u64	addr;
203 	struct ublk_auto_buf_reg auto_reg;
204 };
205 
206 struct ublk_io {
207 	union ublk_io_buf buf;
208 	unsigned int flags;
209 	int res;
210 
211 	union {
212 		/* valid if UBLK_IO_FLAG_ACTIVE is set */
213 		struct io_uring_cmd *cmd;
214 		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
215 		struct request *req;
216 	};
217 
218 	struct task_struct *task;
219 
220 	/*
221 	 * The number of uses of this I/O by the ublk server
222 	 * if user copy or zero copy are enabled:
223 	 * - UBLK_REFCOUNT_INIT from dispatch to the server
224 	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
225 	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
226 	 * - 1 for each io_uring registered buffer not registered on task
227 	 * The I/O can only be completed once all references are dropped.
228 	 * User copy and buffer registration operations are only permitted
229 	 * if the reference count is nonzero.
230 	 */
231 	refcount_t ref;
232 	/* Count of buffers registered on task and not yet unregistered */
233 	unsigned task_registered_buffers;
234 
235 	void *buf_ctx_handle;
236 	spinlock_t lock;
237 } ____cacheline_aligned_in_smp;
238 
239 struct ublk_queue {
240 	int q_id;
241 	int q_depth;
242 
243 	unsigned long flags;
244 	struct ublksrv_io_desc *io_cmd_buf;
245 
246 	bool force_abort;
247 	bool canceling;
248 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
249 	spinlock_t		cancel_lock;
250 	struct ublk_device *dev;
251 	u32 nr_io_ready;
252 
253 	/*
254 	 * For supporting UBLK_F_BATCH_IO only.
255 	 *
256 	 * Inflight ublk request tag is saved in this fifo
257 	 *
258 	 * There are multiple writers from ublk_queue_rq() or ublk_queue_rqs(),
259 	 * so the lock is required for storing request tags into the fifo.
260 	 *
261 	 * There is only ever one reader, which fetches requests from the task
262 	 * work function to the ublk server, so there is no need to grab the
263 	 * lock on the reader side.
264 	 *
265 	 * Batch I/O State Management:
266 	 *
267 	 * The batch I/O system uses implicit state management based on the
268 	 * combination of three key variables below.
269 	 *
270 	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
271 	 *   No fetch commands available, events queue in evts_fifo
272 	 *
273 	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
274 	 *   Fetch commands available but none processing events
275 	 *
276 	 * - ACTIVE: active_fcmd
277 	 *   One fetch command actively processing events from evts_fifo
278 	 *
279 	 * Key Invariants:
280 	 * - At most one active_fcmd at any time (single reader)
281 	 * - active_fcmd is always from fcmd_head list when non-NULL
282 	 * - evts_fifo can be read locklessly by the single active reader
283 	 * - All state transitions require evts_lock protection
284 	 * - Multiple writers to evts_fifo require lock protection
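	 * - (An illustrative state-derivation sketch follows this struct definition)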
285 	 */
286 	struct {
287 		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
288 		spinlock_t evts_lock;
289 
290 		/* List of fetch commands available to process events */
291 		struct list_head fcmd_head;
292 
293 		/* Currently active fetch command (NULL = none active) */
294 		struct ublk_batch_fetch_cmd  *active_fcmd;
295 	} ____cacheline_aligned_in_smp;
296 
297 	struct ublk_io ios[] __counted_by(q_depth);
298 };
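The implicit IDLE/READY/ACTIVE batch state described in the comment inside this
struct is never stored explicitly. As a minimal illustrative sketch (not part of
the driver; the helper name is hypothetical), it can be derived from fcmd_head
and active_fcmd, assuming evts_lock is held by the caller:

enum ublk_batch_state { UBLK_BATCH_IDLE, UBLK_BATCH_READY, UBLK_BATCH_ACTIVE };

/* Hypothetical helper: derive the implicit batch state under evts_lock */
static inline enum ublk_batch_state ublk_example_batch_state(const struct ublk_queue *ubq)
{
	if (ubq->active_fcmd)			/* one fetch cmd is consuming evts_fifo */
		return UBLK_BATCH_ACTIVE;
	if (!list_empty(&ubq->fcmd_head))	/* fetch cmds parked, none active */
		return UBLK_BATCH_READY;
	return UBLK_BATCH_IDLE;			/* events only pile up in evts_fifo */
}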
299 
300 /* Maple tree value: maps a PFN range to buffer location */
301 struct ublk_buf_range {
302 	unsigned short buf_index;
303 	unsigned short flags;
304 	unsigned int base_offset;	/* byte offset within buffer */
305 };
306 
307 struct ublk_device {
308 	struct gendisk		*ub_disk;
309 
310 	struct ublksrv_ctrl_dev_info	dev_info;
311 
312 	struct blk_mq_tag_set	tag_set;
313 
314 	struct cdev		cdev;
315 	struct device		cdev_dev;
316 
317 #define UB_STATE_OPEN		0
318 #define UB_STATE_USED		1
319 #define UB_STATE_DELETED	2
320 	unsigned long		state;
321 	int			ub_number;
322 
323 	struct mutex		mutex;
324 
325 	spinlock_t		lock;
326 	struct mm_struct	*mm;
327 
328 	struct ublk_params	params;
329 
330 	struct completion	completion;
331 	u32			nr_queue_ready;
332 	bool 			unprivileged_daemons;
333 	struct mutex cancel_mutex;
334 	bool canceling;
335 	pid_t 	ublksrv_tgid;
336 	struct delayed_work	exit_work;
337 	struct work_struct	partition_scan_work;
338 
339 	bool			block_open; /* protected by open_mutex */
340 
341 	/* shared memory zero copy */
342 	struct maple_tree	buf_tree;
343 	struct ida		buf_ida;
344 
345 	struct ublk_queue       *queues[];
346 };
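As noted above struct ublk_buf_range, the shmem zero-copy path keeps a maple
tree (buf_tree) that maps PFN ranges to registered buffer locations. A minimal
illustrative sketch of such a lookup (not part of the driver; the helper name is
hypothetical) could be:

/* Hypothetical helper: find the ublk_buf_range covering a given pfn */
static inline struct ublk_buf_range *ublk_example_buf_range(struct ublk_device *ub,
							    unsigned long pfn)
{
	/* mtree_load() returns the value stored for the range containing pfn */
	return mtree_load(&ub->buf_tree, pfn);
}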
347 
348 /* header of ublk_params */
349 struct ublk_params_header {
350 	__u32	len;
351 	__u32	types;
352 };
353 
354 static void ublk_io_release(void *priv);
355 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
356 static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
357 				  u32 *buf_idx, u32 *buf_off);
358 static void ublk_buf_cleanup(struct ublk_device *ub);
359 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
360 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
361 		u16 q_id, u16 tag, struct ublk_io *io);
362 static inline unsigned int ublk_req_build_flags(struct request *req);
363 static void ublk_batch_dispatch(struct ublk_queue *ubq,
364 				const struct ublk_batch_io_data *data,
365 				struct ublk_batch_fetch_cmd *fcmd);
366 
367 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
368 {
369 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
370 }
371 
372 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
373 {
374 	return ubq->flags & UBLK_F_BATCH_IO;
375 }
376 
377 static inline void ublk_io_lock(struct ublk_io *io)
378 {
379 	spin_lock(&io->lock);
380 }
381 
382 static inline void ublk_io_unlock(struct ublk_io *io)
383 {
384 	spin_unlock(&io->lock);
385 }
386 
387 /* Initialize the event queue */
388 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
389 				    int numa_node)
390 {
391 	spin_lock_init(&q->evts_lock);
392 	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
393 }
394 
395 /* Check if event queue is empty */
396 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
397 {
398 	return kfifo_is_empty(&q->evts_fifo);
399 }
400 
401 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
402 {
403 	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
404 	kfifo_free(&q->evts_fifo);
405 }
406 
407 static inline struct ublksrv_io_desc *
408 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
409 {
410 	return &ubq->io_cmd_buf[tag];
411 }
412 
413 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
414 {
415 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
416 }
417 
418 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
419 {
420 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
421 }
422 
423 static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
424 {
425 	return ubq->flags & UBLK_F_SHMEM_ZC;
426 }
427 
428 static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
429 					unsigned int tag)
430 {
431 	return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
432 }
433 
434 static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
435 {
436 	return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
437 }
438 
439 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
440 {
441 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
442 }
443 
444 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
445 {
446 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
447 }
448 
449 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
450 {
451 	return ubq->flags & UBLK_F_USER_COPY;
452 }
453 
454 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
455 {
456 	return ub->dev_info.flags & UBLK_F_USER_COPY;
457 }
458 
459 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
460 {
461 	return ub->dev_info.flags & UBLK_F_ZONED;
462 }
463 
464 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
465 {
466 	return ubq->flags & UBLK_F_ZONED;
467 }
468 
469 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
470 {
471 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
472 }
473 
474 #ifdef CONFIG_BLK_DEV_ZONED
475 
476 struct ublk_zoned_report_desc {
477 	__u64 sector;
478 	__u32 operation;
479 	__u32 nr_zones;
480 };
481 
482 static DEFINE_XARRAY(ublk_zoned_report_descs);
483 
484 static int ublk_zoned_insert_report_desc(const struct request *req,
485 		struct ublk_zoned_report_desc *desc)
486 {
487 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
488 			    desc, GFP_KERNEL);
489 }
490 
491 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
492 		const struct request *req)
493 {
494 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
495 }
496 
497 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
498 		const struct request *req)
499 {
500 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
501 }
502 
503 static int ublk_get_nr_zones(const struct ublk_device *ub)
504 {
505 	const struct ublk_param_basic *p = &ub->params.basic;
506 
507 	/* Zone size is a power of 2 */
508 	return p->dev_sectors >> ilog2(p->chunk_sectors);
509 }
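A quick worked example of the formula above (illustrative numbers only): with
dev_sectors = 8388608 (4 GiB in 512-byte sectors) and chunk_sectors = 524288
(256 MiB zones), ilog2(524288) = 19, so nr_zones = 8388608 >> 19 = 16.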
510 
511 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
512 {
513 	return blk_revalidate_disk_zones(ub->ub_disk);
514 }
515 
516 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
517 {
518 	const struct ublk_param_zoned *p = &ub->params.zoned;
519 	int nr_zones;
520 
521 	if (!ublk_dev_is_zoned(ub))
522 		return -EINVAL;
523 
524 	if (!p->max_zone_append_sectors)
525 		return -EINVAL;
526 
527 	nr_zones = ublk_get_nr_zones(ub);
528 
529 	if (p->max_active_zones > nr_zones)
530 		return -EINVAL;
531 
532 	if (p->max_open_zones > nr_zones)
533 		return -EINVAL;
534 
535 	return 0;
536 }
537 
538 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
539 {
540 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
541 }
542 
543 /* Based on virtblk_alloc_report_buffer */
544 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
545 				      unsigned int nr_zones, size_t *buflen)
546 {
547 	struct request_queue *q = ublk->ub_disk->queue;
548 	size_t bufsize;
549 	void *buf;
550 
551 	nr_zones = min_t(unsigned int, nr_zones,
552 			 ublk->ub_disk->nr_zones);
553 
554 	bufsize = nr_zones * sizeof(struct blk_zone);
555 	bufsize =
556 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
557 
558 	while (bufsize >= sizeof(struct blk_zone)) {
559 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
560 		if (buf) {
561 			*buflen = bufsize;
562 			return buf;
563 		}
564 		bufsize >>= 1;
565 	}
566 
567 	*buflen = 0;
568 	return NULL;
569 }
570 
571 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
572 		      unsigned int nr_zones, struct blk_report_zones_args *args)
573 {
574 	struct ublk_device *ub = disk->private_data;
575 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
576 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
577 	unsigned int done_zones = 0;
578 	unsigned int max_zones_per_request;
579 	int ret;
580 	struct blk_zone *buffer;
581 	size_t buffer_length;
582 
583 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
584 			 nr_zones);
585 
586 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
587 	if (!buffer)
588 		return -ENOMEM;
589 
590 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
591 
592 	while (done_zones < nr_zones) {
593 		unsigned int remaining_zones = nr_zones - done_zones;
594 		unsigned int zones_in_request =
595 			min_t(unsigned int, remaining_zones, max_zones_per_request);
596 		struct request *req;
597 		struct ublk_zoned_report_desc desc;
598 		blk_status_t status;
599 
600 		memset(buffer, 0, buffer_length);
601 
602 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
603 		if (IS_ERR(req)) {
604 			ret = PTR_ERR(req);
605 			goto out;
606 		}
607 
608 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
609 		desc.sector = sector;
610 		desc.nr_zones = zones_in_request;
611 		ret = ublk_zoned_insert_report_desc(req, &desc);
612 		if (ret)
613 			goto free_req;
614 
615 		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
616 		if (ret)
617 			goto erase_desc;
618 
619 		status = blk_execute_rq(req, 0);
620 		ret = blk_status_to_errno(status);
621 erase_desc:
622 		ublk_zoned_erase_report_desc(req);
623 free_req:
624 		blk_mq_free_request(req);
625 		if (ret)
626 			goto out;
627 
628 		for (unsigned int i = 0; i < zones_in_request; i++) {
629 			struct blk_zone *zone = buffer + i;
630 
631 			/* A zero length zone means no more zones in this response */
632 			if (!zone->len)
633 				break;
634 
635 			ret = disk_report_zone(disk, zone, i, args);
636 			if (ret)
637 				goto out;
638 
639 			done_zones++;
640 			sector += zone_size_sectors;
641 
642 		}
643 	}
644 
645 	ret = done_zones;
646 
647 out:
648 	kvfree(buffer);
649 	return ret;
650 }
651 
652 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
653 					 struct request *req)
654 {
655 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
656 	struct ublk_io *io = &ubq->ios[req->tag];
657 	struct ublk_zoned_report_desc *desc;
658 	u32 ublk_op;
659 
660 	switch (req_op(req)) {
661 	case REQ_OP_ZONE_OPEN:
662 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
663 		break;
664 	case REQ_OP_ZONE_CLOSE:
665 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
666 		break;
667 	case REQ_OP_ZONE_FINISH:
668 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
669 		break;
670 	case REQ_OP_ZONE_RESET:
671 		ublk_op = UBLK_IO_OP_ZONE_RESET;
672 		break;
673 	case REQ_OP_ZONE_APPEND:
674 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
675 		break;
676 	case REQ_OP_ZONE_RESET_ALL:
677 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
678 		break;
679 	case REQ_OP_DRV_IN:
680 		desc = ublk_zoned_get_report_desc(req);
681 		if (!desc)
682 			return BLK_STS_IOERR;
683 		ublk_op = desc->operation;
684 		switch (ublk_op) {
685 		case UBLK_IO_OP_REPORT_ZONES:
686 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
687 			iod->nr_zones = desc->nr_zones;
688 			iod->start_sector = desc->sector;
689 			return BLK_STS_OK;
690 		default:
691 			return BLK_STS_IOERR;
692 		}
693 	case REQ_OP_DRV_OUT:
694 		/* We do not support drv_out */
695 		return BLK_STS_NOTSUPP;
696 	default:
697 		return BLK_STS_IOERR;
698 	}
699 
700 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
701 	iod->nr_sectors = blk_rq_sectors(req);
702 	iod->start_sector = blk_rq_pos(req);
703 	iod->addr = io->buf.addr;
704 
705 	return BLK_STS_OK;
706 }
707 
708 #else
709 
710 #define ublk_report_zones (NULL)
711 
712 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
713 {
714 	return -EOPNOTSUPP;
715 }
716 
717 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
718 {
719 }
720 
721 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
722 {
723 	return 0;
724 }
725 
726 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
727 					 struct request *req)
728 {
729 	return BLK_STS_NOTSUPP;
730 }
731 
732 #endif
733 
734 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
735 				      bool need_map, struct io_comp_batch *iob);
736 
737 static dev_t ublk_chr_devt;
738 static const struct class ublk_chr_class = {
739 	.name = "ublk-char",
740 };
741 
742 static DEFINE_IDR(ublk_index_idr);
743 static DEFINE_SPINLOCK(ublk_idr_lock);
744 static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
745 
746 static DEFINE_MUTEX(ublk_ctl_mutex);
747 
748 static struct ublk_batch_fetch_cmd *
749 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
750 {
751 	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
752 
753 	if (fcmd) {
754 		fcmd->cmd = cmd;
755 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
756 	}
757 	return fcmd;
758 }
759 
760 static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
761 {
762 	kfree(fcmd);
763 }
764 
765 static void __ublk_release_fcmd(struct ublk_queue *ubq)
766 {
767 	WRITE_ONCE(ubq->active_fcmd, NULL);
768 }
769 
770 /*
771  * Nothing can move on, so clear ->active_fcmd, and the caller should stop
772  * dispatching
773  */
774 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
775 					const struct ublk_batch_io_data *data,
776 					struct ublk_batch_fetch_cmd *fcmd,
777 					int res)
778 {
779 	spin_lock(&ubq->evts_lock);
780 	list_del_init(&fcmd->node);
781 	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
782 	__ublk_release_fcmd(ubq);
783 	spin_unlock(&ubq->evts_lock);
784 
785 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
786 	ublk_batch_free_fcmd(fcmd);
787 }
788 
789 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
790 				     struct io_br_sel *sel,
791 				     unsigned int issue_flags)
792 {
793 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
794 		return -ENOBUFS;
795 	return 0;
796 }
797 
798 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
799 				       void __user *buf, const u16 *tag_buf,
800 				       unsigned int len)
801 {
802 	if (copy_to_user(buf, tag_buf, len))
803 		return -EFAULT;
804 	return len;
805 }
806 
807 #define UBLK_MAX_UBLKS UBLK_MINORS
808 
809 /*
810  * Max number of unprivileged ublk devices allowed to be added
811  *
812  * It can be extended to a per-user limit in the future, or even controlled
813  * by cgroup.
814  */
815 static unsigned int unprivileged_ublks_max = 64;
816 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
817 
818 static struct miscdevice ublk_misc;
819 
820 static inline unsigned ublk_pos_to_hwq(loff_t pos)
821 {
822 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
823 		UBLK_QID_BITS_MASK;
824 }
825 
826 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
827 {
828 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
829 }
830 
831 static inline unsigned ublk_pos_to_tag(loff_t pos)
832 {
833 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
834 		UBLK_TAG_BITS_MASK;
835 }
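These three helpers decode the pread()/pwrite() offset that a ublk server builds
when using user copy. As a rough illustrative sketch (not part of the driver;
compare the ublk_pos() helper in the UAPI header, if present in this revision),
the forward encoding is:

/* Illustrative only: compose the char-device offset for (q_id, tag, offset) */
static inline loff_t ublk_example_user_copy_pos(u16 q_id, u16 tag, u32 offset)
{
	return UBLKSRV_IO_BUF_OFFSET +
		(((u64)q_id << UBLK_QID_OFF) |
		 ((u64)tag << UBLK_TAG_OFF) | offset);
}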
836 
837 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
838 {
839 	const struct ublk_param_basic *p = &ub->params.basic;
840 
841 	if (p->attrs & UBLK_ATTR_READ_ONLY)
842 		set_disk_ro(ub->ub_disk, true);
843 
844 	set_capacity(ub->ub_disk, p->dev_sectors);
845 }
846 
847 static int ublk_integrity_flags(u32 flags)
848 {
849 	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
850 
851 	if (flags & LBMD_PI_CAP_INTEGRITY) {
852 		flags &= ~LBMD_PI_CAP_INTEGRITY;
853 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
854 	}
855 	if (flags & LBMD_PI_CAP_REFTAG) {
856 		flags &= ~LBMD_PI_CAP_REFTAG;
857 		ret_flags |= BLK_INTEGRITY_REF_TAG;
858 	}
859 	return flags ? -EINVAL : ret_flags;
860 }
861 
862 static int ublk_integrity_pi_tuple_size(u8 csum_type)
863 {
864 	switch (csum_type) {
865 	case LBMD_PI_CSUM_NONE:
866 		return 0;
867 	case LBMD_PI_CSUM_IP:
868 	case LBMD_PI_CSUM_CRC16_T10DIF:
869 		return 8;
870 	case LBMD_PI_CSUM_CRC64_NVME:
871 		return 16;
872 	default:
873 		return -EINVAL;
874 	}
875 }
876 
877 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
878 {
879 	switch (csum_type) {
880 	case LBMD_PI_CSUM_NONE:
881 		return BLK_INTEGRITY_CSUM_NONE;
882 	case LBMD_PI_CSUM_IP:
883 		return BLK_INTEGRITY_CSUM_IP;
884 	case LBMD_PI_CSUM_CRC16_T10DIF:
885 		return BLK_INTEGRITY_CSUM_CRC;
886 	case LBMD_PI_CSUM_CRC64_NVME:
887 		return BLK_INTEGRITY_CSUM_CRC64;
888 	default:
889 		WARN_ON_ONCE(1);
890 		return BLK_INTEGRITY_CSUM_NONE;
891 	}
892 }
893 
894 static int ublk_validate_params(const struct ublk_device *ub)
895 {
896 	/* basic param is the only one which must be set */
897 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
898 		const struct ublk_param_basic *p = &ub->params.basic;
899 
900 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
901 			return -EINVAL;
902 
903 		/*
904 		 * 256M is a reasonable upper bound for physical block size,
905 		 * io_min and io_opt; it aligns with the maximum physical
906 		 * block size possible in NVMe.
907 		 */
908 		if (p->physical_bs_shift > ilog2(SZ_256M))
909 			return -EINVAL;
910 
911 		if (p->io_min_shift > ilog2(SZ_256M))
912 			return -EINVAL;
913 
914 		if (p->io_opt_shift > ilog2(SZ_256M))
915 			return -EINVAL;
916 
917 		if (p->logical_bs_shift > p->physical_bs_shift)
918 			return -EINVAL;
919 
920 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
921 			return -EINVAL;
922 
923 		if (p->max_sectors < PAGE_SECTORS)
924 			return -EINVAL;
925 
926 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
927 			return -EINVAL;
928 	} else
929 		return -EINVAL;
930 
931 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
932 		const struct ublk_param_discard *p = &ub->params.discard;
933 
934 		/* So far, only support single segment discard */
935 		if (p->max_discard_sectors && p->max_discard_segments != 1)
936 			return -EINVAL;
937 
938 		if (!p->discard_granularity)
939 			return -EINVAL;
940 	}
941 
942 	/* dev_t is read-only */
943 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
944 		return -EINVAL;
945 
946 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
947 		return ublk_dev_param_zoned_validate(ub);
948 	else if (ublk_dev_is_zoned(ub))
949 		return -EINVAL;
950 
951 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
952 		const struct ublk_param_dma_align *p = &ub->params.dma;
953 
954 		if (p->alignment >= PAGE_SIZE)
955 			return -EINVAL;
956 
957 		if (!is_power_of_2(p->alignment + 1))
958 			return -EINVAL;
959 	}
960 
961 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
962 		const struct ublk_param_segment *p = &ub->params.seg;
963 
964 		if (!is_power_of_2(p->seg_boundary_mask + 1))
965 			return -EINVAL;
966 
967 		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
968 			return -EINVAL;
969 		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
970 			return -EINVAL;
971 	}
972 
973 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
974 		const struct ublk_param_integrity *p = &ub->params.integrity;
975 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
976 		int flags = ublk_integrity_flags(p->flags);
977 
978 		if (!ublk_dev_support_integrity(ub))
979 			return -EINVAL;
980 		if (flags < 0)
981 			return flags;
982 		if (pi_tuple_size < 0)
983 			return pi_tuple_size;
984 		if (!p->metadata_size)
985 			return -EINVAL;
986 		if (p->csum_type == LBMD_PI_CSUM_NONE &&
987 		    p->flags & LBMD_PI_CAP_REFTAG)
988 			return -EINVAL;
989 		if (p->pi_offset + pi_tuple_size > p->metadata_size)
990 			return -EINVAL;
991 		if (p->interval_exp < SECTOR_SHIFT ||
992 		    p->interval_exp > ub->params.basic.logical_bs_shift)
993 			return -EINVAL;
994 	}
995 
996 	return 0;
997 }
998 
999 static void ublk_apply_params(struct ublk_device *ub)
1000 {
1001 	ublk_dev_param_basic_apply(ub);
1002 
1003 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
1004 		ublk_dev_param_zoned_apply(ub);
1005 }
1006 
1007 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
1008 {
1009 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
1010 		!ublk_support_auto_buf_reg(ubq);
1011 }
1012 
1013 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
1014 {
1015 	return !ublk_dev_support_user_copy(ub) &&
1016 	       !ublk_dev_support_zero_copy(ub) &&
1017 	       !ublk_dev_support_auto_buf_reg(ub);
1018 }
1019 
1020 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
1021 {
1022 	/*
1023 	 * read()/write() is involved in user copy, so request reference
1024 	 * has to be grabbed
1025 	 *
1026 	 * for zero copy, request buffer need to be registered to io_uring
1027 	 * buffer table, so reference is needed
1028 	 *
1029 	 * For auto buffer register, ublk server still may issue
1030 	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
1031 	 * so reference is required too.
1032 	 */
1033 	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
1034 		ublk_support_auto_buf_reg(ubq);
1035 }
1036 
1037 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
1038 {
1039 	return ublk_dev_support_user_copy(ub) ||
1040 	       ublk_dev_support_zero_copy(ub) ||
1041 	       ublk_dev_support_auto_buf_reg(ub);
1042 }
1043 
1044 /*
1045  * ublk IO Reference Counting Design
1046  * ==================================
1047  *
1048  * For user-copy and zero-copy modes, ublk uses a split reference model with
1049  * two counters that together track IO lifetime:
1050  *
1051  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
1052  *   - io->task_registered_buffers: count of buffers registered on the IO task
1053  *
1054  * Key Invariant:
1055  * --------------
1056  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1057  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1058  * when no active references exist. After IO completion, both counters become
1059  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1060  * task_registered_buffers are 0.
1061  *
1062  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1063  * exit to determine if all references have been released.
1064  *
1065  * Why Split Counters:
1066  * -------------------
1067  * Buffers registered on the IO daemon task can use the lightweight
1068  * task_registered_buffers counter (simple increment/decrement) instead of
1069  * atomic refcount operations. The ublk_io_release() callback checks if
1070  * current == io->task to decide which counter to update.
1071  *
1072  * This optimization only applies before IO completion. At completion,
1073  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1074  * After that, all subsequent buffer unregistrations must use the atomic ref
1075  * since they may be releasing the last reference.
1076  *
1077  * Reference Lifecycle:
1078  * --------------------
1079  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1080  *
1081  * 2. During IO processing:
1082  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1083  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1084  *    - Buffer unregister callback (ublk_io_release):
1085  *      * If on-task: task_registered_buffers--
1086  *      * If off-task: ref-- via ublk_put_req_ref()
1087  *
1088  * 3. ublk_sub_req_ref() at IO completion:
1089  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1090  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1091  *    - This effectively collapses task_registered_buffers into the atomic ref,
1092  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1093  *      buffers that were already counted
1094  *
1095  * Example (zero-copy, register on-task, unregister off-task):
1096  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1097  *   - Register buffer on-task: task_registered_buffers = 1
1098  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1099  *   - Completion via ublk_sub_req_ref():
1100  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1101  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1102  *
1103  * Example (auto buffer registration):
1104  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1105  *
1106  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1107  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1108  *   - Completion via ublk_sub_req_ref():
1109  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1110  *
1111  * Example (zero-copy, ublk server killed):
1112  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1113  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1114  *
1115  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1116  *   - Register buffer on-task: task_registered_buffers = 1
1117  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1118  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1119  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1120  *   - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by
1121  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1122  *     and abort pending requests
1123  *
1124  * Batch IO Special Case:
1125  * ----------------------
1126  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1127  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1128  * task_registered_buffers counter still tracks registered buffers for the
1129  * invariant check, even though the callback doesn't decrement it.
1130  *
1131  * Note: updating task_registered_buffers is protected by io->lock.
1132  */
1133 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1134 		struct ublk_io *io)
1135 {
1136 	if (ublk_need_req_ref(ubq))
1137 		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1138 }
1139 
1140 static inline bool ublk_get_req_ref(struct ublk_io *io)
1141 {
1142 	return refcount_inc_not_zero(&io->ref);
1143 }
1144 
1145 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1146 {
1147 	if (!refcount_dec_and_test(&io->ref))
1148 		return;
1149 
1150 	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1151 	__ublk_complete_rq(req, io, false, NULL);
1152 }
1153 
1154 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1155 {
1156 	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1157 
1158 	io->task_registered_buffers = 0;
1159 	return refcount_sub_and_test(sub_refs, &io->ref);
1160 }
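A minimal illustrative sketch of the daemon-exit invariant from the design
comment above (not part of the driver; the helper name is hypothetical): an IO
owned by the server has dropped all active references once the two counters sum
back to UBLK_REFCOUNT_INIT, checked under io->lock:

/* Hypothetical helper: have all active references been released? */
static inline bool ublk_example_refs_quiesced(struct ublk_io *io)
{
	lockdep_assert_held(&io->lock);
	return refcount_read(&io->ref) + io->task_registered_buffers ==
		UBLK_REFCOUNT_INIT;
}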
1161 
1162 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1163 {
1164 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1165 }
1166 
1167 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1168 {
1169 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1170 }
1171 
1172 /* Called in slow path only, keep it noinline for trace purpose */
1173 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1174 {
1175 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1176 		return ub;
1177 	return NULL;
1178 }
1179 
1180 /* Called in slow path only, keep it noinline for trace purpose */
1181 static noinline void ublk_put_device(struct ublk_device *ub)
1182 {
1183 	put_device(&ub->cdev_dev);
1184 }
1185 
1186 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1187 		int qid)
1188 {
1189 	return dev->queues[qid];
1190 }
1191 
1192 static inline bool ublk_rq_has_data(const struct request *rq)
1193 {
1194 	return bio_has_data(rq->bio);
1195 }
1196 
1197 static inline struct ublksrv_io_desc *
1198 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1199 {
1200 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
1201 }
1202 
1203 static inline int __ublk_queue_cmd_buf_size(int depth)
1204 {
1205 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1206 }
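For a feel of the numbers (illustrative, assuming the common 24-byte
struct ublksrv_io_desc layout and 4 KiB pages): a queue depth of 128 needs
128 * 24 = 3072 bytes of descriptors, rounded up to a single 4096-byte page,
while ublk_max_cmd_buf_size() below applies the same formula to
UBLK_MAX_QUEUE_DEPTH to bound the mmap()'d descriptor area.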
1207 
1208 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1209 {
1210 	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1211 }
1212 
1213 static int ublk_max_cmd_buf_size(void)
1214 {
1215 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1216 }
1217 
1218 /*
1219  * Should I/O outstanding to the ublk server be reissued when the server
1220  * exits? If not, outstanding I/O will get errors.
1221  */
1222 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1223 {
1224 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1225 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1226 }
1227 
1228 /*
1229  * Should I/O issued while there is no ublk server be queued? If not, I/O
1230  * issued while there is no ublk server will get errors.
1231  */
1232 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1233 {
1234 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1235 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1236 }
1237 
1238 /*
1239  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1240  * of the device flags for smaller cache footprint - better for fast
1241  * paths.
1242  */
1243 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1244 {
1245 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1246 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1247 }
1248 
1249 /*
1250  * Should ublk devices be stopped (i.e. no recovery possible) when the
1251  * ublk server exits? If not, devices can be used again by a future
1252  * incarnation of a ublk server via the start_recovery/end_recovery
1253  * commands.
1254  */
1255 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1256 {
1257 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1258 }
1259 
1260 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1261 {
1262 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1263 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1264 }
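Taken together, the helpers above imply roughly the following policy when the
ublk server goes away. This is an illustrative summary sketch only (not part of
the driver; the helper name is hypothetical):

/* Hypothetical helper: summarize the nosrv policy selected by the recovery flags */
static inline const char *ublk_example_nosrv_policy(struct ublk_device *ub)
{
	if (ublk_nosrv_should_stop_dev(ub))
		return "no recovery: stop the device";
	if (!ublk_nosrv_dev_should_queue_io(ub))
		return "recovery with FAIL_IO: error new IO until recovery";
	if (ublk_nosrv_should_reissue_outstanding(ub))
		return "recovery with REISSUE: queue new IO, reissue outstanding IO";
	return "recovery: queue new IO, outstanding IO gets errors";
}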
1265 
1266 static void ublk_free_disk(struct gendisk *disk)
1267 {
1268 	struct ublk_device *ub = disk->private_data;
1269 
1270 	clear_bit(UB_STATE_USED, &ub->state);
1271 	ublk_put_device(ub);
1272 }
1273 
1274 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1275 		unsigned int *owner_gid)
1276 {
1277 	kuid_t uid;
1278 	kgid_t gid;
1279 
1280 	current_uid_gid(&uid, &gid);
1281 
1282 	*owner_uid = from_kuid(&init_user_ns, uid);
1283 	*owner_gid = from_kgid(&init_user_ns, gid);
1284 }
1285 
1286 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1287 {
1288 	struct ublk_device *ub = disk->private_data;
1289 
1290 	if (capable(CAP_SYS_ADMIN))
1291 		return 0;
1292 
1293 	/*
1294 	 * If it is an unprivileged device, only the owner can open
1295 	 * the disk. Otherwise it could be a trap set up by a malicious
1296 	 * user who deliberately grants this disk's privileges to other
1297 	 * users.
1298 	 *
1299 	 * This is also reasonable given that anyone can create an
1300 	 * unprivileged device, without needing anyone else's grant.
1301 	 */
1302 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1303 		unsigned int curr_uid, curr_gid;
1304 
1305 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1306 
1307 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1308 				ub->dev_info.owner_gid)
1309 			return -EPERM;
1310 	}
1311 
1312 	if (ub->block_open)
1313 		return -ENXIO;
1314 
1315 	return 0;
1316 }
1317 
1318 static const struct block_device_operations ub_fops = {
1319 	.owner =	THIS_MODULE,
1320 	.open =		ublk_open,
1321 	.free_disk =	ublk_free_disk,
1322 	.report_zones =	ublk_report_zones,
1323 };
1324 
1325 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1326 				struct iov_iter *uiter, int dir, size_t *done)
1327 {
1328 	unsigned len;
1329 	void *bv_buf;
1330 	size_t copied;
1331 
1332 	if (*offset >= bv->bv_len) {
1333 		*offset -= bv->bv_len;
1334 		return true;
1335 	}
1336 
1337 	len = bv->bv_len - *offset;
1338 	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1339 	/*
1340 	 * Bio pages may originate from slab caches without a usercopy region
1341 	 * (e.g. jbd2 frozen metadata buffers).  This is the same data that
1342 	 * the loop driver writes to its backing file — no exposure risk.
1343 	 * The bvec length is always trusted, so the size check in
1344 	 * check_copy_size() is not needed either.  Use the unchecked
1345 	 * helpers to avoid false positives on slab pages.
1346 	 */
1347 	if (dir == ITER_DEST)
1348 		copied = _copy_to_iter(bv_buf, len, uiter);
1349 	else
1350 		copied = _copy_from_iter(bv_buf, len, uiter);
1351 
1352 	kunmap_local(bv_buf);
1353 
1354 	*done += copied;
1355 	if (copied < len)
1356 		return false;
1357 
1358 	*offset = 0;
1359 	return true;
1360 }
1361 
1362 /*
1363  * Copy data between request pages and the iov_iter; 'offset' is the
1364  * starting linear offset within the request.
1365  */
1366 static size_t ublk_copy_user_pages(const struct request *req,
1367 		unsigned offset, struct iov_iter *uiter, int dir)
1368 {
1369 	struct req_iterator iter;
1370 	struct bio_vec bv;
1371 	size_t done = 0;
1372 
1373 	rq_for_each_segment(bv, req, iter) {
1374 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1375 			break;
1376 	}
1377 	return done;
1378 }
1379 
1380 #ifdef CONFIG_BLK_DEV_INTEGRITY
1381 static size_t ublk_copy_user_integrity(const struct request *req,
1382 		unsigned offset, struct iov_iter *uiter, int dir)
1383 {
1384 	size_t done = 0;
1385 	struct bio *bio = req->bio;
1386 	struct bvec_iter iter;
1387 	struct bio_vec iv;
1388 
1389 	if (!blk_integrity_rq(req))
1390 		return 0;
1391 
1392 	bio_for_each_integrity_vec(iv, bio, iter) {
1393 		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1394 			break;
1395 	}
1396 
1397 	return done;
1398 }
1399 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1400 static size_t ublk_copy_user_integrity(const struct request *req,
1401 		unsigned offset, struct iov_iter *uiter, int dir)
1402 {
1403 	return 0;
1404 }
1405 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1406 
1407 static inline bool ublk_need_map_req(const struct request *req)
1408 {
1409 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1410 }
1411 
1412 static inline bool ublk_need_unmap_req(const struct request *req)
1413 {
1414 	return ublk_rq_has_data(req) &&
1415 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1416 }
1417 
1418 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1419 				const struct request *req,
1420 				const struct ublk_io *io)
1421 {
1422 	const unsigned int rq_bytes = blk_rq_bytes(req);
1423 
1424 	if (!ublk_need_map_io(ubq))
1425 		return rq_bytes;
1426 
1427 	/*
1428 	 * Without zero copy, we delay copying WRITE request data into the
1429 	 * ublksrv context; the big benefit is that pinning pages in the
1430 	 * current context is pretty fast, see ublk_pin_user_pages
1431 	 */
1432 	if (ublk_need_map_req(req)) {
1433 		struct iov_iter iter;
1434 		const int dir = ITER_DEST;
1435 
1436 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1437 		return ublk_copy_user_pages(req, 0, &iter, dir);
1438 	}
1439 	return rq_bytes;
1440 }
1441 
1442 static unsigned int ublk_unmap_io(bool need_map,
1443 		const struct request *req,
1444 		const struct ublk_io *io)
1445 {
1446 	const unsigned int rq_bytes = blk_rq_bytes(req);
1447 
1448 	if (!need_map)
1449 		return rq_bytes;
1450 
1451 	if (ublk_need_unmap_req(req)) {
1452 		struct iov_iter iter;
1453 		const int dir = ITER_SOURCE;
1454 
1455 		WARN_ON_ONCE(io->res > rq_bytes);
1456 
1457 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1458 		return ublk_copy_user_pages(req, 0, &iter, dir);
1459 	}
1460 	return rq_bytes;
1461 }
1462 
1463 static inline unsigned int ublk_req_build_flags(struct request *req)
1464 {
1465 	unsigned flags = 0;
1466 
1467 	if (req->cmd_flags & REQ_FAILFAST_DEV)
1468 		flags |= UBLK_IO_F_FAILFAST_DEV;
1469 
1470 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1471 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1472 
1473 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1474 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
1475 
1476 	if (req->cmd_flags & REQ_META)
1477 		flags |= UBLK_IO_F_META;
1478 
1479 	if (req->cmd_flags & REQ_FUA)
1480 		flags |= UBLK_IO_F_FUA;
1481 
1482 	if (req->cmd_flags & REQ_NOUNMAP)
1483 		flags |= UBLK_IO_F_NOUNMAP;
1484 
1485 	if (req->cmd_flags & REQ_SWAP)
1486 		flags |= UBLK_IO_F_SWAP;
1487 
1488 	if (blk_integrity_rq(req))
1489 		flags |= UBLK_IO_F_INTEGRITY;
1490 
1491 	return flags;
1492 }
1493 
1494 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1495 {
1496 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1497 	struct ublk_io *io = &ubq->ios[req->tag];
1498 	u32 ublk_op;
1499 
1500 	switch (req_op(req)) {
1501 	case REQ_OP_READ:
1502 		ublk_op = UBLK_IO_OP_READ;
1503 		break;
1504 	case REQ_OP_WRITE:
1505 		ublk_op = UBLK_IO_OP_WRITE;
1506 		break;
1507 	case REQ_OP_FLUSH:
1508 		ublk_op = UBLK_IO_OP_FLUSH;
1509 		break;
1510 	case REQ_OP_DISCARD:
1511 		ublk_op = UBLK_IO_OP_DISCARD;
1512 		break;
1513 	case REQ_OP_WRITE_ZEROES:
1514 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1515 		break;
1516 	default:
1517 		if (ublk_queue_is_zoned(ubq))
1518 			return ublk_setup_iod_zoned(ubq, req);
1519 		return BLK_STS_IOERR;
1520 	}
1521 
1522 	/* need to translate since kernel may change */
1523 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
1524 	iod->nr_sectors = blk_rq_sectors(req);
1525 	iod->start_sector = blk_rq_pos(req);
1526 
1527 	/* Try shmem zero-copy match before setting addr */
1528 	if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
1529 		u32 buf_idx, buf_off;
1530 
1531 		if (ublk_try_buf_match(ubq->dev, req,
1532 					  &buf_idx, &buf_off)) {
1533 			iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
1534 			iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
1535 			return BLK_STS_OK;
1536 		}
1537 	}
1538 
1539 	iod->addr = io->buf.addr;
1540 
1541 	return BLK_STS_OK;
1542 }
1543 
1544 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1545 		struct io_uring_cmd *ioucmd)
1546 {
1547 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1548 }
1549 
1550 static void ublk_end_request(struct request *req, blk_status_t error)
1551 {
1552 	local_bh_disable();
1553 	blk_mq_end_request(req, error);
1554 	local_bh_enable();
1555 }
1556 
1557 /* todo: handle partial completion */
1558 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1559 				      bool need_map, struct io_comp_batch *iob)
1560 {
1561 	unsigned int unmapped_bytes;
1562 	blk_status_t res = BLK_STS_OK;
1563 	bool requeue;
1564 
1565 	/* failed read IO if nothing is read */
1566 	if (!io->res && req_op(req) == REQ_OP_READ)
1567 		io->res = -EIO;
1568 
1569 	if (io->res < 0) {
1570 		res = errno_to_blk_status(io->res);
1571 		goto exit;
1572 	}
1573 
1574 	/*
1575 	 * FLUSH, DISCARD or WRITE_ZEROES usually don't return a byte count,
1576 	 * so end them directly.
1577 	 *
1578 	 * None of them need unmapping.
1579 	 */
1580 	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1581 	    req_op(req) != REQ_OP_DRV_IN)
1582 		goto exit;
1583 
1584 	/* shmem zero copy: no data to unmap, pages already shared */
1585 	if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
1586 		goto exit;
1587 
1588 	/* for READ request, writing data in iod->addr to rq buffers */
1589 	unmapped_bytes = ublk_unmap_io(need_map, req, io);
1590 
1591 	/*
1592 	 * Extremely unlikely, since the data was filled in just before
1593 	 *
1594 	 * Simply clamp io->res for this unlikely case.
1595 	 */
1596 	if (unlikely(unmapped_bytes < io->res))
1597 		io->res = unmapped_bytes;
1598 
1599 	/*
1600 	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1601 	 * happens off this path, then that will prevent ublk's blkdev_release()
1602 	 * from being called on current's task work, see fput() implementation.
1603 	 *
1604 	 * Otherwise, ublk server may not provide forward progress in case of
1605 	 * reading the partition table from bdev_open() with disk->open_mutex
1606 	 * held, and causes dead lock as we could already be holding
1607 	 * disk->open_mutex here.
1608 	 *
1609 	 * Preferably we would not be doing IO with a mutex held that is also
1610 	 * used for release, but this work-around will suffice for now.
1611 	 */
1612 	local_bh_disable();
1613 	requeue = blk_update_request(req, BLK_STS_OK, io->res);
1614 	local_bh_enable();
1615 	if (requeue)
1616 		blk_mq_requeue_request(req, true);
1617 	else if (likely(!blk_should_fake_timeout(req->q))) {
1618 		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1619 			return;
1620 		__blk_mq_end_request(req, BLK_STS_OK);
1621 	}
1622 
1623 	return;
1624 exit:
1625 	ublk_end_request(req, res);
1626 }
1627 
1628 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1629 						     struct request *req)
1630 {
1631 	/* read cmd first because req will overwrite it */
1632 	struct io_uring_cmd *cmd = io->cmd;
1633 
1634 	/* mark this cmd owned by ublksrv */
1635 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1636 
1637 	/*
1638 	 * clear ACTIVE since we are done with this sqe/cmd slot
1639 	 * We can only accept a new io cmd when it is not active.
1640 	 */
1641 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1642 
1643 	io->req = req;
1644 	return cmd;
1645 }
1646 
1647 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1648 				 int res, unsigned issue_flags)
1649 {
1650 	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1651 
1652 	/* tell ublksrv one io request is coming */
1653 	io_uring_cmd_done(cmd, res, issue_flags);
1654 }
1655 
1656 #define UBLK_REQUEUE_DELAY_MS	3
1657 
1658 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1659 		struct request *rq)
1660 {
1661 	/* We cannot process this rq, so requeue it or fail it. */
1662 	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1663 		blk_mq_requeue_request(rq, false);
1664 	else
1665 		ublk_end_request(rq, BLK_STS_IOERR);
1666 }
1667 
1668 static void
1669 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1670 {
1671 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1672 
1673 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1674 }
1675 
1676 enum auto_buf_reg_res {
1677 	AUTO_BUF_REG_FAIL,
1678 	AUTO_BUF_REG_FALLBACK,
1679 	AUTO_BUF_REG_OK,
1680 };
1681 
1682 /*
1683  * Setup io state after auto buffer registration.
1684  *
1685  * Must be called after ublk_auto_buf_register() is done.
1686  * Caller must hold io->lock in batch context.
1687  */
1688 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1689 				   struct request *req, struct ublk_io *io,
1690 				   struct io_uring_cmd *cmd,
1691 				   enum auto_buf_reg_res res)
1692 {
1693 	if (res == AUTO_BUF_REG_OK) {
1694 		io->task_registered_buffers = 1;
1695 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1696 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1697 	}
1698 	ublk_init_req_ref(ubq, io);
1699 	__ublk_prep_compl_io_cmd(io, req);
1700 }
1701 
1702 /* Register request bvec to io_uring for auto buffer registration. */
1703 static enum auto_buf_reg_res
1704 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1705 		       struct ublk_io *io, struct io_uring_cmd *cmd,
1706 		       unsigned int issue_flags)
1707 {
1708 	int ret;
1709 
1710 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1711 				      io->buf.auto_reg.index, issue_flags);
1712 	if (ret) {
1713 		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1714 			ublk_auto_buf_reg_fallback(ubq, req->tag);
1715 			return AUTO_BUF_REG_FALLBACK;
1716 		}
1717 		ublk_end_request(req, BLK_STS_IOERR);
1718 		return AUTO_BUF_REG_FAIL;
1719 	}
1720 
1721 	return AUTO_BUF_REG_OK;
1722 }
1723 
1724 /*
1725  * Dispatch IO to userspace with auto buffer registration.
1726  *
1727  * Only called in non-batch context from task work, io->lock not held.
1728  */
1729 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1730 				   struct request *req, struct ublk_io *io,
1731 				   struct io_uring_cmd *cmd,
1732 				   unsigned int issue_flags)
1733 {
1734 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1735 			issue_flags);
1736 
1737 	if (res != AUTO_BUF_REG_FAIL) {
1738 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1739 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1740 	}
1741 }
1742 
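/*
 * Map request data for the daemon if this transfer needs it (see
 * ublk_map_io()).  Shmem zero-copy requests skip the copy entirely.
 * Returns false when nothing could be mapped and the request has been
 * requeued; on a partial mapping the iod is shrunk to the mapped size so
 * the daemon only sees valid bytes.
 */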
1743 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1744 			  struct ublk_io *io)
1745 {
1746 	unsigned mapped_bytes;
1747 
1748 	/* shmem zero copy: skip data copy, pages already shared */
1749 	if (ublk_iod_is_shmem_zc(ubq, req->tag))
1750 		return true;
1751 
1752 	mapped_bytes = ublk_map_io(ubq, req, io);
1753 
1754 	/* partially mapped, update io descriptor */
1755 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1756 		/*
1757 		 * Nothing mapped, retry until we succeed.
1758 		 *
1759 		 * We may never succeed in mapping any bytes here because
1760 		 * of OOM. TODO: reserve one buffer with single page pinned
1761 		 * for providing forward progress guarantee.
1762 		 */
1763 		if (unlikely(!mapped_bytes)) {
1764 			blk_mq_requeue_request(req, false);
1765 			blk_mq_delay_kick_requeue_list(req->q,
1766 					UBLK_REQUEUE_DELAY_MS);
1767 			return false;
1768 		}
1769 
1770 		ublk_get_iod(ubq, req->tag)->nr_sectors =
1771 			mapped_bytes >> 9;
1772 	}
1773 
1774 	return true;
1775 }
1776 
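/*
 * Per-request dispatch path (non-batch mode), run from task work on the io
 * daemon task: abort if the daemon is exiting, optionally ask the server for
 * a data buffer via UBLK_IO_RES_NEED_GET_DATA, map the data, then complete
 * the fetched uring_cmd so the server starts handling the request (with auto
 * buffer registration when enabled).
 */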
1777 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1778 {
1779 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1780 	int tag = req->tag;
1781 	struct ublk_io *io = &ubq->ios[tag];
1782 
1783 	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1784 			__func__, ubq->q_id, req->tag, io->flags,
1785 			ublk_get_iod(ubq, req->tag)->addr);
1786 
1787 	/*
1788 	 * Task is exiting if either:
1789 	 *
1790 	 * (1) current != io->task.
1791 	 * io_uring_cmd_complete_in_task() tries to run task_work
1792 	 * in a workqueue if cmd's task is PF_EXITING.
1793 	 *
1794 	 * (2) current->flags & PF_EXITING.
1795 	 */
1796 	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1797 		__ublk_abort_rq(ubq, req);
1798 		return;
1799 	}
1800 
1801 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1802 		/*
1803 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1804 		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1805 		 * and notify it.
1806 		 */
1807 		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1808 		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1809 				__func__, ubq->q_id, req->tag, io->flags);
1810 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1811 				     issue_flags);
1812 		return;
1813 	}
1814 
1815 	if (!ublk_start_io(ubq, req, io))
1816 		return;
1817 
1818 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1819 		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1820 	} else {
1821 		ublk_init_req_ref(ubq, io);
1822 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1823 	}
1824 }
1825 
1826 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1827 				       const struct ublk_batch_io_data *data,
1828 				       unsigned short tag)
1829 {
1830 	struct ublk_device *ub = data->ub;
1831 	struct ublk_io *io = &ubq->ios[tag];
1832 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1833 	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1834 	struct io_uring_cmd *cmd = data->cmd;
1835 
1836 	if (!ublk_start_io(ubq, req, io))
1837 		return false;
1838 
1839 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1840 		res = ublk_auto_buf_register(ubq, req, io, cmd,
1841 				data->issue_flags);
1842 
1843 		if (res == AUTO_BUF_REG_FAIL)
1844 			return false;
1845 	}
1846 
1847 	ublk_io_lock(io);
1848 	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1849 	ublk_io_unlock(io);
1850 
1851 	return true;
1852 }
1853 
1854 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1855 				     const struct ublk_batch_io_data *data,
1856 				     unsigned short *tag_buf,
1857 				     unsigned int len)
1858 {
1859 	bool has_unused = false;
1860 	unsigned int i;
1861 
1862 	for (i = 0; i < len; i++) {
1863 		unsigned short tag = tag_buf[i];
1864 
1865 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1866 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1867 			has_unused = true;
1868 		}
1869 	}
1870 
1871 	return has_unused;
1872 }
1873 
1874 /*
1875  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1876  * Returns the new length after filtering.
1877  */
1878 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1879 					    unsigned int len)
1880 {
1881 	unsigned int i, j;
1882 
1883 	for (i = 0, j = 0; i < len; i++) {
1884 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1885 			if (i != j)
1886 				tag_buf[j] = tag_buf[i];
1887 			j++;
1888 		}
1889 	}
1890 
1891 	return j;
1892 }
1893 
1894 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1895 		const struct ublk_batch_io_data *data,
1896 		unsigned short *tag_buf, size_t len, int ret)
1897 {
1898 	int i, res;
1899 
1900 	/*
1901 	 * Undo prep state for all IOs since userspace never received them.
1902 	 * This restores IOs to pre-prepared state so they can be cleanly
1903 	 * re-prepared when tags are pulled from FIFO again.
1904 	 */
1905 	for (i = 0; i < len; i++) {
1906 		struct ublk_io *io = &ubq->ios[tag_buf[i]];
1907 		int index = -1;
1908 
1909 		ublk_io_lock(io);
1910 		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1911 			index = io->buf.auto_reg.index;
1912 		io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1913 		io->flags |= UBLK_IO_FLAG_ACTIVE;
1914 		ublk_io_unlock(io);
1915 
1916 		if (index != -1)
1917 			io_buffer_unregister_bvec(data->cmd, index,
1918 					data->issue_flags);
1919 	}
1920 
1921 	res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1922 		tag_buf, len, &ubq->evts_lock);
1923 
1924 	pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1925 			"tags(%d %zu) ret %d\n", __func__, res, len,
1926 			ret);
1927 }
1928 
1929 #define MAX_NR_TAG 128
1930 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1931 				 const struct ublk_batch_io_data *data,
1932 				 struct ublk_batch_fetch_cmd *fcmd)
1933 {
1934 	const unsigned int tag_sz = sizeof(unsigned short);
1935 	unsigned short tag_buf[MAX_NR_TAG];
1936 	struct io_br_sel sel;
1937 	size_t len = 0;
1938 	bool needs_filter;
1939 	int ret;
1940 
1941 	WARN_ON_ONCE(data->cmd != fcmd->cmd);
1942 
1943 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1944 					 data->issue_flags);
1945 	if (sel.val < 0)
1946 		return sel.val;
1947 	if (!sel.addr)
1948 		return -ENOBUFS;
1949 
1950 	/* single reader needn't lock and sizeof(kfifo element) is 2 bytes */
1951 	len = min(len, sizeof(tag_buf)) / tag_sz;
1952 	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1953 
1954 	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1955 	/* Filter out unused tags before posting to userspace */
1956 	if (unlikely(needs_filter)) {
1957 		int new_len = ublk_filter_unused_tags(tag_buf, len);
1958 
1959 		/* return the consumed length even if all tags failed or were requeued */
1960 		if (!new_len) {
1961 			/* release the selected buffer */
1962 			sel.val = 0;
1963 			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1964 						&sel, data->issue_flags));
1965 			return len;
1966 		}
1967 		len = new_len;
1968 	}
1969 
1970 	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1971 	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1972 	if (unlikely(ret < 0))
1973 		ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1974 	return ret;
1975 }
1976 
1977 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1978 		struct ublk_queue *ubq)
1979 {
1980 	struct ublk_batch_fetch_cmd *fcmd;
1981 
1982 	lockdep_assert_held(&ubq->evts_lock);
1983 
1984 	/*
1985 	 * Order the update of ubq->evts_fifo against the check of ubq->active_fcmd.
1986 	 *
1987 	 * The pair is the smp_mb() in ublk_batch_dispatch().
1988 	 *
1989 	 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
1990 	 * will be visible in ublk_batch_dispatch() via the barrier pairing.
1991 	 */
1992 	smp_mb();
1993 	if (READ_ONCE(ubq->active_fcmd)) {
1994 		fcmd = NULL;
1995 	} else {
1996 		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1997 				struct ublk_batch_fetch_cmd, node);
1998 		WRITE_ONCE(ubq->active_fcmd, fcmd);
1999 	}
2000 	return fcmd;
2001 }
2002 
2003 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2004 {
2005 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
2006 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2007 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2008 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2009 	struct ublk_batch_io_data data = {
2010 		.ub = pdu->ubq->dev,
2011 		.cmd = fcmd->cmd,
2012 		.issue_flags = issue_flags,
2013 	};
2014 
2015 	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
2016 
2017 	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
2018 }
2019 
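/*
 * Drain the per-queue event kfifo with the currently active fetch command,
 * then release it.  The smp_mb() below pairs with the one in
 * __ublk_acquire_fcmd() so that either the producer observes an active fcmd
 * or this path observes the newly added tags; at most 32 rounds are handled
 * inline before deferring to task work again to avoid lockups.
 */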
2020 static void
2021 ublk_batch_dispatch(struct ublk_queue *ubq,
2022 		    const struct ublk_batch_io_data *data,
2023 		    struct ublk_batch_fetch_cmd *fcmd)
2024 {
2025 	struct ublk_batch_fetch_cmd *new_fcmd;
2026 	unsigned tried = 0;
2027 	int ret = 0;
2028 
2029 again:
2030 	while (!ublk_io_evts_empty(ubq)) {
2031 		ret = __ublk_batch_dispatch(ubq, data, fcmd);
2032 		if (ret <= 0)
2033 			break;
2034 	}
2035 
2036 	if (ret < 0) {
2037 		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
2038 		return;
2039 	}
2040 
2041 	__ublk_release_fcmd(ubq);
2042 	/*
2043 	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
2044 	 * checking ubq->evts_fifo.
2045 	 *
2046 	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
2047 	 */
2048 	smp_mb();
2049 	if (likely(ublk_io_evts_empty(ubq)))
2050 		return;
2051 
2052 	spin_lock(&ubq->evts_lock);
2053 	new_fcmd = __ublk_acquire_fcmd(ubq);
2054 	spin_unlock(&ubq->evts_lock);
2055 
2056 	if (!new_fcmd)
2057 		return;
2058 
2059 	/* Avoid lockup by allowing to handle at most 32 batches */
2060 	if (new_fcmd == fcmd && tried++ < 32)
2061 		goto again;
2062 
2063 	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
2064 }
2065 
2066 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2067 {
2068 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2069 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2070 	struct ublk_queue *ubq = pdu->ubq;
2071 
2072 	ublk_dispatch_req(ubq, pdu->req);
2073 }
2074 
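/*
 * Batch mode ->queue_rq() helper: push the request tag into the event kfifo
 * and, for the last request of a plug/batch, try to claim an idle fetch
 * command so the accumulated tags get dispatched from task work.
 */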
2075 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2076 {
2077 	unsigned short tag = rq->tag;
2078 	struct ublk_batch_fetch_cmd *fcmd = NULL;
2079 
2080 	spin_lock(&ubq->evts_lock);
2081 	kfifo_put(&ubq->evts_fifo, tag);
2082 	if (last)
2083 		fcmd = __ublk_acquire_fcmd(ubq);
2084 	spin_unlock(&ubq->evts_lock);
2085 
2086 	if (fcmd)
2087 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2088 }
2089 
2090 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2091 {
2092 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2093 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2094 
2095 	pdu->req = rq;
2096 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2097 }
2098 
2099 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2100 {
2101 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2102 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2103 	struct request *rq = pdu->req_list;
2104 	struct request *next;
2105 
2106 	do {
2107 		next = rq->rq_next;
2108 		rq->rq_next = NULL;
2109 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2110 		rq = next;
2111 	} while (rq);
2112 }
2113 
2114 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2115 {
2116 	struct io_uring_cmd *cmd = io->cmd;
2117 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2118 
2119 	pdu->req_list = rq_list_peek(l);
2120 	rq_list_init(l);
2121 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2122 }
2123 
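/*
 * A hanging unprivileged ublk server cannot be trusted to make forward
 * progress, so a request timeout kills it (SIGKILL to ublksrv_tgid) and lets
 * the normal teardown path reclaim the device; devices created without
 * UBLK_F_UNPRIVILEGED_DEV simply keep resetting the timer.
 */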
2124 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2125 {
2126 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2127 	pid_t tgid = ubq->dev->ublksrv_tgid;
2128 	struct task_struct *p;
2129 	struct pid *pid;
2130 
2131 	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2132 		return BLK_EH_RESET_TIMER;
2133 
2134 	if (unlikely(!tgid))
2135 		return BLK_EH_RESET_TIMER;
2136 
2137 	rcu_read_lock();
2138 	pid = find_vpid(tgid);
2139 	p = pid_task(pid, PIDTYPE_PID);
2140 	if (p)
2141 		send_sig(SIGKILL, p, 0);
2142 	rcu_read_unlock();
2143 	return BLK_EH_DONE;
2144 }
2145 
2146 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2147 				  bool check_cancel)
2148 {
2149 	blk_status_t res;
2150 
2151 	if (unlikely(READ_ONCE(ubq->fail_io)))
2152 		return BLK_STS_TARGET;
2153 
2154 	/* With recovery feature enabled, force_abort is set in
2155 	 * ublk_stop_dev() before calling del_gendisk(). We have to
2156 	 * abort all requeued and new rqs here to let del_gendisk()
2157 	 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
2158 	 * here, to avoid a UAF on the io_uring ctx.
2159 	 *
2160 	 * Note: force_abort is guaranteed to be seen because it is set
2161 	 * before the request queue is unquiesced.
2162 	 */
2163 	if (ublk_nosrv_should_queue_io(ubq) &&
2164 	    unlikely(READ_ONCE(ubq->force_abort)))
2165 		return BLK_STS_IOERR;
2166 
2167 	if (check_cancel && unlikely(ubq->canceling))
2168 		return BLK_STS_IOERR;
2169 
2170 	/* fill iod to slot in io cmd buffer */
2171 	res = ublk_setup_iod(ubq, rq);
2172 	if (unlikely(res != BLK_STS_OK))
2173 		return BLK_STS_IOERR;
2174 
2175 	blk_mq_start_request(rq);
2176 	return BLK_STS_OK;
2177 }
2178 
2179 /*
2180  * Common helper for queue_rq that handles request preparation and
2181  * cancellation checks. Returns status and sets should_queue to indicate
2182  * whether the caller should proceed with queuing the request.
2183  */
2184 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2185 						   struct request *rq,
2186 						   bool *should_queue)
2187 {
2188 	blk_status_t res;
2189 
2190 	res = ublk_prep_req(ubq, rq, false);
2191 	if (res != BLK_STS_OK) {
2192 		*should_queue = false;
2193 		return res;
2194 	}
2195 
2196 	/*
2197 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2198 	 * are dealt with, otherwise this request may not be failed in case
2199 	 * of recovery, causing a hang when deleting the disk
2200 	 */
2201 	if (unlikely(ubq->canceling)) {
2202 		*should_queue = false;
2203 		__ublk_abort_rq(ubq, rq);
2204 		return BLK_STS_OK;
2205 	}
2206 
2207 	*should_queue = true;
2208 	return BLK_STS_OK;
2209 }
2210 
2211 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2212 		const struct blk_mq_queue_data *bd)
2213 {
2214 	struct ublk_queue *ubq = hctx->driver_data;
2215 	struct request *rq = bd->rq;
2216 	bool should_queue;
2217 	blk_status_t res;
2218 
2219 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2220 	if (!should_queue)
2221 		return res;
2222 
2223 	ublk_queue_cmd(ubq, rq);
2224 	return BLK_STS_OK;
2225 }
2226 
2227 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2228 		const struct blk_mq_queue_data *bd)
2229 {
2230 	struct ublk_queue *ubq = hctx->driver_data;
2231 	struct request *rq = bd->rq;
2232 	bool should_queue;
2233 	blk_status_t res;
2234 
2235 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2236 	if (!should_queue)
2237 		return res;
2238 
2239 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2240 	return BLK_STS_OK;
2241 }
2242 
2243 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2244 					     const struct ublk_io *io2)
2245 {
2246 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2247 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2248 		(io->task == io2->task);
2249 }
2250 
2251 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2252 {
2253 	struct ublk_queue *ubq = hctx->driver_data;
2254 	struct ublk_batch_fetch_cmd *fcmd;
2255 
2256 	spin_lock(&ubq->evts_lock);
2257 	fcmd = __ublk_acquire_fcmd(ubq);
2258 	spin_unlock(&ubq->evts_lock);
2259 
2260 	if (fcmd)
2261 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2262 }
2263 
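/*
 * ->queue_rqs() for the non-batch path: walk the plugged list and group
 * consecutive requests whose ublk_io share the same daemon task and
 * io_ring_ctx (see ublk_belong_to_same_batch()), so each group needs only
 * one task-work dispatch; requests that fail preparation are requeued.
 */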
2264 static void ublk_queue_rqs(struct rq_list *rqlist)
2265 {
2266 	struct rq_list requeue_list = { };
2267 	struct rq_list submit_list = { };
2268 	struct ublk_io *io = NULL;
2269 	struct request *req;
2270 
2271 	while ((req = rq_list_pop(rqlist))) {
2272 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2273 		struct ublk_io *this_io = &this_q->ios[req->tag];
2274 
2275 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2276 			rq_list_add_tail(&requeue_list, req);
2277 			continue;
2278 		}
2279 
2280 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
2281 				!rq_list_empty(&submit_list))
2282 			ublk_queue_cmd_list(io, &submit_list);
2283 		io = this_io;
2284 		rq_list_add_tail(&submit_list, req);
2285 	}
2286 
2287 	if (!rq_list_empty(&submit_list))
2288 		ublk_queue_cmd_list(io, &submit_list);
2289 	*rqlist = requeue_list;
2290 }
2291 
2292 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2293 {
2294 	unsigned short tags[MAX_NR_TAG];
2295 	struct ublk_batch_fetch_cmd *fcmd;
2296 	struct request *rq;
2297 	unsigned cnt = 0;
2298 
2299 	spin_lock(&ubq->evts_lock);
2300 	rq_list_for_each(l, rq) {
2301 		tags[cnt++] = (unsigned short)rq->tag;
2302 		if (cnt >= MAX_NR_TAG) {
2303 			kfifo_in(&ubq->evts_fifo, tags, cnt);
2304 			cnt = 0;
2305 		}
2306 	}
2307 	if (cnt)
2308 		kfifo_in(&ubq->evts_fifo, tags, cnt);
2309 	fcmd = __ublk_acquire_fcmd(ubq);
2310 	spin_unlock(&ubq->evts_lock);
2311 
2312 	rq_list_init(l);
2313 	if (fcmd)
2314 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2315 }
2316 
2317 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2318 {
2319 	struct rq_list requeue_list = { };
2320 	struct rq_list submit_list = { };
2321 	struct ublk_queue *ubq = NULL;
2322 	struct request *req;
2323 
2324 	while ((req = rq_list_pop(rqlist))) {
2325 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2326 
2327 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2328 			rq_list_add_tail(&requeue_list, req);
2329 			continue;
2330 		}
2331 
2332 		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2333 			ublk_batch_queue_cmd_list(ubq, &submit_list);
2334 		ubq = this_q;
2335 		rq_list_add_tail(&submit_list, req);
2336 	}
2337 
2338 	if (!rq_list_empty(&submit_list))
2339 		ublk_batch_queue_cmd_list(ubq, &submit_list);
2340 	*rqlist = requeue_list;
2341 }
2342 
2343 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2344 		unsigned int hctx_idx)
2345 {
2346 	struct ublk_device *ub = driver_data;
2347 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2348 
2349 	hctx->driver_data = ubq;
2350 	return 0;
2351 }
2352 
2353 static const struct blk_mq_ops ublk_mq_ops = {
2354 	.queue_rq       = ublk_queue_rq,
2355 	.queue_rqs      = ublk_queue_rqs,
2356 	.init_hctx	= ublk_init_hctx,
2357 	.timeout	= ublk_timeout,
2358 };
2359 
2360 static const struct blk_mq_ops ublk_batch_mq_ops = {
2361 	.commit_rqs	= ublk_commit_rqs,
2362 	.queue_rq       = ublk_batch_queue_rq,
2363 	.queue_rqs      = ublk_batch_queue_rqs,
2364 	.init_hctx	= ublk_init_hctx,
2365 	.timeout	= ublk_timeout,
2366 };
2367 
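/*
 * Reset per-queue io state when the ublk server goes away or is being
 * re-attached: drop the io command slots and the daemon task references
 * while keeping UBLK_IO_FLAG_CANCELED so a canceled slot's io->cmd is never
 * touched again.
 */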
2368 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2369 {
2370 	int i;
2371 
2372 	ubq->nr_io_ready = 0;
2373 
2374 	for (i = 0; i < ubq->q_depth; i++) {
2375 		struct ublk_io *io = &ubq->ios[i];
2376 
2377 		/*
2378 		 * UBLK_IO_FLAG_CANCELED is kept to avoid touching
2379 		 * io->cmd
2380 		 */
2381 		io->flags &= UBLK_IO_FLAG_CANCELED;
2382 		io->cmd = NULL;
2383 		io->buf.addr = 0;
2384 
2385 		/*
2386 		 * old task is PF_EXITING, put it now
2387 		 *
2388 		 * It could be NULL when closing a quiesced
2389 		 * device.
2390 		 */
2391 		if (io->task) {
2392 			put_task_struct(io->task);
2393 			io->task = NULL;
2394 		}
2395 
2396 		WARN_ON_ONCE(refcount_read(&io->ref));
2397 		WARN_ON_ONCE(io->task_registered_buffers);
2398 	}
2399 }
2400 
2401 static int ublk_ch_open(struct inode *inode, struct file *filp)
2402 {
2403 	struct ublk_device *ub = container_of(inode->i_cdev,
2404 			struct ublk_device, cdev);
2405 
2406 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2407 		return -EBUSY;
2408 	filp->private_data = ub;
2409 	ub->ublksrv_tgid = current->tgid;
2410 	return 0;
2411 }
2412 
2413 static void ublk_reset_ch_dev(struct ublk_device *ub)
2414 {
2415 	int i;
2416 
2417 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2418 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2419 
2420 		/* Sync with ublk_cancel_cmd() */
2421 		spin_lock(&ubq->cancel_lock);
2422 		ublk_queue_reinit(ub, ubq);
2423 		spin_unlock(&ubq->cancel_lock);
2424 	}
2425 
2426 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2427 	ub->mm = NULL;
2428 	ub->nr_queue_ready = 0;
2429 	ub->unprivileged_daemons = false;
2430 	ub->ublksrv_tgid = -1;
2431 }
2432 
2433 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2434 {
2435 	struct gendisk *disk;
2436 
2437 	spin_lock(&ub->lock);
2438 	disk = ub->ub_disk;
2439 	if (disk)
2440 		get_device(disk_to_dev(disk));
2441 	spin_unlock(&ub->lock);
2442 
2443 	return disk;
2444 }
2445 
2446 static void ublk_put_disk(struct gendisk *disk)
2447 {
2448 	if (disk)
2449 		put_device(disk_to_dev(disk));
2450 }
2451 
2452 static void ublk_partition_scan_work(struct work_struct *work)
2453 {
2454 	struct ublk_device *ub =
2455 		container_of(work, struct ublk_device, partition_scan_work);
2456 	/* Hold disk reference to prevent UAF during concurrent teardown */
2457 	struct gendisk *disk = ublk_get_disk(ub);
2458 
2459 	if (!disk)
2460 		return;
2461 
2462 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2463 					     &disk->state)))
2464 		goto out;
2465 
2466 	mutex_lock(&disk->open_mutex);
2467 	bdev_disk_changed(disk, false);
2468 	mutex_unlock(&disk->open_mutex);
2469 out:
2470 	ublk_put_disk(disk);
2471 }
2472 
2473 /*
2474  * Use this function to ensure that ->canceling is consistently set for
2475  * the device and all queues. Do not set these flags directly.
2476  *
2477  * Caller must ensure that:
2478  * - cancel_mutex is held. This ensures that there is no concurrent
2479  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2480  * - there are no concurrent reads of ubq->canceling from the queue_rq
2481  *   path. This can be done by quiescing the queue, or through other
2482  *   means.
2483  */
2484 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2485 	__must_hold(&ub->cancel_mutex)
2486 {
2487 	int i;
2488 
2489 	ub->canceling = canceling;
2490 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2491 		ublk_get_queue(ub, i)->canceling = canceling;
2492 }
2493 
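/*
 * Returns true while any io still holds an extra reference (e.g. a bvec
 * buffer that io_uring has not unregistered yet), in which case the release
 * work is re-armed by the caller; otherwise clears the per-io refcounts so
 * the device can be reset.
 */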
2494 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2495 {
2496 	int i, j;
2497 
2498 	if (!ublk_dev_need_req_ref(ub))
2499 		return false;
2500 
2501 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2502 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2503 
2504 		for (j = 0; j < ubq->q_depth; j++) {
2505 			struct ublk_io *io = &ubq->ios[j];
2506 			unsigned int refs = refcount_read(&io->ref) +
2507 				io->task_registered_buffers;
2508 
2509 			/*
2510 			 * UBLK_REFCOUNT_INIT or zero means no active
2511 			 * reference
2512 			 */
2513 			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2514 				return true;
2515 
2516 			/* reset to zero if the io has no active references */
2517 			refcount_set(&io->ref, 0);
2518 			io->task_registered_buffers = 0;
2519 		}
2520 	}
2521 	return false;
2522 }
2523 
2524 static void ublk_ch_release_work_fn(struct work_struct *work)
2525 {
2526 	struct ublk_device *ub =
2527 		container_of(work, struct ublk_device, exit_work.work);
2528 	struct gendisk *disk;
2529 	int i;
2530 
2531 	/*
2532 	 * For zero-copy and auto buffer register modes, I/O references
2533 	 * might not be dropped naturally when the daemon is killed, but
2534 	 * io_uring guarantees that registered bvec kernel buffers are
2535 	 * finally unregistered when the io_uring context is freed, at which
2536 	 * point the active references are dropped.
2537 	 *
2538 	 * Wait until the active references are dropped to avoid
2539 	 * use-after-free.
2540 	 *
2541 	 * A registered buffer may be unregistered from io_uring's release
2542 	 * handler, so the wait happens from a scheduled work function to
2543 	 * break the dependency between the two file release paths.
2544 	if (ublk_check_and_reset_active_ref(ub)) {
2545 		schedule_delayed_work(&ub->exit_work, 1);
2546 		return;
2547 	}
2548 
2549 	/*
2550 	 * disk isn't attached yet: either the device isn't live, or it has
2551 	 * already been removed, so we need not do anything
2552 	 */
2553 	disk = ublk_get_disk(ub);
2554 	if (!disk)
2555 		goto out;
2556 
2557 	/*
2558 	 * All uring_cmd are done now, so abort any request outstanding to
2559 	 * the ublk server
2560 	 *
2561 	 * This can be done in a lockless way because the ublk server is
2562 	 * gone
2563 	 *
2564 	 * More importantly, we have to provide forward progress guarantee
2565 	 * without holding ub->mutex, otherwise control task grabbing
2566 	 * ub->mutex triggers deadlock
2567 	 *
2568 	 * All requests may be inflight, so ->canceling may not be set, set
2569 	 * it now.
2570 	 */
2571 	mutex_lock(&ub->cancel_mutex);
2572 	ublk_set_canceling(ub, true);
2573 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2574 		ublk_abort_queue(ub, ublk_get_queue(ub, i));
2575 	mutex_unlock(&ub->cancel_mutex);
2576 	blk_mq_kick_requeue_list(disk->queue);
2577 
2578 	/*
2579 	 * All inflight requests have been completed or requeued, and any new
2580 	 * request will be failed or requeued via `->canceling`, so it is
2581 	 * fine to grab ub->mutex now.
2582 	 */
2583 	mutex_lock(&ub->mutex);
2584 
2585 	/* double check after grabbing lock */
2586 	if (!ub->ub_disk)
2587 		goto unlock;
2588 
2589 	/*
2590 	 * Transition the device to the nosrv state. What exactly this
2591 	 * means depends on the recovery flags
2592 	 */
2593 	if (ublk_nosrv_should_stop_dev(ub)) {
2594 		/*
2595 		 * Allow any pending/future I/O to pass through quickly
2596 		 * with an error. This is needed because del_gendisk
2597 		 * waits for all pending I/O to complete
2598 		 */
2599 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2600 			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2601 
2602 		ublk_stop_dev_unlocked(ub);
2603 	} else {
2604 		if (ublk_nosrv_dev_should_queue_io(ub)) {
2605 			/* ->canceling is set and all requests are aborted */
2606 			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2607 		} else {
2608 			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2609 			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2610 				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2611 		}
2612 	}
2613 unlock:
2614 	mutex_unlock(&ub->mutex);
2615 	ublk_put_disk(disk);
2616 
2617 	/* all uring_cmd have been done now, reset device & ubq */
2618 	ublk_reset_ch_dev(ub);
2619 out:
2620 	clear_bit(UB_STATE_OPEN, &ub->state);
2621 
2622 	/* put the reference grabbed in ublk_ch_release() */
2623 	ublk_put_device(ub);
2624 }
2625 
2626 static int ublk_ch_release(struct inode *inode, struct file *filp)
2627 {
2628 	struct ublk_device *ub = filp->private_data;
2629 
2630 	/*
2631 	 * Grab ublk device reference, so it won't be gone until we are
2632 	 * really released from work function.
2633 	 */
2634 	ublk_get_device(ub);
2635 
2636 	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2637 	schedule_delayed_work(&ub->exit_work, 0);
2638 	return 0;
2639 }
2640 
2641 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
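/*
 * Userspace sketch (illustrative only; exact helper and variable names are
 * up to the ublk server implementation): each queue's descriptor array is
 * expected to be mapped read-only at a fixed per-queue offset, roughly:
 *
 *	off  = UBLKSRV_CMD_BUF_OFFSET + q_id * max_cmd_buf_size;
 *	iods = mmap(NULL, queue_depth * sizeof(struct ublksrv_io_desc),
 *		    PROT_READ, MAP_SHARED | MAP_POPULATE, cdev_fd, off);
 *
 * The checks below reject writable mappings, offsets outside the per-queue
 * command buffer range, and sizes that don't match the queue's buffer size.
 */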
2642 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2643 {
2644 	struct ublk_device *ub = filp->private_data;
2645 	size_t sz = vma->vm_end - vma->vm_start;
2646 	unsigned max_sz = ublk_max_cmd_buf_size();
2647 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2648 	int q_id, ret = 0;
2649 
2650 	spin_lock(&ub->lock);
2651 	if (!ub->mm)
2652 		ub->mm = current->mm;
2653 	if (current->mm != ub->mm)
2654 		ret = -EINVAL;
2655 	spin_unlock(&ub->lock);
2656 
2657 	if (ret)
2658 		return ret;
2659 
2660 	if (vma->vm_flags & VM_WRITE)
2661 		return -EPERM;
2662 
2663 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2664 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2665 		return -EINVAL;
2666 
2667 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2668 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2669 			__func__, q_id, current->pid, vma->vm_start,
2670 			phys_off, (unsigned long)sz);
2671 
2672 	if (sz != ublk_queue_cmd_buf_size(ub))
2673 		return -EINVAL;
2674 
2675 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2676 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2677 }
2678 
2679 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2680 		struct request *req)
2681 {
2682 	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2683 			io->flags & UBLK_IO_FLAG_ACTIVE);
2684 
2685 	if (ublk_nosrv_should_reissue_outstanding(ub))
2686 		blk_mq_requeue_request(req, false);
2687 	else {
2688 		io->res = -EIO;
2689 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2690 	}
2691 }
2692 
2693 /*
2694  * A request tag may have just been added to the event kfifo without
2695  * getting a chance to be dispatched; abort these requests too
2696  */
2697 static void ublk_abort_batch_queue(struct ublk_device *ub,
2698 				   struct ublk_queue *ubq)
2699 {
2700 	unsigned short tag;
2701 
2702 	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2703 		struct request *req = blk_mq_tag_to_rq(
2704 				ub->tag_set.tags[ubq->q_id], tag);
2705 
2706 		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2707 			__ublk_fail_req(ub, &ubq->ios[tag], req);
2708 	}
2709 }
2710 
2711 /*
2712  * Called from the ublk char device release handler, once every uring_cmd
2713  * is done; meanwhile the request queue is effectively "quiesced" since all
2714  * inflight requests can't be completed because the ublk server is dead.
2715  *
2716  * So no one can hold our request IO reference any more; simply ignore the
2717  * reference and complete the request immediately
2718  */
2719 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2720 {
2721 	int i;
2722 
2723 	for (i = 0; i < ubq->q_depth; i++) {
2724 		struct ublk_io *io = &ubq->ios[i];
2725 
2726 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2727 			__ublk_fail_req(ub, io, io->req);
2728 	}
2729 
2730 	if (ublk_support_batch_io(ubq))
2731 		ublk_abort_batch_queue(ub, ubq);
2732 }
2733 
2734 static void ublk_start_cancel(struct ublk_device *ub)
2735 {
2736 	struct gendisk *disk = ublk_get_disk(ub);
2737 
2738 	/* Our disk is already gone */
2739 	if (!disk)
2740 		return;
2741 
2742 	mutex_lock(&ub->cancel_mutex);
2743 	if (ub->canceling)
2744 		goto out;
2745 	/*
2746 	 * Now we are serialized with ublk_queue_rq()
2747 	 *
2748 	 * Make sure that ubq->canceling is set while the queue is quiesced,
2749 	 * because ublk_queue_rq() has to rely on this flag to avoid touching
2750 	 * a completed uring_cmd
2751 	 */
2752 	blk_mq_quiesce_queue(disk->queue);
2753 	ublk_set_canceling(ub, true);
2754 	blk_mq_unquiesce_queue(disk->queue);
2755 out:
2756 	mutex_unlock(&ub->cancel_mutex);
2757 	ublk_put_disk(disk);
2758 }
2759 
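/*
 * Complete a still-ACTIVE (i.e. not yet dispatched) uring_cmd with
 * UBLK_IO_RES_ABORT.  cancel_lock plus UBLK_IO_FLAG_CANCELED guarantee the
 * command is completed exactly once; a started request is left alone here
 * and gets aborted via __ublk_abort_rq() instead.
 */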
2760 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2761 		unsigned int issue_flags)
2762 {
2763 	struct ublk_io *io = &ubq->ios[tag];
2764 	struct ublk_device *ub = ubq->dev;
2765 	struct io_uring_cmd *cmd = NULL;
2766 	struct request *req;
2767 	bool done;
2768 
2769 	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2770 		return;
2771 
2772 	/*
2773 	 * Don't try to cancel this command if the request has been started,
2774 	 * to avoid a race between io_uring_cmd_done() and
2775 	 * io_uring_cmd_complete_in_task().
2776 	 *
2777 	 * Either the started request will be aborted via __ublk_abort_rq(),
2778 	 * then this uring_cmd is canceled next time, or it will be done in
2779 	 * task work function ublk_dispatch_req() because io_uring guarantees
2780 	 * that ublk_dispatch_req() is always called
2781 	 */
2782 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2783 	if (req && blk_mq_request_started(req) && req->tag == tag)
2784 		return;
2785 
2786 	spin_lock(&ubq->cancel_lock);
2787 	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2788 	if (!done) {
2789 		io->flags |= UBLK_IO_FLAG_CANCELED;
2790 		cmd = io->cmd;
2791 		io->cmd = NULL;
2792 	}
2793 	spin_unlock(&ubq->cancel_lock);
2794 
2795 	if (!done && cmd)
2796 		io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, issue_flags);
2797 }
2798 
2799 /*
2800  * Cancel a batch fetch command if it hasn't been claimed by another path.
2801  *
2802  * An fcmd can only be cancelled if:
2803  * 1. It's not the active_fcmd (which is currently being processed)
2804  * 2. It's still on the list (!list_empty check) - once removed from the list,
2805  *    the fcmd is considered claimed and will be freed by whoever removed it
2806  *
2807  * Use list_del_init() so subsequent list_empty() checks work correctly.
2808  */
2809 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2810 				  struct ublk_batch_fetch_cmd *fcmd,
2811 				  unsigned int issue_flags)
2812 {
2813 	bool done;
2814 
2815 	spin_lock(&ubq->evts_lock);
2816 	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2817 	if (done)
2818 		list_del_init(&fcmd->node);
2819 	spin_unlock(&ubq->evts_lock);
2820 
2821 	if (done) {
2822 		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2823 		ublk_batch_free_fcmd(fcmd);
2824 	}
2825 }
2826 
2827 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2828 {
2829 	struct ublk_batch_fetch_cmd *fcmd;
2830 	LIST_HEAD(fcmd_list);
2831 
2832 	spin_lock(&ubq->evts_lock);
2833 	ubq->force_abort = true;
2834 	list_splice_init(&ubq->fcmd_head, &fcmd_list);
2835 	fcmd = READ_ONCE(ubq->active_fcmd);
2836 	if (fcmd)
2837 		list_move(&fcmd->node, &ubq->fcmd_head);
2838 	spin_unlock(&ubq->evts_lock);
2839 
2840 	while (!list_empty(&fcmd_list)) {
2841 		fcmd = list_first_entry(&fcmd_list,
2842 				struct ublk_batch_fetch_cmd, node);
2843 		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2844 	}
2845 }
2846 
2847 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2848 				 unsigned int issue_flags)
2849 {
2850 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2851 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2852 	struct ublk_queue *ubq = pdu->ubq;
2853 
2854 	ublk_start_cancel(ubq->dev);
2855 
2856 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2857 }
2858 
2859 /*
2860  * The ublk char device won't be closed when calling cancel fn, so both
2861  * ublk device and queue are guaranteed to be live
2862  *
2863  * Two-stage cancel:
2864  *
2865  * - complete every active uring_cmd in ->cancel_fn()
2866  *
2867  * - abort inflight ublk IO requests in the ublk char device release handler,
2868  *   which depends on the 1st stage because the device can only be closed
2869  *   once all uring_cmd are done
2870  *
2871  * Do _not_ try to acquire ub->mutex before all inflight requests are
2872  * aborted, otherwise deadlock may be caused.
2873  */
2874 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2875 		unsigned int issue_flags)
2876 {
2877 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2878 	struct ublk_queue *ubq = pdu->ubq;
2879 	struct task_struct *task;
2880 	struct ublk_io *io;
2881 
2882 	if (WARN_ON_ONCE(!ubq))
2883 		return;
2884 
2885 	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2886 		return;
2887 
2888 	task = io_uring_cmd_get_task(cmd);
2889 	io = &ubq->ios[pdu->tag];
2890 	if (WARN_ON_ONCE(task && task != io->task))
2891 		return;
2892 
2893 	ublk_start_cancel(ubq->dev);
2894 
2895 	WARN_ON_ONCE(io->cmd != cmd);
2896 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2897 }
2898 
2899 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2900 {
2901 	return ubq->nr_io_ready == ubq->q_depth;
2902 }
2903 
2904 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2905 {
2906 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2907 }
2908 
2909 static void ublk_cancel_queue(struct ublk_queue *ubq)
2910 {
2911 	int i;
2912 
2913 	if (ublk_support_batch_io(ubq)) {
2914 		ublk_batch_cancel_queue(ubq);
2915 		return;
2916 	}
2917 
2918 	for (i = 0; i < ubq->q_depth; i++)
2919 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2920 }
2921 
2922 /* Cancel all pending commands, must be called after del_gendisk() returns */
2923 static void ublk_cancel_dev(struct ublk_device *ub)
2924 {
2925 	int i;
2926 
2927 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2928 		ublk_cancel_queue(ublk_get_queue(ub, i));
2929 }
2930 
2931 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2932 {
2933 	bool *idle = data;
2934 
2935 	if (blk_mq_request_started(rq)) {
2936 		*idle = false;
2937 		return false;
2938 	}
2939 	return true;
2940 }
2941 
2942 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2943 {
2944 	bool idle;
2945 
2946 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2947 	while (true) {
2948 		idle = true;
2949 		blk_mq_tagset_busy_iter(&ub->tag_set,
2950 				ublk_check_inflight_rq, &idle);
2951 		if (idle)
2952 			break;
2953 		msleep(UBLK_REQUEUE_DELAY_MS);
2954 	}
2955 }
2956 
2957 static void ublk_force_abort_dev(struct ublk_device *ub)
2958 {
2959 	int i;
2960 
2961 	pr_devel("%s: force abort ub: dev_id %d state %s\n",
2962 			__func__, ub->dev_info.dev_id,
2963 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
2964 			"LIVE" : "QUIESCED");
2965 	blk_mq_quiesce_queue(ub->ub_disk->queue);
2966 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2967 		ublk_wait_tagset_rqs_idle(ub);
2968 
2969 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2970 		ublk_get_queue(ub, i)->force_abort = true;
2971 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2972 	/* We may have requeued some rqs in ublk_quiesce_queue() */
2973 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2974 }
2975 
2976 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2977 {
2978 	struct gendisk *disk;
2979 
2980 	/* Sync with ublk_abort_queue() by holding the lock */
2981 	spin_lock(&ub->lock);
2982 	disk = ub->ub_disk;
2983 	ub->dev_info.state = UBLK_S_DEV_DEAD;
2984 	ub->dev_info.ublksrv_pid = -1;
2985 	ub->ub_disk = NULL;
2986 	spin_unlock(&ub->lock);
2987 
2988 	return disk;
2989 }
2990 
2991 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2992 	__must_hold(&ub->mutex)
2993 {
2994 	struct gendisk *disk;
2995 
2996 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2997 		return;
2998 
2999 	if (ublk_nosrv_dev_should_queue_io(ub))
3000 		ublk_force_abort_dev(ub);
3001 	del_gendisk(ub->ub_disk);
3002 	disk = ublk_detach_disk(ub);
3003 	put_disk(disk);
3004 }
3005 
3006 static void ublk_stop_dev(struct ublk_device *ub)
3007 {
3008 	mutex_lock(&ub->mutex);
3009 	ublk_stop_dev_unlocked(ub);
3010 	mutex_unlock(&ub->mutex);
3011 	cancel_work_sync(&ub->partition_scan_work);
3012 	ublk_cancel_dev(ub);
3013 }
3014 
3015 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
3016 {
3017 	/* UBLK_IO_FLAG_CANCELED can be cleared now */
3018 	spin_lock(&ubq->cancel_lock);
3019 	io->flags &= ~UBLK_IO_FLAG_CANCELED;
3020 	spin_unlock(&ubq->cancel_lock);
3021 }
3022 
3023 /* reset per-queue io flags */
3024 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
3025 {
3026 	spin_lock(&ubq->cancel_lock);
3027 	ubq->canceling = false;
3028 	spin_unlock(&ubq->cancel_lock);
3029 	ubq->fail_io = false;
3030 }
3031 
3032 /* device can only be started after all IOs are ready */
3033 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
3034 	struct ublk_io *io)
3035 	__must_hold(&ub->mutex)
3036 {
3037 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
3038 
3039 	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
3040 		ub->unprivileged_daemons = true;
3041 
3042 	ubq->nr_io_ready++;
3043 	ublk_reset_io_flags(ubq, io);
3044 
3045 	/* Check if this specific queue is now fully ready */
3046 	if (ublk_queue_ready(ubq)) {
3047 		ub->nr_queue_ready++;
3048 
3049 		/*
3050 		 * Reset queue flags as soon as this queue is ready.
3051 		 * This clears the canceling flag, allowing batch FETCH commands
3052 		 * to succeed during recovery without waiting for all queues.
3053 		 */
3054 		ublk_queue_reset_io_flags(ubq);
3055 	}
3056 
3057 	/* Check if all queues are ready */
3058 	if (ublk_dev_ready(ub)) {
3059 		/*
3060 		 * All queues ready - clear device-level canceling flag
3061 		 * and complete the recovery/initialization.
3062 		 */
3063 		mutex_lock(&ub->cancel_mutex);
3064 		ub->canceling = false;
3065 		mutex_unlock(&ub->cancel_mutex);
3066 		complete_all(&ub->completion);
3067 	}
3068 }
3069 
3070 static inline int ublk_check_cmd_op(u32 cmd_op)
3071 {
3072 	u32 ioc_type = _IOC_TYPE(cmd_op);
3073 
3074 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3075 		return -EOPNOTSUPP;
3076 
3077 	if (ioc_type != 'u' && ioc_type != 0)
3078 		return -EOPNOTSUPP;
3079 
3080 	return 0;
3081 }
3082 
3083 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3084 {
3085 	struct ublk_auto_buf_reg buf;
3086 
3087 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3088 
3089 	if (buf.reserved0 || buf.reserved1)
3090 		return -EINVAL;
3091 
3092 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3093 		return -EINVAL;
3094 	io->buf.auto_reg = buf;
3095 	return 0;
3096 }
3097 
3098 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3099 				    struct io_uring_cmd *cmd,
3100 				    u16 *buf_idx)
3101 {
3102 	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3103 		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3104 
3105 		/*
3106 		 * `UBLK_F_AUTO_BUF_REG` only works if `UBLK_IO_FETCH_REQ`
3107 		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
3108 		 * `io_ring_ctx`.
3109 		 *
3110 		 * If this uring_cmd's io_ring_ctx isn't the same as the one
3111 		 * used for registering the buffer, it is the ublk server's
3112 		 * responsibility to unregister the buffer; otherwise this
3113 		 * ublk request gets stuck.
3114 		 */
3115 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3116 			*buf_idx = io->buf.auto_reg.index;
3117 	}
3118 }
3119 
3120 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3121 				    struct io_uring_cmd *cmd,
3122 				    u16 *buf_idx)
3123 {
3124 	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3125 	return ublk_set_auto_buf_reg(io, cmd);
3126 }
3127 
3128 /* Once we return, `io->req` can't be used any more */
3129 static inline struct request *
3130 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3131 {
3132 	struct request *req = io->req;
3133 
3134 	io->cmd = cmd;
3135 	io->flags |= UBLK_IO_FLAG_ACTIVE;
3136 	/* now this cmd slot is owned by ublk driver */
3137 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3138 
3139 	return req;
3140 }
3141 
3142 static inline int
3143 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3144 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3145 		   u16 *buf_idx)
3146 {
3147 	if (ublk_dev_support_auto_buf_reg(ub))
3148 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3149 
3150 	io->buf.addr = buf_addr;
3151 	return 0;
3152 }
3153 
3154 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3155 				    unsigned int issue_flags,
3156 				    struct ublk_queue *ubq, unsigned int tag)
3157 {
3158 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3159 
3160 	/*
3161 	 * Safe to refer to @ubq since the ublk_queue won't go away until its
3162 	 * commands are completed
3163 	 */
3164 	pdu->ubq = ubq;
3165 	pdu->tag = tag;
3166 	io_uring_cmd_mark_cancelable(cmd, issue_flags);
3167 }
3168 
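/*
 * Release callback passed to io_buffer_register_bvec(): invoked when a
 * registered request buffer is unregistered, dropping either the daemon
 * task's local registration count or one request reference.
 */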
3169 static void ublk_io_release(void *priv)
3170 {
3171 	struct request *rq = priv;
3172 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3173 	struct ublk_io *io = &ubq->ios[rq->tag];
3174 
3175 	/*
3176 	 * task_registered_buffers may be 0 if buffers were registered off task
3177 	 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3178 	 */
3179 	if (current == io->task && io->task_registered_buffers)
3180 		io->task_registered_buffers--;
3181 	else
3182 		ublk_put_req_ref(io, rq);
3183 }
3184 
3185 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3186 				struct ublk_device *ub,
3187 				u16 q_id, u16 tag,
3188 				struct ublk_io *io,
3189 				unsigned int index, unsigned int issue_flags)
3190 {
3191 	struct request *req;
3192 	int ret;
3193 
3194 	if (!ublk_dev_support_zero_copy(ub))
3195 		return -EINVAL;
3196 
3197 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3198 	if (!req)
3199 		return -EINVAL;
3200 
3201 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3202 				      issue_flags);
3203 	if (ret) {
3204 		ublk_put_req_ref(io, req);
3205 		return ret;
3206 	}
3207 
3208 	return 0;
3209 }
3210 
3211 static int
3212 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3213 			    struct ublk_device *ub,
3214 			    u16 q_id, u16 tag, struct ublk_io *io,
3215 			    unsigned index, unsigned issue_flags)
3216 {
3217 	unsigned new_registered_buffers;
3218 	struct request *req = io->req;
3219 	int ret;
3220 
3221 	/*
3222 	 * Ensure there are still references for ublk_sub_req_ref() to release.
3223 	 * If not, fall back on the thread-safe buffer registration.
3224 	 */
3225 	new_registered_buffers = io->task_registered_buffers + 1;
3226 	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3227 		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3228 					    issue_flags);
3229 
3230 	if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3231 		return -EINVAL;
3232 
3233 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3234 				      issue_flags);
3235 	if (ret)
3236 		return ret;
3237 
3238 	io->task_registered_buffers = new_registered_buffers;
3239 	return 0;
3240 }
3241 
3242 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3243 				  const struct ublk_device *ub,
3244 				  unsigned int index, unsigned int issue_flags)
3245 {
3246 	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3247 		return -EINVAL;
3248 
3249 	return io_buffer_unregister_bvec(cmd, index, issue_flags);
3250 }
3251 
3252 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3253 {
3254 	if (ublk_dev_need_map_io(ub)) {
3255 		/*
3256 		 * FETCH_RQ has to provide IO buffer if NEED GET
3257 		 * DATA is not enabled
3258 		 */
3259 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3260 			return -EINVAL;
3261 	} else if (buf_addr) {
3262 		/* User copy requires addr to be unset */
3263 		return -EINVAL;
3264 	}
3265 	return 0;
3266 }
3267 
3268 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3269 			struct ublk_io *io, u16 q_id)
3270 {
3271 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3272 	if (ublk_dev_ready(ub))
3273 		return -EBUSY;
3274 
3275 	/* allow each command to be FETCHed at most once */
3276 	if (io->flags & UBLK_IO_FLAG_ACTIVE)
3277 		return -EINVAL;
3278 
3279 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3280 
3281 	ublk_fill_io_cmd(io, cmd);
3282 
3283 	if (ublk_dev_support_batch_io(ub))
3284 		WRITE_ONCE(io->task, NULL);
3285 	else
3286 		WRITE_ONCE(io->task, get_task_struct(current));
3287 
3288 	return 0;
3289 }
3290 
3291 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3292 		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
3293 {
3294 	int ret;
3295 
3296 	/*
3297 	 * When handling FETCH command for setting up ublk uring queue,
3298 	 * ub->mutex is the innermost lock, and we won't block for handling
3299 	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3300 	 */
3301 	mutex_lock(&ub->mutex);
3302 	ret = __ublk_fetch(cmd, ub, io, q_id);
3303 	if (!ret)
3304 		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3305 	if (!ret)
3306 		ublk_mark_io_ready(ub, q_id, io);
3307 	mutex_unlock(&ub->mutex);
3308 	return ret;
3309 }
3310 
3311 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3312 				       struct ublk_io *io, __u64 buf_addr)
3313 {
3314 	struct request *req = io->req;
3315 
3316 	if (ublk_dev_need_map_io(ub)) {
3317 		/*
3318 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3319 		 * NEED GET DATA is not enabled or it is Read IO.
3320 		 */
3321 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3322 					req_op(req) == REQ_OP_READ))
3323 			return -EINVAL;
3324 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3325 		/*
3326 		 * User copy requires addr to be unset when command is
3327 		 * not zone append
3328 		 */
3329 		return -EINVAL;
3330 	}
3331 
3332 	return 0;
3333 }
3334 
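/*
 * Decide whether the caller should complete the request now: with per-request
 * references the decision is deferred to ublk_sub_req_ref(), otherwise the
 * request can always be completed directly.
 */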
3335 static bool ublk_need_complete_req(const struct ublk_device *ub,
3336 				   struct ublk_io *io)
3337 {
3338 	if (ublk_dev_need_req_ref(ub))
3339 		return ublk_sub_req_ref(io);
3340 	return true;
3341 }
3342 
3343 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3344 			  struct request *req)
3345 {
3346 	/*
3347 	 * We have handled UBLK_IO_NEED_GET_DATA command,
3348 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3349 	 * do the copy work.
3350 	 */
3351 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3352 	/* update iod->addr because ublksrv may have passed a new io buffer */
3353 	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3354 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3355 			__func__, ubq->q_id, req->tag, io->flags,
3356 			ublk_get_iod(ubq, req->tag)->addr);
3357 
3358 	return ublk_start_io(ubq, req, io);
3359 }
3360 
3361 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3362 		unsigned int issue_flags)
3363 {
3364 	/* May point to userspace-mapped memory */
3365 	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3366 							       struct ublksrv_io_cmd);
3367 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3368 	struct ublk_device *ub = cmd->file->private_data;
3369 	struct ublk_queue *ubq;
3370 	struct ublk_io *io = NULL;
3371 	u32 cmd_op = cmd->cmd_op;
3372 	u16 q_id = READ_ONCE(ub_src->q_id);
3373 	u16 tag = READ_ONCE(ub_src->tag);
3374 	s32 result = READ_ONCE(ub_src->result);
3375 	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3376 	struct request *req;
3377 	int ret;
3378 	bool compl;
3379 
3380 	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3381 
3382 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3383 			__func__, cmd->cmd_op, q_id, tag, result);
3384 
3385 	ret = ublk_check_cmd_op(cmd_op);
3386 	if (ret)
3387 		goto out;
3388 
3389 	/*
3390 	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3391 	 * so no need to validate the q_id, tag, or task
3392 	 */
3393 	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3394 		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3395 
3396 	ret = -EINVAL;
3397 	if (q_id >= ub->dev_info.nr_hw_queues)
3398 		goto out;
3399 
3400 	ubq = ublk_get_queue(ub, q_id);
3401 
3402 	if (tag >= ub->dev_info.queue_depth)
3403 		goto out;
3404 
3405 	io = &ubq->ios[tag];
3406 	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3407 	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3408 		ret = ublk_check_fetch_buf(ub, addr);
3409 		if (ret)
3410 			goto out;
3411 		ret = ublk_fetch(cmd, ub, io, addr, q_id);
3412 		if (ret)
3413 			goto out;
3414 
3415 		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3416 		return -EIOCBQUEUED;
3417 	}
3418 
3419 	if (READ_ONCE(io->task) != current) {
3420 		/*
3421 		 * ublk_register_io_buf() accesses only the io's refcount,
3422 		 * so can be handled on any task
3423 		 */
3424 		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3425 			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3426 						    addr, issue_flags);
3427 
3428 		goto out;
3429 	}
3430 
3431 	/* there is pending io cmd, something must be wrong */
3432 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3433 		ret = -EBUSY;
3434 		goto out;
3435 	}
3436 
3437 	/*
3438 	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3439 	 * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
3440 	 */
3441 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3442 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3443 		goto out;
3444 
3445 	switch (_IOC_NR(cmd_op)) {
3446 	case UBLK_IO_REGISTER_IO_BUF:
3447 		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3448 						   issue_flags);
3449 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
3450 		ret = ublk_check_commit_and_fetch(ub, io, addr);
3451 		if (ret)
3452 			goto out;
3453 		io->res = result;
3454 		req = ublk_fill_io_cmd(io, cmd);
3455 		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3456 		if (buf_idx != UBLK_INVALID_BUF_IDX)
3457 			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3458 		compl = ublk_need_complete_req(ub, io);
3459 
3460 		if (req_op(req) == REQ_OP_ZONE_APPEND)
3461 			req->__sector = addr;
3462 		if (compl)
3463 			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3464 
3465 		if (ret)
3466 			goto out;
3467 		break;
3468 	case UBLK_IO_NEED_GET_DATA:
3469 		/*
3470 		 * ublk_get_data() may fail and fall back to requeue, so keep the
3471 		 * uring_cmd active first and prepare for handling the newly
3472 		 * requeued request
3473 		 */
3474 		req = ublk_fill_io_cmd(io, cmd);
3475 		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3476 		WARN_ON_ONCE(ret);
3477 		if (likely(ublk_get_data(ubq, io, req))) {
3478 			__ublk_prep_compl_io_cmd(io, req);
3479 			return UBLK_IO_RES_OK;
3480 		}
3481 		break;
3482 	default:
3483 		goto out;
3484 	}
3485 	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3486 	return -EIOCBQUEUED;
3487 
3488  out:
3489 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3490 			__func__, cmd_op, tag, ret, io ? io->flags : 0);
3491 	return ret;
3492 }
3493 
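/*
 * Look up the request by tag and take an io reference; only return the
 * request if it has started, still carries the expected tag and has data,
 * otherwise drop the reference and return NULL.
 */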
3494 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3495 		u16 q_id, u16 tag, struct ublk_io *io)
3496 {
3497 	struct request *req;
3498 
3499 	/*
3500 	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3501 	 * which would overwrite it with io->cmd
3502 	 */
3503 	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3504 	if (!req)
3505 		return NULL;
3506 
3507 	if (!ublk_get_req_ref(io))
3508 		return NULL;
3509 
3510 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3511 		goto fail_put;
3512 
3513 	if (!ublk_rq_has_data(req))
3514 		goto fail_put;
3515 
3516 	return req;
3517 fail_put:
3518 	ublk_put_req_ref(io, req);
3519 	return NULL;
3520 }
3521 
3522 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3523 {
3524 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3525 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3526 	int ret = -ECANCELED;
3527 
3528 	if (!tw.cancel)
3529 		ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3530 	if (ret != -EIOCBQUEUED)
3531 		io_uring_cmd_done(cmd, ret, issue_flags);
3532 }
3533 
3534 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3535 {
3536 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3537 		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3538 		return 0;
3539 	}
3540 
3541 	/* a well-implemented server won't run into the unlocked case */
3542 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3543 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3544 		return -EIOCBQUEUED;
3545 	}
3546 
3547 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
3548 }
3549 
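/*
 * A batch element starts with struct ublk_elem_header; an 8-byte buffer
 * address (UBLK_BATCH_F_HAS_BUF_ADDR) and an 8-byte zone LBA
 * (UBLK_BATCH_F_HAS_ZONE_LBA) may follow, in that order.
 */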
3550 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3551 					const struct ublk_elem_header *elem)
3552 {
3553 	const void *buf = elem;
3554 
3555 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3556 		return *(const __u64 *)(buf + sizeof(*elem));
3557 	return 0;
3558 }
3559 
3560 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3561 					const struct ublk_elem_header *elem)
3562 {
3563 	const void *buf = elem;
3564 
3565 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3566 		return *(const __u64 *)(buf + sizeof(*elem) +
3567 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3568 	return -1;
3569 }
3570 
3571 static struct ublk_auto_buf_reg
3572 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3573 			const struct ublk_elem_header *elem)
3574 {
3575 	struct ublk_auto_buf_reg reg = {
3576 		.index = elem->buf_index,
3577 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3578 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3579 	};
3580 
3581 	return reg;
3582 }
3583 
3584 /*
3585  * 48 can hold any type of buffer element (8, 16 and 24 bytes) because
3586  * it is the least common multiple (LCM) of 8, 16 and 24
3587  */
3588 #define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
3589 struct ublk_batch_io_iter {
3590 	void __user *uaddr;
3591 	unsigned done, total;
3592 	unsigned char elem_bytes;
3593 	/* copy to this buffer from user space */
3594 	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3595 };
3596 
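/*
 * Walk one chunk of the copied command buffer: each element begins with a
 * struct ublk_elem_header, the tag is bounds-checked against the queue depth,
 * and @cb is invoked per element; iter->done tracks the bytes consumed.
 */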
3597 static inline int
3598 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3599 		    struct ublk_batch_io_iter *iter,
3600 		    const struct ublk_batch_io_data *data,
3601 		    unsigned bytes,
3602 		    int (*cb)(struct ublk_queue *q,
3603 			    const struct ublk_batch_io_data *data,
3604 			    const struct ublk_elem_header *elem))
3605 {
3606 	unsigned int i;
3607 	int ret = 0;
3608 
3609 	for (i = 0; i < bytes; i += iter->elem_bytes) {
3610 		const struct ublk_elem_header *elem =
3611 			(const struct ublk_elem_header *)&iter->buf[i];
3612 
3613 		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3614 			ret = -EINVAL;
3615 			break;
3616 		}
3617 
3618 		ret = cb(ubq, data, elem);
3619 		if (unlikely(ret))
3620 			break;
3621 	}
3622 
3623 	iter->done += i;
3624 	return ret;
3625 }
3626 
3627 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3628 			     const struct ublk_batch_io_data *data,
3629 			     int (*cb)(struct ublk_queue *q,
3630 				     const struct ublk_batch_io_data *data,
3631 				     const struct ublk_elem_header *elem))
3632 {
3633 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3634 	int ret = 0;
3635 
3636 	while (iter->done < iter->total) {
3637 		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3638 
3639 		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3640 			pr_warn("ublk%d: read batch cmd buffer failed\n",
3641 					data->ub->dev_info.dev_id);
3642 			return -EFAULT;
3643 		}
3644 
3645 		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3646 		if (ret)
3647 			return ret;
3648 	}
3649 	return 0;
3650 }
3651 
3652 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3653 				const struct ublk_batch_io_data *data,
3654 				const struct ublk_elem_header *elem)
3655 {
3656 	struct ublk_io *io = &ubq->ios[elem->tag];
3657 
3658 	/*
3659 	 * If queue was ready before this decrement, it won't be anymore,
3660 	 * so we need to decrement the queue ready count and restore the
3661 	 * canceling flag to prevent new requests from being queued.
3662 	 */
3663 	if (ublk_queue_ready(ubq)) {
3664 		data->ub->nr_queue_ready--;
3665 		spin_lock(&ubq->cancel_lock);
3666 		ubq->canceling = true;
3667 		spin_unlock(&ubq->cancel_lock);
3668 	}
3669 	ubq->nr_io_ready--;
3670 
3671 	ublk_io_lock(io);
3672 	io->flags = 0;
3673 	ublk_io_unlock(io);
3674 	return 0;
3675 }
3676 
3677 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3678 				       const struct ublk_batch_io_data *data)
3679 {
3680 	int ret;
3681 
3682 	/* Re-process only what we've already processed, starting from the beginning */
3683 	iter->total = iter->done;
3684 	iter->done = 0;
3685 
3686 	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3687 	WARN_ON_ONCE(ret);
3688 }
3689 
3690 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3691 			      const struct ublk_batch_io_data *data,
3692 			      const struct ublk_elem_header *elem)
3693 {
3694 	struct ublk_io *io = &ubq->ios[elem->tag];
3695 	const struct ublk_batch_io *uc = &data->header;
3696 	union ublk_io_buf buf = { 0 };
3697 	int ret;
3698 
3699 	if (ublk_dev_support_auto_buf_reg(data->ub))
3700 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3701 	else if (ublk_dev_need_map_io(data->ub)) {
3702 		buf.addr = ublk_batch_buf_addr(uc, elem);
3703 
3704 		ret = ublk_check_fetch_buf(data->ub, buf.addr);
3705 		if (ret)
3706 			return ret;
3707 	}
3708 
3709 	ublk_io_lock(io);
3710 	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3711 	if (!ret)
3712 		io->buf = buf;
3713 	ublk_io_unlock(io);
3714 
3715 	if (!ret)
3716 		ublk_mark_io_ready(data->ub, ubq->q_id, io);
3717 
3718 	return ret;
3719 }
3720 
3721 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3722 {
3723 	const struct ublk_batch_io *uc = &data->header;
3724 	struct io_uring_cmd *cmd = data->cmd;
3725 	struct ublk_batch_io_iter iter = {
3726 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3727 		.total = uc->nr_elem * uc->elem_bytes,
3728 		.elem_bytes = uc->elem_bytes,
3729 	};
3730 	int ret;
3731 
3732 	mutex_lock(&data->ub->mutex);
3733 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3734 
3735 	if (ret && iter.done)
3736 		ublk_batch_revert_prep_cmd(&iter, data);
3737 	mutex_unlock(&data->ub->mutex);
3738 	return ret;
3739 }
3740 
3741 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3742 				      struct ublk_io *io,
3743 				      union ublk_io_buf *buf)
3744 {
3745 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3746 		return -EBUSY;
3747 
3748 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3749 	if (ublk_need_map_io(ubq) && !buf->addr)
3750 		return -EINVAL;
3751 	return 0;
3752 }
3753 
3754 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3755 				const struct ublk_batch_io_data *data,
3756 				const struct ublk_elem_header *elem)
3757 {
3758 	struct ublk_io *io = &ubq->ios[elem->tag];
3759 	const struct ublk_batch_io *uc = &data->header;
3760 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3761 	union ublk_io_buf buf = { 0 };
3762 	struct request *req = NULL;
3763 	bool auto_reg = false;
3764 	bool compl = false;
3765 	int ret;
3766 
3767 	if (ublk_dev_support_auto_buf_reg(data->ub)) {
3768 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3769 		auto_reg = true;
3770 	} else if (ublk_dev_need_map_io(data->ub))
3771 		buf.addr = ublk_batch_buf_addr(uc, elem);
3772 
3773 	ublk_io_lock(io);
3774 	ret = ublk_batch_commit_io_check(ubq, io, &buf);
3775 	if (!ret) {
3776 		io->res = elem->result;
3777 		io->buf = buf;
3778 		req = ublk_fill_io_cmd(io, data->cmd);
3779 
3780 		if (auto_reg)
3781 			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3782 		compl = ublk_need_complete_req(data->ub, io);
3783 	}
3784 	ublk_io_unlock(io);
3785 
3786 	if (unlikely(ret)) {
3787 		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3788 			__func__, data->ub->dev_info.dev_id, ubq->q_id,
3789 			elem->tag, ret);
3790 		return ret;
3791 	}
3792 
3793 	if (buf_idx != UBLK_INVALID_BUF_IDX)
3794 		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3795 	if (req_op(req) == REQ_OP_ZONE_APPEND)
3796 		req->__sector = ublk_batch_zone_lba(uc, elem);
3797 	if (compl)
3798 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3799 	return 0;
3800 }
3801 
3802 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3803 {
3804 	const struct ublk_batch_io *uc = &data->header;
3805 	struct io_uring_cmd *cmd = data->cmd;
3806 	struct ublk_batch_io_iter iter = {
3807 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3808 		.total = uc->nr_elem * uc->elem_bytes,
3809 		.elem_bytes = uc->elem_bytes,
3810 	};
3811 	DEFINE_IO_COMP_BATCH(iob);
3812 	int ret;
3813 
3814 	data->iob = &iob;
3815 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3816 
3817 	if (iob.complete)
3818 		iob.complete(&iob);
3819 
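	/* if any element was committed, report the bytes consumed; otherwise return the error */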
3820 	return iter.done == 0 ? ret : iter.done;
3821 }
3822 
3823 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3824 {
3825 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
3826 
3827 	if (uc->flags & ~UBLK_BATCH_F_ALL)
3828 		return -EINVAL;
3829 
3830 	/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK and UBLK_BATCH_F_HAS_BUF_ADDR are mutually exclusive */
3831 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3832 			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3833 		return -EINVAL;
3834 
3835 	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3836 		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3837 	if (uc->elem_bytes != elem_bytes)
3838 		return -EINVAL;
3839 	return 0;
3840 }
3841 
3842 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3843 {
3844 	const struct ublk_batch_io *uc = &data->header;
3845 
3846 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3847 		return -EINVAL;
3848 
3849 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
3850 		return -E2BIG;
3851 
3852 	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3853 			!ublk_dev_is_zoned(data->ub))
3854 		return -EINVAL;
3855 
3856 	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3857 			!ublk_dev_need_map_io(data->ub))
3858 		return -EINVAL;
3859 
3860 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3861 			!ublk_dev_support_auto_buf_reg(data->ub))
3862 		return -EINVAL;
3863 
3864 	return ublk_check_batch_cmd_flags(uc);
3865 }
3866 
3867 static int ublk_batch_attach(struct ublk_queue *ubq,
3868 			     struct ublk_batch_io_data *data,
3869 			     struct ublk_batch_fetch_cmd *fcmd)
3870 {
3871 	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3872 	bool free = false;
3873 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3874 
3875 	spin_lock(&ubq->evts_lock);
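	/* an aborting or canceling queue won't accept new fetch commands; free this one below */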
3876 	if (unlikely(ubq->force_abort || ubq->canceling)) {
3877 		free = true;
3878 	} else {
3879 		list_add_tail(&fcmd->node, &ubq->fcmd_head);
3880 		new_fcmd = __ublk_acquire_fcmd(ubq);
3881 	}
3882 	spin_unlock(&ubq->evts_lock);
3883 
3884 	if (unlikely(free)) {
3885 		ublk_batch_free_fcmd(fcmd);
3886 		return -ENODEV;
3887 	}
3888 
3889 	pdu->ubq = ubq;
3890 	pdu->fcmd = fcmd;
3891 	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3892 
3893 	if (!new_fcmd)
3894 		goto out;
3895 
3896 	/*
3897 	 * If the two fetch commands originate from the same io_ring_ctx,
3898 	 * run batch dispatch directly. Otherwise, schedule task work to
3899 	 * do it.
3900 	 */
3901 	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3902 			io_uring_cmd_ctx_handle(fcmd->cmd)) {
3903 		data->cmd = new_fcmd->cmd;
3904 		ublk_batch_dispatch(ubq, data, new_fcmd);
3905 	} else {
3906 		io_uring_cmd_complete_in_task(new_fcmd->cmd,
3907 				ublk_batch_tw_cb);
3908 	}
3909 out:
3910 	return -EIOCBQUEUED;
3911 }
3912 
3913 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3914 {
3915 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3916 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3917 
3918 	if (!fcmd)
3919 		return -ENOMEM;
3920 
3921 	return ublk_batch_attach(ubq, data, fcmd);
3922 }
3923 
3924 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3925 {
3926 	const struct ublk_batch_io *uc = &data->header;
3927 
3928 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3929 		return -EINVAL;
3930 
3931 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3932 		return -EINVAL;
3933 
3934 	if (uc->elem_bytes != sizeof(__u16))
3935 		return -EINVAL;
3936 
3937 	if (uc->flags != 0)
3938 		return -EINVAL;
3939 
3940 	return 0;
3941 }
3942 
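/*
 * Handle non-batch uring_cmds issued on the batch char device: only buffer
 * register/unregister are supported here, anything else gets -EOPNOTSUPP.
 */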
3943 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3944 				     unsigned int issue_flags)
3945 {
3946 	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3947 							       struct ublksrv_io_cmd);
3948 	struct ublk_device *ub = cmd->file->private_data;
3949 	unsigned tag = READ_ONCE(ub_cmd->tag);
3950 	unsigned q_id = READ_ONCE(ub_cmd->q_id);
3951 	unsigned index = READ_ONCE(ub_cmd->addr);
3952 	struct ublk_queue *ubq;
3953 	struct ublk_io *io;
3954 
3955 	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3956 		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3957 
3958 	if (q_id >= ub->dev_info.nr_hw_queues)
3959 		return -EINVAL;
3960 
3961 	if (tag >= ub->dev_info.queue_depth)
3962 		return -EINVAL;
3963 
3964 	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3965 		return -EOPNOTSUPP;
3966 
3967 	ubq = ublk_get_queue(ub, q_id);
3968 	io = &ubq->ios[tag];
3969 	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3970 			issue_flags);
3971 }
3972 
3973 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3974 				       unsigned int issue_flags)
3975 {
3976 	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3977 							  struct ublk_batch_io);
3978 	struct ublk_device *ub = cmd->file->private_data;
3979 	struct ublk_batch_io_data data = {
3980 		.ub  = ub,
3981 		.cmd = cmd,
3982 		.header = (struct ublk_batch_io) {
3983 			.q_id = READ_ONCE(uc->q_id),
3984 			.flags = READ_ONCE(uc->flags),
3985 			.nr_elem = READ_ONCE(uc->nr_elem),
3986 			.elem_bytes = READ_ONCE(uc->elem_bytes),
3987 		},
3988 		.issue_flags = issue_flags,
3989 	};
3990 	u32 cmd_op = cmd->cmd_op;
3991 	int ret = -EINVAL;
3992 
3993 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3994 		ublk_batch_cancel_fn(cmd, issue_flags);
3995 		return 0;
3996 	}
3997 
3998 	switch (cmd_op) {
3999 	case UBLK_U_IO_PREP_IO_CMDS:
4000 		ret = ublk_check_batch_cmd(&data);
4001 		if (ret)
4002 			goto out;
4003 		ret = ublk_handle_batch_prep_cmd(&data);
4004 		break;
4005 	case UBLK_U_IO_COMMIT_IO_CMDS:
4006 		ret = ublk_check_batch_cmd(&data);
4007 		if (ret)
4008 			goto out;
4009 		ret = ublk_handle_batch_commit_cmd(&data);
4010 		break;
4011 	case UBLK_U_IO_FETCH_IO_CMDS:
4012 		ret = ublk_validate_batch_fetch_cmd(&data);
4013 		if (ret)
4014 			goto out;
4015 		ret = ublk_handle_batch_fetch_cmd(&data);
4016 		break;
4017 	default:
4018 		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
4019 		break;
4020 	}
4021 out:
4022 	return ret;
4023 }
4024 
4025 static inline bool ublk_check_ubuf_dir(const struct request *req,
4026 		int ubuf_dir)
4027 {
4028 	/* copy ubuf to request pages */
4029 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
4030 	    ubuf_dir == ITER_SOURCE)
4031 		return true;
4032 
4033 	/* copy request pages to ubuf */
4034 	if ((req_op(req) == REQ_OP_WRITE ||
4035 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
4036 	    ubuf_dir == ITER_DEST)
4037 		return true;
4038 
4039 	return false;
4040 }
4041 
4042 static ssize_t
4043 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
4044 {
4045 	struct ublk_device *ub = iocb->ki_filp->private_data;
4046 	struct ublk_queue *ubq;
4047 	struct request *req;
4048 	struct ublk_io *io;
4049 	unsigned data_len;
4050 	bool is_integrity;
4051 	bool on_daemon;
4052 	size_t buf_off;
4053 	u16 tag, q_id;
4054 	ssize_t ret;
4055 
4056 	if (!user_backed_iter(iter))
4057 		return -EACCES;
4058 
4059 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
4060 		return -EACCES;
4061 
4062 	tag = ublk_pos_to_tag(iocb->ki_pos);
4063 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
4064 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
4065 	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
4066 
4067 	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
4068 		return -EINVAL;
4069 
4070 	if (q_id >= ub->dev_info.nr_hw_queues)
4071 		return -EINVAL;
4072 
4073 	ubq = ublk_get_queue(ub, q_id);
4074 	if (!ublk_dev_support_user_copy(ub))
4075 		return -EACCES;
4076 
4077 	if (tag >= ub->dev_info.queue_depth)
4078 		return -EINVAL;
4079 
4080 	io = &ubq->ios[tag];
4081 	on_daemon = current == READ_ONCE(io->task);
4082 	if (on_daemon) {
4083 		/* On daemon, io can't be completed concurrently, so skip ref */
4084 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
4085 			return -EINVAL;
4086 
4087 		req = io->req;
4088 		if (!ublk_rq_has_data(req))
4089 			return -EINVAL;
4090 	} else {
4091 		req = __ublk_check_and_get_req(ub, q_id, tag, io);
4092 		if (!req)
4093 			return -EINVAL;
4094 	}
4095 
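	/* bound the copy window by the request's data size or its integrity buffer size */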
4096 	if (is_integrity) {
4097 		struct blk_integrity *bi = &req->q->limits.integrity;
4098 
4099 		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
4100 	} else {
4101 		data_len = blk_rq_bytes(req);
4102 	}
4103 	if (buf_off > data_len) {
4104 		ret = -EINVAL;
4105 		goto out;
4106 	}
4107 
4108 	if (!ublk_check_ubuf_dir(req, dir)) {
4109 		ret = -EACCES;
4110 		goto out;
4111 	}
4112 
4113 	if (is_integrity)
4114 		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4115 	else
4116 		ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4117 
4118 out:
4119 	if (!on_daemon)
4120 		ublk_put_req_ref(io, req);
4121 	return ret;
4122 }
4123 
4124 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4125 {
4126 	return ublk_user_copy(iocb, to, ITER_DEST);
4127 }
4128 
4129 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4130 {
4131 	return ublk_user_copy(iocb, from, ITER_SOURCE);
4132 }
4133 
4134 static const struct file_operations ublk_ch_fops = {
4135 	.owner = THIS_MODULE,
4136 	.open = ublk_ch_open,
4137 	.release = ublk_ch_release,
4138 	.read_iter = ublk_ch_read_iter,
4139 	.write_iter = ublk_ch_write_iter,
4140 	.uring_cmd = ublk_ch_uring_cmd,
4141 	.mmap = ublk_ch_mmap,
4142 };
4143 
4144 static const struct file_operations ublk_ch_batch_io_fops = {
4145 	.owner = THIS_MODULE,
4146 	.open = ublk_ch_open,
4147 	.release = ublk_ch_release,
4148 	.read_iter = ublk_ch_read_iter,
4149 	.write_iter = ublk_ch_write_iter,
4150 	.uring_cmd = ublk_ch_batch_io_uring_cmd,
4151 	.mmap = ublk_ch_mmap,
4152 };
4153 
4154 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4155 {
4156 	int size, i;
4157 
4158 	size = ublk_queue_cmd_buf_size(ub);
4159 
4160 	for (i = 0; i < ubq->q_depth; i++) {
4161 		struct ublk_io *io = &ubq->ios[i];
4162 		if (io->task)
4163 			put_task_struct(io->task);
4164 		WARN_ON_ONCE(refcount_read(&io->ref));
4165 		WARN_ON_ONCE(io->task_registered_buffers);
4166 	}
4167 
4168 	if (ubq->io_cmd_buf)
4169 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4170 
4171 	if (ublk_dev_support_batch_io(ub))
4172 		ublk_io_evts_deinit(ubq);
4173 
4174 	kvfree(ubq);
4175 }
4176 
4177 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4178 {
4179 	struct ublk_queue *ubq = ub->queues[q_id];
4180 
4181 	if (!ubq)
4182 		return;
4183 
4184 	__ublk_deinit_queue(ub, ubq);
4185 	ub->queues[q_id] = NULL;
4186 }
4187 
4188 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4189 {
4190 	unsigned int cpu;
4191 
4192 	/* Find first CPU mapped to this queue */
4193 	for_each_possible_cpu(cpu) {
4194 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4195 			return cpu_to_node(cpu);
4196 	}
4197 
4198 	return NUMA_NO_NODE;
4199 }
4200 
4201 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4202 {
4203 	int depth = ub->dev_info.queue_depth;
4204 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4205 	struct ublk_queue *ubq;
4206 	struct page *page;
4207 	int numa_node;
4208 	int size, i, ret;
4209 
4210 	/* Determine NUMA node based on queue's CPU affinity */
4211 	numa_node = ublk_get_queue_numa_node(ub, q_id);
4212 
4213 	/* Allocate queue structure on local NUMA node */
4214 	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4215 			    numa_node);
4216 	if (!ubq)
4217 		return -ENOMEM;
4218 
4219 	spin_lock_init(&ubq->cancel_lock);
4220 	ubq->flags = ub->dev_info.flags;
4221 	ubq->q_id = q_id;
4222 	ubq->q_depth = depth;
4223 	size = ublk_queue_cmd_buf_size(ub);
4224 
4225 	/* Allocate I/O command buffer on local NUMA node */
4226 	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4227 	if (!page) {
4228 		kvfree(ubq);
4229 		return -ENOMEM;
4230 	}
4231 	ubq->io_cmd_buf = page_address(page);
4232 
4233 	for (i = 0; i < ubq->q_depth; i++)
4234 		spin_lock_init(&ubq->ios[i].lock);
4235 
4236 	if (ublk_dev_support_batch_io(ub)) {
4237 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4238 		if (ret)
4239 			goto fail;
4240 		INIT_LIST_HEAD(&ubq->fcmd_head);
4241 	}
4242 	ub->queues[q_id] = ubq;
4243 	ubq->dev = ub;
4244 
4245 	return 0;
4246 fail:
4247 	__ublk_deinit_queue(ub, ubq);
4248 	return ret;
4249 }
4250 
4251 static void ublk_deinit_queues(struct ublk_device *ub)
4252 {
4253 	int i;
4254 
4255 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4256 		ublk_deinit_queue(ub, i);
4257 }
4258 
4259 static int ublk_init_queues(struct ublk_device *ub)
4260 {
4261 	int i, ret;
4262 
4263 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4264 		ret = ublk_init_queue(ub, i);
4265 		if (ret)
4266 			goto fail;
4267 	}
4268 
4269 	init_completion(&ub->completion);
4270 	return 0;
4271 
4272  fail:
4273 	ublk_deinit_queues(ub);
4274 	return ret;
4275 }
4276 
4277 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4278 {
4279 	int i = idx;
4280 	int err;
4281 
4282 	spin_lock(&ublk_idr_lock);
4283 	/* allocate id, if @id >= 0, we're requesting that specific id */
4284 	if (i >= 0) {
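		/* idr_alloc() reports an already-used id as -ENOSPC; convert it to -EEXIST */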
4285 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4286 		if (err == -ENOSPC)
4287 			err = -EEXIST;
4288 	} else {
4289 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4290 				GFP_NOWAIT);
4291 	}
4292 	spin_unlock(&ublk_idr_lock);
4293 
4294 	if (err >= 0)
4295 		ub->ub_number = err;
4296 
4297 	return err;
4298 }
4299 
4300 static void ublk_free_dev_number(struct ublk_device *ub)
4301 {
4302 	spin_lock(&ublk_idr_lock);
4303 	idr_remove(&ublk_index_idr, ub->ub_number);
4304 	wake_up_all(&ublk_idr_wq);
4305 	spin_unlock(&ublk_idr_lock);
4306 }
4307 
4308 static void ublk_cdev_rel(struct device *dev)
4309 {
4310 	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4311 
4312 	ublk_buf_cleanup(ub);
4313 	blk_mq_free_tag_set(&ub->tag_set);
4314 	ublk_deinit_queues(ub);
4315 	ublk_free_dev_number(ub);
4316 	mutex_destroy(&ub->mutex);
4317 	mutex_destroy(&ub->cancel_mutex);
4318 	kfree(ub);
4319 }
4320 
4321 static int ublk_add_chdev(struct ublk_device *ub)
4322 {
4323 	struct device *dev = &ub->cdev_dev;
4324 	int minor = ub->ub_number;
4325 	int ret;
4326 
4327 	dev->parent = ublk_misc.this_device;
4328 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4329 	dev->class = &ublk_chr_class;
4330 	dev->release = ublk_cdev_rel;
4331 	device_initialize(dev);
4332 
4333 	ret = dev_set_name(dev, "ublkc%d", minor);
4334 	if (ret)
4335 		goto fail;
4336 
4337 	if (ublk_dev_support_batch_io(ub))
4338 		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4339 	else
4340 		cdev_init(&ub->cdev, &ublk_ch_fops);
4341 	ret = cdev_device_add(&ub->cdev, dev);
4342 	if (ret)
4343 		goto fail;
4344 
4345 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4346 		unprivileged_ublks_added++;
4347 	return 0;
4348  fail:
4349 	put_device(dev);
4350 	return ret;
4351 }
4352 
4353 /* align max io buffer size with PAGE_SIZE */
4354 static void ublk_align_max_io_size(struct ublk_device *ub)
4355 {
4356 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4357 
4358 	ub->dev_info.max_io_buf_bytes =
4359 		round_down(max_io_bytes, PAGE_SIZE);
4360 }
4361 
4362 static int ublk_add_tag_set(struct ublk_device *ub)
4363 {
4364 	if (ublk_dev_support_batch_io(ub))
4365 		ub->tag_set.ops = &ublk_batch_mq_ops;
4366 	else
4367 		ub->tag_set.ops = &ublk_mq_ops;
4368 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4369 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4370 	ub->tag_set.numa_node = NUMA_NO_NODE;
4371 	ub->tag_set.driver_data = ub;
4372 	return blk_mq_alloc_tag_set(&ub->tag_set);
4373 }
4374 
4375 static void ublk_remove(struct ublk_device *ub)
4376 {
4377 	bool unprivileged;
4378 
4379 	ublk_stop_dev(ub);
4380 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
4381 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4382 	ublk_put_device(ub);
4383 
4384 	if (unprivileged)
4385 		unprivileged_ublks_added--;
4386 }
4387 
4388 static struct ublk_device *ublk_get_device_from_id(int idx)
4389 {
4390 	struct ublk_device *ub = NULL;
4391 
4392 	if (idx < 0)
4393 		return NULL;
4394 
4395 	spin_lock(&ublk_idr_lock);
4396 	ub = idr_find(&ublk_index_idr, idx);
4397 	if (ub)
4398 		ub = ublk_get_device(ub);
4399 	spin_unlock(&ublk_idr_lock);
4400 
4401 	return ub;
4402 }
4403 
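/* translate the caller-provided pid into a global pid and compare it with the recorded server tgid */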
4404 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4405 {
4406 	rcu_read_lock();
4407 	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4408 	rcu_read_unlock();
4409 
4410 	return ub->ublksrv_tgid == ublksrv_pid;
4411 }
4412 
4413 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4414 		const struct ublksrv_ctrl_cmd *header)
4415 {
4416 	const struct ublk_param_basic *p = &ub->params.basic;
4417 	int ublksrv_pid = (int)header->data[0];
4418 	struct queue_limits lim = {
4419 		.logical_block_size	= 1 << p->logical_bs_shift,
4420 		.physical_block_size	= 1 << p->physical_bs_shift,
4421 		.io_min			= 1 << p->io_min_shift,
4422 		.io_opt			= 1 << p->io_opt_shift,
4423 		.max_hw_sectors		= p->max_sectors,
4424 		.chunk_sectors		= p->chunk_sectors,
4425 		.virt_boundary_mask	= p->virt_boundary_mask,
4426 		.max_segments		= USHRT_MAX,
4427 		.max_segment_size	= UINT_MAX,
4428 		.dma_alignment		= 3,
4429 	};
4430 	struct gendisk *disk;
4431 	int ret = -EINVAL;
4432 
4433 	if (ublksrv_pid <= 0)
4434 		return -EINVAL;
4435 	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4436 		return -EINVAL;
4437 
4438 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4439 		const struct ublk_param_discard *pd = &ub->params.discard;
4440 
4441 		lim.discard_alignment = pd->discard_alignment;
4442 		lim.discard_granularity = pd->discard_granularity;
4443 		lim.max_hw_discard_sectors = pd->max_discard_sectors;
4444 		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4445 		lim.max_discard_segments = pd->max_discard_segments;
4446 	}
4447 
4448 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4449 		const struct ublk_param_zoned *p = &ub->params.zoned;
4450 
4451 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4452 			return -EOPNOTSUPP;
4453 
4454 		lim.features |= BLK_FEAT_ZONED;
4455 		lim.max_active_zones = p->max_active_zones;
4456 		lim.max_open_zones =  p->max_open_zones;
4457 		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4458 	}
4459 
4460 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4461 		lim.features |= BLK_FEAT_WRITE_CACHE;
4462 		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4463 			lim.features |= BLK_FEAT_FUA;
4464 	}
4465 
4466 	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4467 		lim.features |= BLK_FEAT_ROTATIONAL;
4468 
4469 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4470 		lim.dma_alignment = ub->params.dma.alignment;
4471 
4472 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4473 		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4474 		lim.max_segment_size = ub->params.seg.max_segment_size;
4475 		lim.max_segments = ub->params.seg.max_segments;
4476 	}
4477 
4478 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4479 		const struct ublk_param_integrity *p = &ub->params.integrity;
4480 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4481 
4482 		lim.max_integrity_segments =
4483 			p->max_integrity_segments ?: USHRT_MAX;
4484 		lim.integrity = (struct blk_integrity) {
4485 			.flags = ublk_integrity_flags(p->flags),
4486 			.csum_type = ublk_integrity_csum_type(p->csum_type),
4487 			.metadata_size = p->metadata_size,
4488 			.pi_offset = p->pi_offset,
4489 			.interval_exp = p->interval_exp,
4490 			.tag_size = p->tag_size,
4491 			.pi_tuple_size = pi_tuple_size,
4492 		};
4493 	}
4494 
4495 	if (wait_for_completion_interruptible(&ub->completion) != 0)
4496 		return -EINTR;
4497 
4498 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4499 		return -EINVAL;
4500 
4501 	mutex_lock(&ub->mutex);
4502 	/* the device may no longer be ready in case of F_BATCH */
4503 	if (!ublk_dev_ready(ub)) {
4504 		ret = -EINVAL;
4505 		goto out_unlock;
4506 	}
4507 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4508 	    test_bit(UB_STATE_USED, &ub->state)) {
4509 		ret = -EEXIST;
4510 		goto out_unlock;
4511 	}
4512 
4513 	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4514 	if (IS_ERR(disk)) {
4515 		ret = PTR_ERR(disk);
4516 		goto out_unlock;
4517 	}
4518 	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4519 	disk->fops = &ub_fops;
4520 	disk->private_data = ub;
4521 
4522 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4523 	ub->ub_disk = disk;
4524 
4525 	ublk_apply_params(ub);
4526 
4527 	/*
4528 	 * Suppress partition scan to avoid potential IO hang.
4529 	 *
4530 	 * If a ublk server error occurs during the partition scan, the IO may
4531 	 * wait while holding ub->mutex, which can deadlock with other
4532 	 * operations that need the mutex. Defer the partition scan to async
4533 	 * work.
4534 	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4535 	 * permanently.
4536 	 */
4537 	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4538 
4539 	ublk_get_device(ub);
4540 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4541 
4542 	if (ublk_dev_is_zoned(ub)) {
4543 		ret = ublk_revalidate_disk_zones(ub);
4544 		if (ret)
4545 			goto out_put_cdev;
4546 	}
4547 
4548 	ret = add_disk(disk);
4549 	if (ret)
4550 		goto out_put_cdev;
4551 
4552 	set_bit(UB_STATE_USED, &ub->state);
4553 
4554 	/* Skip partition scan if disabled by user */
4555 	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4556 		/* Don't clear it for unprivileged daemons, see comment above */
4557 		if (!ub->unprivileged_daemons)
4558 			clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4559 	} else {
4560 		/* Schedule async partition scan for trusted daemons */
4561 		if (!ub->unprivileged_daemons)
4562 			schedule_work(&ub->partition_scan_work);
4563 	}
4564 
4565 out_put_cdev:
4566 	if (ret) {
4567 		ublk_detach_disk(ub);
4568 		ublk_put_device(ub);
4569 	}
4570 	if (ret)
4571 		put_disk(disk);
4572 out_unlock:
4573 	mutex_unlock(&ub->mutex);
4574 	return ret;
4575 }
4576 
4577 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4578 		const struct ublksrv_ctrl_cmd *header)
4579 {
4580 	void __user *argp = (void __user *)(unsigned long)header->addr;
4581 	cpumask_var_t cpumask;
4582 	unsigned long queue;
4583 	unsigned int retlen;
4584 	unsigned int i;
4585 	int ret;
4586 
4587 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4588 		return -EINVAL;
4589 	if (header->len & (sizeof(unsigned long)-1))
4590 		return -EINVAL;
4591 	if (!header->addr)
4592 		return -EINVAL;
4593 
4594 	queue = header->data[0];
4595 	if (queue >= ub->dev_info.nr_hw_queues)
4596 		return -EINVAL;
4597 
4598 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4599 		return -ENOMEM;
4600 
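	/* collect every CPU that blk-mq mapped to this hardware queue */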
4601 	for_each_possible_cpu(i) {
4602 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4603 			cpumask_set_cpu(i, cpumask);
4604 	}
4605 
4606 	ret = -EFAULT;
4607 	retlen = min_t(unsigned short, header->len, cpumask_size());
4608 	if (copy_to_user(argp, cpumask, retlen))
4609 		goto out_free_cpumask;
4610 	if (retlen != header->len &&
4611 	    clear_user(argp + retlen, header->len - retlen))
4612 		goto out_free_cpumask;
4613 
4614 	ret = 0;
4615 out_free_cpumask:
4616 	free_cpumask_var(cpumask);
4617 	return ret;
4618 }
4619 
4620 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4621 {
4622 	pr_devel("%s: dev id %d flags %llx\n", __func__,
4623 			info->dev_id, info->flags);
4624 	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4625 			info->nr_hw_queues, info->queue_depth);
4626 }
4627 
4628 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
4629 {
4630 	void __user *argp = (void __user *)(unsigned long)header->addr;
4631 	struct ublksrv_ctrl_dev_info info;
4632 	struct ublk_device *ub;
4633 	int ret = -EINVAL;
4634 
4635 	if (header->len < sizeof(info) || !header->addr)
4636 		return -EINVAL;
4637 	if (header->queue_id != (u16)-1) {
4638 		pr_warn("%s: queue_id is wrong %x\n",
4639 			__func__, header->queue_id);
4640 		return -EINVAL;
4641 	}
4642 
4643 	if (copy_from_user(&info, argp, sizeof(info)))
4644 		return -EFAULT;
4645 
4646 	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
4647 	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
4648 		return -EINVAL;
4649 
4650 	if (capable(CAP_SYS_ADMIN))
4651 		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
4652 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
4653 		return -EPERM;
4654 
4655 	/* forbid nonsense combinations of recovery flags */
4656 	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
4657 	case 0:
4658 	case UBLK_F_USER_RECOVERY:
4659 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
4660 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
4661 		break;
4662 	default:
4663 		pr_warn("%s: invalid recovery flags %llx\n", __func__,
4664 			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
4665 		return -EINVAL;
4666 	}
4667 
4668 	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
4669 		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
4670 		return -EINVAL;
4671 	}
4672 
4673 	/*
4674 	 * an unprivileged device can't be trusted, but RECOVERY and
4675 	 * RECOVERY_REISSUE may still hang error handling, so recovery
4676 	 * features can't be supported for unprivileged ublk now
4677 	 *
4678 	 * TODO: provide forward progress for the RECOVERY handler, so that
4679 	 * unprivileged devices can benefit from it
4680 	 */
4681 	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
4682 		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
4683 				UBLK_F_USER_RECOVERY);
4684 
4685 		/*
4686 		 * For USER_COPY, we depend on userspace to fill the request
4687 		 * buffer by pwrite() to the ublk char device, which can't be
4688 		 * used for an unprivileged device
4689 		 *
4690 		 * The same applies to zero copy and auto buffer registration.
4691 		 */
4692 		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4693 					UBLK_F_AUTO_BUF_REG))
4694 			return -EINVAL;
4695 	}
4696 
4697 	/* User copy is required to access integrity buffer */
4698 	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
4699 		return -EINVAL;
4700 
4701 	/* the created device is always owned by current user */
4702 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
4703 
4704 	if (header->dev_id != info.dev_id) {
4705 		pr_warn("%s: dev id not match %u %u\n",
4706 			__func__, header->dev_id, info.dev_id);
4707 		return -EINVAL;
4708 	}
4709 
4710 	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
4711 		pr_warn("%s: dev id is too large. Max supported is %d\n",
4712 			__func__, UBLK_MAX_UBLKS - 1);
4713 		return -EINVAL;
4714 	}
4715 
4716 	ublk_dump_dev_info(&info);
4717 
4718 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4719 	if (ret)
4720 		return ret;
4721 
4722 	ret = -EACCES;
4723 	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
4724 	    unprivileged_ublks_added >= unprivileged_ublks_max)
4725 		goto out_unlock;
4726 
4727 	ret = -ENOMEM;
4728 	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
4729 	if (!ub)
4730 		goto out_unlock;
4731 	mutex_init(&ub->mutex);
4732 	spin_lock_init(&ub->lock);
4733 	mutex_init(&ub->cancel_mutex);
4734 	mt_init(&ub->buf_tree);
4735 	ida_init(&ub->buf_ida);
4736 	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
4737 
4738 	ret = ublk_alloc_dev_number(ub, header->dev_id);
4739 	if (ret < 0)
4740 		goto out_free_ub;
4741 
4742 	memcpy(&ub->dev_info, &info, sizeof(info));
4743 
4744 	/* update device id */
4745 	ub->dev_info.dev_id = ub->ub_number;
4746 
4747 	/*
4748 	 * The 64-bit flags will be copied back to userspace as the feature
4749 	 * negotiation result, so clear any flags which the driver doesn't
4750 	 * support yet; then userspace can get the correct flags (features)
4751 	 * to handle.
4752 	 */
4753 	ub->dev_info.flags &= UBLK_F_ALL;
4754 
4755 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
4756 		UBLK_F_URING_CMD_COMP_IN_TASK |
4757 		UBLK_F_PER_IO_DAEMON |
4758 		UBLK_F_BUF_REG_OFF_DAEMON |
4759 		UBLK_F_SAFE_STOP_DEV;
4760 
4761 	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
4762 	if (ublk_dev_support_batch_io(ub))
4763 		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
4764 
4765 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
4766 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4767 				UBLK_F_AUTO_BUF_REG))
4768 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4769 
4770 	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
4771 	if (ublk_dev_support_batch_io(ub))
4772 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4773 
4774 	/*
4775 	 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
4776 	 * returning write_append_lba, which is only allowed in case of
4777 	 * user copy or zero copy
4778 	 */
4779 	if (ublk_dev_is_zoned(ub) &&
4780 	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
4781 	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
4782 		ret = -EINVAL;
4783 		goto out_free_dev_number;
4784 	}
4785 
4786 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
4787 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
4788 	ublk_align_max_io_size(ub);
4789 
4790 	ret = ublk_add_tag_set(ub);
4791 	if (ret)
4792 		goto out_free_dev_number;
4793 
4794 	ret = ublk_init_queues(ub);
4795 	if (ret)
4796 		goto out_free_tag_set;
4797 
4798 	ret = -EFAULT;
4799 	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
4800 		goto out_deinit_queues;
4801 
4802 	/*
4803 	 * Add the char dev so that the ublksrv daemon can be set up.
4804 	 * ublk_add_chdev() will clean up everything if it fails.
4805 	 */
4806 	ret = ublk_add_chdev(ub);
4807 	goto out_unlock;
4808 
4809 out_deinit_queues:
4810 	ublk_deinit_queues(ub);
4811 out_free_tag_set:
4812 	blk_mq_free_tag_set(&ub->tag_set);
4813 out_free_dev_number:
4814 	ublk_free_dev_number(ub);
4815 out_free_ub:
4816 	mutex_destroy(&ub->mutex);
4817 	mutex_destroy(&ub->cancel_mutex);
4818 	kfree(ub);
4819 out_unlock:
4820 	mutex_unlock(&ublk_ctl_mutex);
4821 	return ret;
4822 }
4823 
4824 static inline bool ublk_idr_freed(int id)
4825 {
4826 	void *ptr;
4827 
4828 	spin_lock(&ublk_idr_lock);
4829 	ptr = idr_find(&ublk_index_idr, id);
4830 	spin_unlock(&ublk_idr_lock);
4831 
4832 	return ptr == NULL;
4833 }
4834 
4835 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4836 {
4837 	struct ublk_device *ub = *p_ub;
4838 	int idx = ub->ub_number;
4839 	int ret;
4840 
4841 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4842 	if (ret)
4843 		return ret;
4844 
4845 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4846 		ublk_remove(ub);
4847 		set_bit(UB_STATE_DELETED, &ub->state);
4848 	}
4849 
4850 	/* Mark the reference as consumed */
4851 	*p_ub = NULL;
4852 	ublk_put_device(ub);
4853 	mutex_unlock(&ublk_ctl_mutex);
4854 
4855 	/*
4856 	 * Wait until the idr entry is removed, so the index can be reused
4857 	 * after the DEL_DEV command returns.
4858 	 *
4859 	 * If we return because of a user interrupt, a future delete command
4860 	 * may come:
4861 	 *
4862 	 * - the device number isn't freed, this device won't or needn't
4863 	 *   be deleted again, since UB_STATE_DELETED is set, and device
4864 	 *   will be released after the last reference is dropped
4865 	 *
4866 	 * - the device number is freed already, we will not find this
4867 	 *   device via ublk_get_device_from_id()
4868 	 */
4869 	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4870 		return -EINTR;
4871 	return 0;
4872 }
4873 
4874 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4875 				      const struct ublksrv_ctrl_cmd *header)
4876 {
4877 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4878 			__func__, cmd_op, header->dev_id, header->queue_id,
4879 			header->data[0], header->addr, header->len);
4880 }
4881 
4882 static void ublk_ctrl_stop_dev(struct ublk_device *ub)
4883 {
4884 	ublk_stop_dev(ub);
4885 }
4886 
4887 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4888 {
4889 	struct gendisk *disk;
4890 	int ret = 0;
4891 
4892 	disk = ublk_get_disk(ub);
4893 	if (!disk)
4894 		return -ENODEV;
4895 
4896 	mutex_lock(&disk->open_mutex);
4897 	if (disk_openers(disk) > 0) {
4898 		ret = -EBUSY;
4899 		goto unlock;
4900 	}
4901 	ub->block_open = true;
4902 	/* release open_mutex as del_gendisk() will reacquire it */
4903 	mutex_unlock(&disk->open_mutex);
4904 
4905 	ublk_ctrl_stop_dev(ub);
4906 	goto out;
4907 
4908 unlock:
4909 	mutex_unlock(&disk->open_mutex);
4910 out:
4911 	ublk_put_disk(disk);
4912 	return ret;
4913 }
4914 
4915 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4916 		const struct ublksrv_ctrl_cmd *header)
4917 {
4918 	struct task_struct *p;
4919 	struct pid *pid;
4920 	struct ublksrv_ctrl_dev_info dev_info;
4921 	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4922 	void __user *argp = (void __user *)(unsigned long)header->addr;
4923 
4924 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4925 		return -EINVAL;
4926 
4927 	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4928 	dev_info.ublksrv_pid = -1;
4929 
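	/* translate the stored global server tgid into the caller's pid namespace */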
4930 	if (init_ublksrv_tgid > 0) {
4931 		rcu_read_lock();
4932 		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4933 		p = pid_task(pid, PIDTYPE_TGID);
4934 		if (p) {
4935 			int vnr = task_tgid_vnr(p);
4936 
4937 			if (vnr)
4938 				dev_info.ublksrv_pid = vnr;
4939 		}
4940 		rcu_read_unlock();
4941 	}
4942 
4943 	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4944 		return -EFAULT;
4945 
4946 	return 0;
4947 }
4948 
4949 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4950 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4951 {
4952 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4953 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4954 
4955 	if (ub->ub_disk) {
4956 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4957 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4958 	} else {
4959 		ub->params.devt.disk_major = 0;
4960 		ub->params.devt.disk_minor = 0;
4961 	}
4962 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4963 }
4964 
4965 static int ublk_ctrl_get_params(struct ublk_device *ub,
4966 		const struct ublksrv_ctrl_cmd *header)
4967 {
4968 	void __user *argp = (void __user *)(unsigned long)header->addr;
4969 	struct ublk_params_header ph;
4970 	int ret;
4971 
4972 	if (header->len <= sizeof(ph) || !header->addr)
4973 		return -EINVAL;
4974 
4975 	if (copy_from_user(&ph, argp, sizeof(ph)))
4976 		return -EFAULT;
4977 
4978 	if (ph.len > header->len || !ph.len)
4979 		return -EINVAL;
4980 
4981 	if (ph.len > sizeof(struct ublk_params))
4982 		ph.len = sizeof(struct ublk_params);
4983 
4984 	mutex_lock(&ub->mutex);
4985 	ublk_ctrl_fill_params_devt(ub);
4986 	if (copy_to_user(argp, &ub->params, ph.len))
4987 		ret = -EFAULT;
4988 	else
4989 		ret = 0;
4990 	mutex_unlock(&ub->mutex);
4991 
4992 	return ret;
4993 }
4994 
4995 static int ublk_ctrl_set_params(struct ublk_device *ub,
4996 		const struct ublksrv_ctrl_cmd *header)
4997 {
4998 	void __user *argp = (void __user *)(unsigned long)header->addr;
4999 	struct ublk_params_header ph;
5000 	int ret = -EFAULT;
5001 
5002 	if (header->len <= sizeof(ph) || !header->addr)
5003 		return -EINVAL;
5004 
5005 	if (copy_from_user(&ph, argp, sizeof(ph)))
5006 		return -EFAULT;
5007 
5008 	if (ph.len > header->len || !ph.len || !ph.types)
5009 		return -EINVAL;
5010 
5011 	if (ph.len > sizeof(struct ublk_params))
5012 		ph.len = sizeof(struct ublk_params);
5013 
5014 	mutex_lock(&ub->mutex);
5015 	if (test_bit(UB_STATE_USED, &ub->state)) {
5016 		/*
5017 		 * Parameters can only be changed when device hasn't
5018 		 * been started yet
5019 		 */
5020 		ret = -EACCES;
5021 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
5022 		/* zero out partial copy so no stale params survive */
5023 		memset(&ub->params, 0, sizeof(ub->params));
5024 		ret = -EFAULT;
5025 	} else {
5026 		/* clear all we don't support yet */
5027 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
5028 		ret = ublk_validate_params(ub);
5029 		if (ret)
5030 			memset(&ub->params, 0, sizeof(ub->params));
5031 	}
5032 	mutex_unlock(&ub->mutex);
5033 
5034 	return ret;
5035 }
5036 
5037 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
5038 {
5039 	int ret = -EINVAL;
5040 
5041 	mutex_lock(&ub->mutex);
5042 	if (ublk_nosrv_should_stop_dev(ub))
5043 		goto out_unlock;
5044 	/*
5045 	 * START_RECOVERY is only allowed after:
5046 	 *
5047 	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
5048 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
5049 	 *     released.
5050 	 *
5051 	 * and one of the following holds
5052 	 *
5053 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
5054 	 *     (a) has quiesced request queue
5055 	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
5056 	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
5057 	 *     (d) has completed/canceled all ioucmds owned by the dying process
5058 	 *
5059 	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
5060 	 *     quiesced, but all I/O is being immediately errored
5061 	 */
5062 	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
5063 		ret = -EBUSY;
5064 		goto out_unlock;
5065 	}
5066 	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
5067 	init_completion(&ub->completion);
5068 	ret = 0;
5069  out_unlock:
5070 	mutex_unlock(&ub->mutex);
5071 	return ret;
5072 }
5073 
5074 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
5075 		const struct ublksrv_ctrl_cmd *header)
5076 {
5077 	int ublksrv_pid = (int)header->data[0];
5078 	int ret = -EINVAL;
5079 
5080 	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
5081 		 header->dev_id);
5082 
5083 	if (wait_for_completion_interruptible(&ub->completion))
5084 		return -EINTR;
5085 
5086 	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
5087 		 header->dev_id);
5088 
5089 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
5090 		return -EINVAL;
5091 
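	/*
	 * Switch the device over to the new daemon's tgid and mark it LIVE,
	 * then kick the requeue list so requests requeued during recovery
	 * are dispatched to the new server.
	 */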
5092 	mutex_lock(&ub->mutex);
5093 	if (ublk_nosrv_should_stop_dev(ub))
5094 		goto out_unlock;
5095 
5096 	if (!ublk_dev_in_recoverable_state(ub)) {
5097 		ret = -EBUSY;
5098 		goto out_unlock;
5099 	}
5100 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
5101 	ub->dev_info.state = UBLK_S_DEV_LIVE;
5102 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
5103 			__func__, ublksrv_pid, header->dev_id);
5104 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
5105 	ret = 0;
5106  out_unlock:
5107 	mutex_unlock(&ub->mutex);
5108 	return ret;
5109 }
5110 
5111 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5112 {
5113 	void __user *argp = (void __user *)(unsigned long)header->addr;
5114 	u64 features = UBLK_F_ALL;
5115 
5116 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
5117 		return -EINVAL;
5118 
5119 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5120 		return -EFAULT;
5121 
5122 	return 0;
5123 }
5124 
5125 static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5126 {
5127 	struct ublk_param_basic *p = &ub->params.basic;
5128 	u64 new_size = header->data[0];
5129 	int ret = 0;
5130 
5131 	mutex_lock(&ub->mutex);
5132 	if (!ub->ub_disk) {
5133 		ret = -ENODEV;
5134 		goto out;
5135 	}
5136 	p->dev_sectors = new_size;
5137 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5138 out:
5139 	mutex_unlock(&ub->mutex);
5140 	return ret;
5141 }
5142 
5143 struct count_busy {
5144 	const struct ublk_queue *ubq;
5145 	unsigned int nr_busy;
5146 };
5147 
5148 static bool ublk_count_busy_req(struct request *rq, void *data)
5149 {
5150 	struct count_busy *idle = data;
5151 
5152 	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5153 		idle->nr_busy += 1;
5154 	return true;
5155 }
5156 
5157 /* uring_cmd is guaranteed to be active if the associated request is idle */
5158 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5159 {
5160 	struct count_busy data = {
5161 		.ubq = ubq,
5162 	};
5163 
5164 	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5165 	return data.nr_busy < ubq->q_depth;
5166 }
5167 
5168 /* Wait until each hw queue has at least one idle IO */
5169 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5170 				 unsigned int timeout_ms)
5171 {
5172 	unsigned int elapsed = 0;
5173 	int ret;
5174 
5175 	/*
5176 	 * For UBLK_F_BATCH_IO the ublk server can be notified via an existing
5177 	 * or a new fetch command, so there is no need to wait here
5178 	 */
5179 	if (ublk_dev_support_batch_io(ub))
5180 		return 0;
5181 
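	/*
	 * Poll in UBLK_REQUEUE_DELAY_MS steps until every hw queue has at
	 * least one idle IO (and therefore an active uring_cmd that can
	 * notify the ublk server), or until timeout or a pending signal.
	 */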
5182 	while (elapsed < timeout_ms && !signal_pending(current)) {
5183 		unsigned int queues_cancelable = 0;
5184 		int i;
5185 
5186 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5187 			struct ublk_queue *ubq = ublk_get_queue(ub, i);
5188 
5189 			queues_cancelable += !!ubq_has_idle_io(ubq);
5190 		}
5191 
5192 		/*
5193 		 * Each queue needs at least one active command for
5194 		 * notifying the ublk server
5195 		 */
5196 		if (queues_cancelable == ub->dev_info.nr_hw_queues)
5197 			break;
5198 
5199 		msleep(UBLK_REQUEUE_DELAY_MS);
5200 		elapsed += UBLK_REQUEUE_DELAY_MS;
5201 	}
5202 
5203 	if (signal_pending(current))
5204 		ret = -EINTR;
5205 	else if (elapsed >= timeout_ms)
5206 		ret = -EBUSY;
5207 	else
5208 		ret = 0;
5209 
5210 	return ret;
5211 }
5212 
5213 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5214 				 const struct ublksrv_ctrl_cmd *header)
5215 {
5216 	/* zero means wait forever */
5217 	u64 timeout_ms = header->data[0];
5218 	struct gendisk *disk;
5219 	int ret = -ENODEV;
5220 
5221 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5222 		return -EOPNOTSUPP;
5223 
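	/*
	 * Quiesce flow: mark the device as canceling, wait until every hw
	 * queue has at least one idle IO, then cancel the pending uring_cmds.
	 */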
5224 	mutex_lock(&ub->mutex);
5225 	disk = ublk_get_disk(ub);
5226 	if (!disk)
5227 		goto unlock;
5228 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5229 		goto put_disk;
5230 
5231 	ret = 0;
5232 	/* already in expected state */
5233 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5234 		goto put_disk;
5235 
5236 	/* Mark the device as canceling */
5237 	mutex_lock(&ub->cancel_mutex);
5238 	blk_mq_quiesce_queue(disk->queue);
5239 	ublk_set_canceling(ub, true);
5240 	blk_mq_unquiesce_queue(disk->queue);
5241 	mutex_unlock(&ub->cancel_mutex);
5242 
5243 	if (!timeout_ms)
5244 		timeout_ms = UINT_MAX;
5245 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5246 
5247 put_disk:
5248 	ublk_put_disk(disk);
5249 unlock:
5250 	mutex_unlock(&ub->mutex);
5251 
5252 	/* Cancel pending uring_cmd */
5253 	if (!ret)
5254 		ublk_cancel_dev(ub);
5255 	return ret;
5256 }
5257 
5258 /*
5259  * All control commands are sent via /dev/ublk-control, so we have to check
5260  * the destination device's permission
5261  */
5262 static int ublk_char_dev_permission(struct ublk_device *ub,
5263 		const char *dev_path, int mask)
5264 {
5265 	int err;
5266 	struct path path;
5267 	struct kstat stat;
5268 
5269 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5270 	if (err)
5271 		return err;
5272 
5273 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5274 	if (err)
5275 		goto exit;
5276 
5277 	err = -EPERM;
5278 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5279 		goto exit;
5280 
5281 	err = inode_permission(&nop_mnt_idmap,
5282 			d_backing_inode(path.dentry), mask);
5283 exit:
5284 	path_put(&path);
5285 	return err;
5286 }
5287 
5288 /*
5289  * Lock for maple tree modification: acquire ub->mutex, then freeze queue
5290  * if device is started. If device is not yet started, only mutex is
5291  * needed since no I/O path can access the tree.
5292  *
5293  * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5294  * already holds ub->mutex when calling del_gendisk() which freezes the queue.
5295  */
5296 static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
5297 {
5298 	unsigned int memflags = 0;
5299 
5300 	mutex_lock(&ub->mutex);
5301 	if (ub->ub_disk)
5302 		memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
5303 
5304 	return memflags;
5305 }
5306 
5307 static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
5308 {
5309 	if (ub->ub_disk)
5310 		blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
5311 	mutex_unlock(&ub->mutex);
5312 }
5313 
5314 /* Erase coalesced PFN ranges from the maple tree matching buf_index */
5315 static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
5316 {
5317 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5318 	struct ublk_buf_range *range;
5319 
5320 	mas_lock(&mas);
5321 	mas_for_each(&mas, range, ULONG_MAX) {
5322 		if (range->buf_index == buf_index) {
5323 			mas_erase(&mas);
5324 			kfree(range);
5325 		}
5326 	}
5327 	mas_unlock(&mas);
5328 }
5329 
5330 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
5331 			       struct page **pages, unsigned long nr_pages,
5332 			       int index, unsigned short flags)
5333 {
5334 	unsigned long i;
5335 	int ret;
5336 
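	/*
	 * Coalesce physically contiguous pinned pages into single maple tree
	 * ranges keyed by PFN. Each range records the buffer index, the
	 * registration flags and the byte offset of its first page within
	 * the registered buffer (base_offset), so the I/O path can map a
	 * request page back to a position inside the shared memory buffer.
	 */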
5337 	for (i = 0; i < nr_pages; i++) {
5338 		unsigned long pfn = page_to_pfn(pages[i]);
5339 		unsigned long start = i;
5340 		struct ublk_buf_range *range;
5341 
5342 		/* Find run of consecutive PFNs */
5343 		while (i + 1 < nr_pages &&
5344 		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
5345 			i++;
5346 
5347 		range = kzalloc(sizeof(*range), GFP_KERNEL);
5348 		if (!range) {
5349 			ret = -ENOMEM;
5350 			goto unwind;
5351 		}
5352 		range->buf_index = index;
5353 		range->flags = flags;
5354 		range->base_offset = start << PAGE_SHIFT;
5355 
5356 		ret = mtree_insert_range(&ub->buf_tree, pfn,
5357 					 pfn + (i - start),
5358 					 range, GFP_KERNEL);
5359 		if (ret) {
5360 			kfree(range);
5361 			goto unwind;
5362 		}
5363 	}
5364 	return 0;
5365 
5366 unwind:
5367 	ublk_buf_erase_ranges(ub, index);
5368 	return ret;
5369 }
5370 
5371 /*
5372  * Register a shared memory buffer for zero-copy I/O.
5373  * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
5374  * internally. Returns buffer index (>= 0) on success.
5375  */
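/*
 * Illustrative only: a ublk server would issue this through the
 * /dev/ublk-control uring_cmd path, roughly as:
 *
 *	struct ublk_shmem_buf_reg reg = {
 *		.addr	= (__u64)(uintptr_t)shmem_base,	// placeholder, page aligned
 *		.len	= shmem_len,	// placeholder, page aligned, <= UBLK_SHMEM_BUF_SIZE_MAX
 *		.flags	= 0,		// or UBLK_SHMEM_BUF_READ_ONLY
 *	};
 *
 * with header->addr pointing at &reg, header->len = sizeof(reg) and
 * cmd_op UBLK_U_CMD_REG_BUF; the non-negative return value is the buffer
 * index to pass to UBLK_U_CMD_UNREG_BUF later.
 */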
5376 static int ublk_ctrl_reg_buf(struct ublk_device *ub,
5377 			     struct ublksrv_ctrl_cmd *header)
5378 {
5379 	void __user *argp = (void __user *)(unsigned long)header->addr;
5380 	struct ublk_shmem_buf_reg buf_reg;
5381 	unsigned long nr_pages;
5382 	struct page **pages = NULL;
5383 	unsigned int gup_flags;
5384 	unsigned int memflags;
5385 	long pinned;
5386 	int index;
5387 	int ret;
5388 
5389 	if (!ublk_dev_support_shmem_zc(ub))
5390 		return -EOPNOTSUPP;
5391 
5392 	memset(&buf_reg, 0, sizeof(buf_reg));
5393 	if (copy_from_user(&buf_reg, argp,
5394 			   min_t(size_t, header->len, sizeof(buf_reg))))
5395 		return -EFAULT;
5396 
5397 	if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
5398 		return -EINVAL;
5399 
5400 	if (buf_reg.reserved)
5401 		return -EINVAL;
5402 
5403 	if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
5404 	    !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
5405 		return -EINVAL;
5406 
5407 	nr_pages = buf_reg.len >> PAGE_SHIFT;
5408 
5409 	/* Pin pages before any locks (may sleep) */
5410 	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
5411 	if (!pages)
5412 		return -ENOMEM;
5413 
5414 	gup_flags = FOLL_LONGTERM;
5415 	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
5416 		gup_flags |= FOLL_WRITE;
5417 
5418 	pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
5419 	if (pinned < 0) {
5420 		ret = pinned;
5421 		goto err_free_pages;
5422 	}
5423 	if (pinned != nr_pages) {
5424 		ret = -EFAULT;
5425 		goto err_unpin;
5426 	}
5427 
5428 	memflags = ublk_lock_buf_tree(ub);
5429 
5430 	index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
5431 	if (index < 0) {
5432 		ret = index;
5433 		goto err_unlock;
5434 	}
5435 
5436 	ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
5437 	if (ret) {
5438 		ida_free(&ub->buf_ida, index);
5439 		goto err_unlock;
5440 	}
5441 
5442 	ublk_unlock_buf_tree(ub, memflags);
5443 	kvfree(pages);
5444 	return index;
5445 
5446 err_unlock:
5447 	ublk_unlock_buf_tree(ub, memflags);
5448 err_unpin:
5449 	unpin_user_pages(pages, pinned);
5450 err_free_pages:
5451 	kvfree(pages);
5452 	return ret;
5453 }
5454 
5455 static void ublk_unpin_range_pages(unsigned long base_pfn,
5456 				   unsigned long nr_pages)
5457 {
5458 #define UBLK_UNPIN_BATCH	32
5459 	struct page *pages[UBLK_UNPIN_BATCH];
5460 	unsigned long off;
5461 
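	/*
	 * Convert the PFN run back to struct page pointers in fixed-size
	 * batches so no large temporary array is needed for big buffers.
	 */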
5462 	for (off = 0; off < nr_pages; ) {
5463 		unsigned int batch = min_t(unsigned long,
5464 					   nr_pages - off, UBLK_UNPIN_BATCH);
5465 		unsigned int j;
5466 
5467 		for (j = 0; j < batch; j++)
5468 			pages[j] = pfn_to_page(base_pfn + off + j);
5469 		unpin_user_pages(pages, batch);
5470 		off += batch;
5471 	}
5472 }
5473 
5474 /*
5475  * Inner loop: erase up to UBLK_REMOVE_BATCH matching ranges under
5476  * mas_lock, collecting them into an xarray. Then drop the lock and
5477  * unpin pages + free ranges outside spinlock context.
5478  *
5479  * Returns true if the tree walk completed, false if more ranges remain.
5480  * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value().
5481  */
5482 #define UBLK_REMOVE_BATCH	64
5483 
5484 static bool __ublk_shmem_remove_ranges(struct ublk_device *ub,
5485 					int buf_index, int *ret)
5486 {
5487 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5488 	struct ublk_buf_range *range;
5489 	struct xarray to_unpin;
5490 	unsigned long idx;
5491 	unsigned int count = 0;
5492 	bool done = false;
5493 	void *entry;
5494 
5495 	xa_init(&to_unpin);
5496 
5497 	mas_lock(&mas);
5498 	mas_for_each(&mas, range, ULONG_MAX) {
5499 		unsigned long nr;
5500 
5501 		if (buf_index >= 0 && range->buf_index != buf_index)
5502 			continue;
5503 
5504 		*ret = 0;
5505 		nr = mas.last - mas.index + 1;
5506 		if (xa_err(xa_store(&to_unpin, mas.index,
5507 				    xa_mk_value(nr), GFP_ATOMIC)))
5508 			goto unlock;
5509 		mas_erase(&mas);
5510 		kfree(range);
5511 		if (++count >= UBLK_REMOVE_BATCH)
5512 			goto unlock;
5513 	}
5514 	done = true;
5515 unlock:
5516 	mas_unlock(&mas);
5517 
5518 	xa_for_each(&to_unpin, idx, entry)
5519 		ublk_unpin_range_pages(idx, xa_to_value(entry));
5520 	xa_destroy(&to_unpin);
5521 
5522 	return done;
5523 }
5524 
5525 /*
5526  * Remove ranges from the maple tree matching buf_index, unpin pages
5527  * and free range structs. If buf_index < 0, remove all ranges.
5528  * Processes ranges in batches to avoid holding the maple tree spinlock
5529  * across potentially expensive page unpinning.
5530  */
5531 static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index)
5532 {
5533 	int ret = -ENOENT;
5534 
5535 	while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret))
5536 		cond_resched();
5537 	return ret;
5538 }
5539 
5540 static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
5541 			       struct ublksrv_ctrl_cmd *header)
5542 {
5543 	int index = (int)header->data[0];
5544 	unsigned int memflags;
5545 	int ret;
5546 
5547 	if (!ublk_dev_support_shmem_zc(ub))
5548 		return -EOPNOTSUPP;
5549 
5550 	if (index < 0 || index > USHRT_MAX)
5551 		return -EINVAL;
5552 
5553 	memflags = ublk_lock_buf_tree(ub);
5554 
5555 	ret = ublk_shmem_remove_ranges(ub, index);
5556 	if (!ret)
5557 		ida_free(&ub->buf_ida, index);
5558 
5559 	ublk_unlock_buf_tree(ub, memflags);
5560 	return ret;
5561 }
5562 
5563 static void ublk_buf_cleanup(struct ublk_device *ub)
5564 {
5565 	ublk_shmem_remove_ranges(ub, -1);
5566 	mtree_destroy(&ub->buf_tree);
5567 	ida_destroy(&ub->buf_ida);
5568 }
5569 
5570 /* Check if request pages match a registered shared memory buffer */
5571 static bool ublk_try_buf_match(struct ublk_device *ub,
5572 				   struct request *rq,
5573 				   u32 *buf_idx, u32 *buf_off)
5574 {
5575 	struct req_iterator iter;
5576 	struct bio_vec bv;
5577 	int index = -1;
5578 	unsigned long expected_offset = 0;
5579 	bool first = true;
5580 
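	/*
	 * Every bvec of the request must fall inside a registered range, all
	 * ranges must belong to the same buffer index, and the computed
	 * buffer offsets must be contiguous, so the whole request maps to
	 * one linear region of a single shared memory buffer.
	 */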
5581 	rq_for_each_bvec(bv, rq, iter) {
5582 		unsigned long pfn = page_to_pfn(bv.bv_page);
5583 		unsigned long end_pfn = pfn +
5584 			((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
5585 		struct ublk_buf_range *range;
5586 		unsigned long off;
5587 		MA_STATE(mas, &ub->buf_tree, pfn, pfn);
5588 
5589 		range = mas_walk(&mas);
5590 		if (!range)
5591 			return false;
5592 
5593 		/* verify all pages in this bvec fall within the range */
5594 		if (end_pfn > mas.last)
5595 			return false;
5596 
5597 		off = range->base_offset +
5598 			(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
5599 
5600 		if (first) {
5601 			/* Read-only buffer can only serve WRITE (kernel would write to it otherwise) */
5602 			if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
5603 			    req_op(rq) != REQ_OP_WRITE)
5604 				return false;
5605 			index = range->buf_index;
5606 			expected_offset = off;
5607 			*buf_off = off;
5608 			first = false;
5609 		} else {
5610 			if (range->buf_index != index)
5611 				return false;
5612 			if (off != expected_offset)
5613 				return false;
5614 		}
5615 		expected_offset += bv.bv_len;
5616 	}
5617 
5618 	if (first)
5619 		return false;
5620 
5621 	*buf_idx = index;
5622 	return true;
5623 }
5624 
5625 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5626 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5627 {
5628 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5629 	void __user *argp = (void __user *)(unsigned long)header->addr;
5630 	char *dev_path = NULL;
5631 	int ret = 0;
5632 	int mask;
5633 
5634 	if (!unprivileged) {
5635 		if (!capable(CAP_SYS_ADMIN))
5636 			return -EPERM;
5637 		/*
5638 		 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
5639 		 * char_dev_path in its payload too, since userspace may not
5640 		 * know whether the specified device was created in
5641 		 * unprivileged mode.
5642 		 */
5643 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5644 			return 0;
5645 	}
5646 
5647 	/*
5648 	 * User has to provide the char device path for unprivileged ublk
5649 	 *
5650 	 * header->addr always points to the dev path buffer, and
5651 	 * header->dev_path_len records the length of the dev path buffer.
5652 	 */
5653 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5654 		return -EINVAL;
5655 
5656 	if (header->len < header->dev_path_len)
5657 		return -EINVAL;
5658 
5659 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5660 	if (IS_ERR(dev_path))
5661 		return PTR_ERR(dev_path);
5662 
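	/*
	 * Map the control command to the access it needs on the char device
	 * node: query commands require read permission, state-changing
	 * commands require read and write.
	 */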
5663 	ret = -EINVAL;
5664 	switch (_IOC_NR(cmd_op)) {
5665 	case UBLK_CMD_GET_DEV_INFO:
5666 	case UBLK_CMD_GET_DEV_INFO2:
5667 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5668 	case UBLK_CMD_GET_PARAMS:
5669 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5670 		mask = MAY_READ;
5671 		break;
5672 	case UBLK_CMD_START_DEV:
5673 	case UBLK_CMD_STOP_DEV:
5674 	case UBLK_CMD_ADD_DEV:
5675 	case UBLK_CMD_DEL_DEV:
5676 	case UBLK_CMD_SET_PARAMS:
5677 	case UBLK_CMD_START_USER_RECOVERY:
5678 	case UBLK_CMD_END_USER_RECOVERY:
5679 	case UBLK_CMD_UPDATE_SIZE:
5680 	case UBLK_CMD_QUIESCE_DEV:
5681 	case UBLK_CMD_TRY_STOP_DEV:
5682 	case UBLK_CMD_REG_BUF:
5683 	case UBLK_CMD_UNREG_BUF:
5684 		mask = MAY_READ | MAY_WRITE;
5685 		break;
5686 	default:
5687 		goto exit;
5688 	}
5689 
5690 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5691 	if (!ret) {
5692 		header->len -= header->dev_path_len;
5693 		header->addr += header->dev_path_len;
5694 	}
5695 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5696 			__func__, ub->ub_number, cmd_op,
5697 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5698 			dev_path, ret);
5699 exit:
5700 	kfree(dev_path);
5701 	return ret;
5702 }
5703 
5704 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5705 {
5706 	switch (_IOC_NR(cmd_op)) {
5707 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5708 	case UBLK_CMD_GET_DEV_INFO:
5709 	case UBLK_CMD_GET_DEV_INFO2:
5710 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5711 		return false;
5712 	default:
5713 		return true;
5714 	}
5715 }
5716 
5717 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5718 		unsigned int issue_flags)
5719 {
5720 	/* May point to userspace-mapped memory */
5721 	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5722 								    struct ublksrv_ctrl_cmd);
5723 	struct ublksrv_ctrl_cmd header;
5724 	struct ublk_device *ub = NULL;
5725 	u32 cmd_op = cmd->cmd_op;
5726 	int ret = -EINVAL;
5727 
5728 	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5729 	    issue_flags & IO_URING_F_NONBLOCK)
5730 		return -EAGAIN;
5731 
5732 	if (!(issue_flags & IO_URING_F_SQE128))
5733 		return -EINVAL;
5734 
5735 	header.dev_id = READ_ONCE(ub_src->dev_id);
5736 	header.queue_id = READ_ONCE(ub_src->queue_id);
5737 	header.len = READ_ONCE(ub_src->len);
5738 	header.addr = READ_ONCE(ub_src->addr);
5739 	header.data[0] = READ_ONCE(ub_src->data[0]);
5740 	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5741 	ublk_ctrl_cmd_dump(cmd_op, &header);
5742 
5743 	ret = ublk_check_cmd_op(cmd_op);
5744 	if (ret)
5745 		goto out;
5746 
5747 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5748 		ret = ublk_ctrl_get_features(&header);
5749 		goto out;
5750 	}
5751 
5752 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5753 		ret = -ENODEV;
5754 		ub = ublk_get_device_from_id(header.dev_id);
5755 		if (!ub)
5756 			goto out;
5757 
5758 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5759 		if (ret)
5760 			goto put_dev;
5761 	}
5762 
5763 	switch (_IOC_NR(cmd_op)) {
5764 	case UBLK_CMD_START_DEV:
5765 		ret = ublk_ctrl_start_dev(ub, &header);
5766 		break;
5767 	case UBLK_CMD_STOP_DEV:
5768 		ublk_ctrl_stop_dev(ub);
5769 		ret = 0;
5770 		break;
5771 	case UBLK_CMD_GET_DEV_INFO:
5772 	case UBLK_CMD_GET_DEV_INFO2:
5773 		ret = ublk_ctrl_get_dev_info(ub, &header);
5774 		break;
5775 	case UBLK_CMD_ADD_DEV:
5776 		ret = ublk_ctrl_add_dev(&header);
5777 		break;
5778 	case UBLK_CMD_DEL_DEV:
5779 		ret = ublk_ctrl_del_dev(&ub, true);
5780 		break;
5781 	case UBLK_CMD_DEL_DEV_ASYNC:
5782 		ret = ublk_ctrl_del_dev(&ub, false);
5783 		break;
5784 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5785 		ret = ublk_ctrl_get_queue_affinity(ub, &header);
5786 		break;
5787 	case UBLK_CMD_GET_PARAMS:
5788 		ret = ublk_ctrl_get_params(ub, &header);
5789 		break;
5790 	case UBLK_CMD_SET_PARAMS:
5791 		ret = ublk_ctrl_set_params(ub, &header);
5792 		break;
5793 	case UBLK_CMD_START_USER_RECOVERY:
5794 		ret = ublk_ctrl_start_recovery(ub);
5795 		break;
5796 	case UBLK_CMD_END_USER_RECOVERY:
5797 		ret = ublk_ctrl_end_recovery(ub, &header);
5798 		break;
5799 	case UBLK_CMD_UPDATE_SIZE:
5800 		ret = ublk_ctrl_set_size(ub, &header);
5801 		break;
5802 	case UBLK_CMD_QUIESCE_DEV:
5803 		ret = ublk_ctrl_quiesce_dev(ub, &header);
5804 		break;
5805 	case UBLK_CMD_TRY_STOP_DEV:
5806 		ret = ublk_ctrl_try_stop_dev(ub);
5807 		break;
5808 	case UBLK_CMD_REG_BUF:
5809 		ret = ublk_ctrl_reg_buf(ub, &header);
5810 		break;
5811 	case UBLK_CMD_UNREG_BUF:
5812 		ret = ublk_ctrl_unreg_buf(ub, &header);
5813 		break;
5814 	default:
5815 		ret = -EOPNOTSUPP;
5816 		break;
5817 	}
5818 
5819  put_dev:
5820 	if (ub)
5821 		ublk_put_device(ub);
5822  out:
5823 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5824 			__func__, ret, cmd_op, header.dev_id, header.queue_id);
5825 	return ret;
5826 }
5827 
5828 static const struct file_operations ublk_ctl_fops = {
5829 	.open		= nonseekable_open,
5830 	.uring_cmd      = ublk_ctrl_uring_cmd,
5831 	.owner		= THIS_MODULE,
5832 	.llseek		= noop_llseek,
5833 };
5834 
5835 static struct miscdevice ublk_misc = {
5836 	.minor		= MISC_DYNAMIC_MINOR,
5837 	.name		= "ublk-control",
5838 	.fops		= &ublk_ctl_fops,
5839 };
5840 
5841 static int __init ublk_init(void)
5842 {
5843 	int ret;
5844 
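	/* Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE cannot wrap around in u64 */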
5845 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5846 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5847 	/*
5848 	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5849 	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5850 	 */
5851 	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5852 		     UBLKSRV_IO_INTEGRITY_FLAG);
5853 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5854 
5855 	init_waitqueue_head(&ublk_idr_wq);
5856 
5857 	ret = misc_register(&ublk_misc);
5858 	if (ret)
5859 		return ret;
5860 
5861 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5862 	if (ret)
5863 		goto unregister_mis;
5864 
5865 	ret = class_register(&ublk_chr_class);
5866 	if (ret)
5867 		goto free_chrdev_region;
5868 
5869 	return 0;
5870 
5871 free_chrdev_region:
5872 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5873 unregister_mis:
5874 	misc_deregister(&ublk_misc);
5875 	return ret;
5876 }
5877 
5878 static void __exit ublk_exit(void)
5879 {
5880 	struct ublk_device *ub;
5881 	int id;
5882 
5883 	idr_for_each_entry(&ublk_index_idr, ub, id)
5884 		ublk_remove(ub);
5885 
5886 	class_unregister(&ublk_chr_class);
5887 	misc_deregister(&ublk_misc);
5888 
5889 	idr_destroy(&ublk_index_idr);
5890 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5891 }
5892 
5893 module_init(ublk_init);
5894 module_exit(ublk_exit);
5895 
5896 static int ublk_set_max_unprivileged_ublks(const char *buf,
5897 					   const struct kernel_param *kp)
5898 {
5899 	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5900 }
5901 
5902 static int ublk_get_max_unprivileged_ublks(char *buf,
5903 					   const struct kernel_param *kp)
5904 {
5905 	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5906 }
5907 
5908 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5909 	.set = ublk_set_max_unprivileged_ublks,
5910 	.get = ublk_get_max_unprivileged_ublks,
5911 };
5912 
5913 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5914 		&unprivileged_ublks_max, 0644);
5915 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add (default: 64)");
5916 
5917 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5918 MODULE_DESCRIPTION("Userspace block device");
5919 MODULE_LICENSE("GPL");
5920