xref: /linux/drivers/block/ublk_drv.c (revision ba9c792c824fff732df85119011d399d9b6d9155)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
 * Userspace block device - a block device whose IO is handled from userspace
 *
 * Makes full use of io_uring passthrough commands for communicating with
 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <linux/maple_tree.h>
50 #include <linux/xarray.h>
51 #include <uapi/linux/fs.h>
52 #include <uapi/linux/ublk_cmd.h>
53 
/* Size of the minor number space: one minor per MINORBITS-wide value */
#define UBLK_MINORS		(1U << MINORBITS)

/* Sentinel buffer index: the all-ones u16 value */
#define UBLK_INVALID_BUF_IDX 	((u16)-1)
57 
/*
 * Private mirrors of ioctl-encoded control commands: each one is the bare
 * command number extracted with _IOC_NR() from the matching UBLK_U_CMD_*.
 */
#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
#define UBLK_CMD_REG_BUF	_IOC_NR(UBLK_U_CMD_REG_BUF)
#define UBLK_CMD_UNREG_BUF	_IOC_NR(UBLK_U_CMD_UNREG_BUF)

/* Default max shmem buffer size: 4GB (may be increased in future) */
#define UBLK_SHMEM_BUF_SIZE_MAX	(1ULL << 32)

/* Bare command numbers of the io-buffer (un)register io commands */
#define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
71 
/*
 * All UBLK_F_* have to be included into UBLK_F_ALL.
 *
 * UBLK_F_INTEGRITY is part of the mask only when the kernel is built with
 * CONFIG_BLK_DEV_INTEGRITY; otherwise that term evaluates to 0.
 */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
		| UBLK_F_URING_CMD_COMP_IN_TASK \
		| UBLK_F_NEED_GET_DATA \
		| UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED \
		| UBLK_F_USER_RECOVERY_FAIL_IO \
		| UBLK_F_UPDATE_SIZE \
		| UBLK_F_AUTO_BUF_REG \
		| UBLK_F_QUIESCE \
		| UBLK_F_PER_IO_DAEMON \
		| UBLK_F_BUF_REG_OFF_DAEMON \
		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
		| UBLK_F_SAFE_STOP_DEV \
		| UBLK_F_BATCH_IO \
		| UBLK_F_NO_AUTO_PART_SCAN \
		| UBLK_F_SHMEM_ZC)
93 
/* Union of the three user-recovery related device flags */
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_USER_RECOVERY_FAIL_IO)
97 
/* All UBLK_PARAM_TYPE_* should be included here: mask of every known type */
#define UBLK_PARAM_TYPE_ALL                                \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
	 UBLK_PARAM_TYPE_INTEGRITY)
104 
/* Mask of every UBLK_BATCH_F_* flag listed below */
#define UBLK_BATCH_F_ALL  \
	(UBLK_BATCH_F_HAS_ZONE_LBA | \
	 UBLK_BATCH_F_HAS_BUF_ADDR | \
	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
109 
/*
 * ublk batch fetch uring_cmd
 *
 * Wraps one fetch uring_cmd; allocated by ublk_batch_alloc_fcmd() and
 * released by ublk_batch_free_fcmd().
 */
struct ublk_batch_fetch_cmd {
	/* list linkage; unlinked with list_del_init() on teardown */
	struct list_head node;
	/* the io_uring command wrapped by this context */
	struct io_uring_cmd *cmd;
	/* copy of sqe->buf_index read once when the context is allocated */
	unsigned short buf_group;
};
116 
/*
 * Driver-private data attached to a ublk uring_cmd.
 * NOTE(review): presumably stored in the io_uring_cmd pdu area - confirm
 * against the accessor, which is outside this chunk.
 */
struct ublk_uring_cmd_pdu {
	/*
	 * Store requests in same batch temporarily for queuing them to
	 * daemon context.
	 *
	 * It should have been stored to request payload, but we do want
	 * to avoid extra pre-allocation, and uring_cmd payload is always
	 * free for us
	 */
	union {
		struct request *req;
		struct request *req_list;
	};

	/*
	 * The following two are valid in this cmd whole lifetime, and
	 * setup in ublk uring_cmd handler
	 */
	struct ublk_queue *ubq;

	/* tag and fcmd share storage; which one is valid depends on the mode */
	union {
		u16 tag;
		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
	};
};
142 
/*
 * Context bundle passed by pointer (const) to the batch helpers such as
 * ublk_batch_dispatch() and ublk_batch_deinit_fetch_buf().
 */
struct ublk_batch_io_data {
	struct ublk_device *ub;
	struct io_uring_cmd *cmd;
	struct ublk_batch_io header;
	/* forwarded to io_uring_cmd_done() when a fetch cmd is completed */
	unsigned int issue_flags;
	struct io_comp_batch *iob;
};
150 
/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by the ublk driver and is
 * waiting for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command will be completed and is owned
 * by the ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Essentially mutually exclusive with UBLK_IO_FLAG_ACTIVE, so the two
 * flags can be used to cross-check each other.
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command needs to get
 * the data buffer address from ublksrv.
 *
 * Then, bio data could be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 */
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08

/*
 * request buffer is registered automatically, so we have to unregister it
 * before completing this request.
 *
 * io_uring will unregister buffer automatically for us during exiting.
 */
#define UBLK_IO_FLAG_AUTO_BUF_REG 	0x10

/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED	0x80000000

/*
 * Initialize refcount to a large number to include any registered buffers.
 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
 * any buffers registered on the io daemon task.
 */
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)

/* used for UBLK_F_BATCH_IO only: all-ones unsigned short as "no tag" marker */
#define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
200 
/*
 * Per-io buffer description: either a plain 64-bit address (copied into
 * iod->addr by ublk_init_iod()) or auto buffer registration data.
 */
union ublk_io_buf {
	__u64	addr;
	struct ublk_auto_buf_reg auto_reg;
};
205 
/* Per-tag io slot of a ublk queue (element type of ublk_queue::ios[]) */
struct ublk_io {
	/* buffer address or auto-registration data, see union ublk_io_buf */
	union ublk_io_buf buf;
	/* UBLK_IO_FLAG_* bits */
	unsigned int flags;
	int res;

	union {
		/* valid if UBLK_IO_FLAG_ACTIVE is set */
		struct io_uring_cmd *cmd;
		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
		struct request *req;
	};

	/* task compared against current when picking the refcount path */
	struct task_struct *task;

	/*
	 * The number of uses of this I/O by the ublk server
	 * if user copy or zero copy are enabled:
	 * - UBLK_REFCOUNT_INIT from dispatch to the server
	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
	 * - 1 for each io_uring registered buffer not registered on task
	 * The I/O can only be completed once all references are dropped.
	 * User copy and buffer registration operations are only permitted
	 * if the reference count is nonzero.
	 */
	refcount_t ref;
	/* Count of buffers registered on task and not yet unregistered */
	unsigned task_registered_buffers;

	void *buf_ctx_handle;
	/* taken and released via ublk_io_lock() / ublk_io_unlock() */
	spinlock_t lock;
} ____cacheline_aligned_in_smp;
238 
/* Per hardware queue state of a ublk device */
struct ublk_queue {
	int q_id;
	/* number of entries in ios[] (see __counted_by below) */
	int q_depth;

	/* UBLK_F_* bits, tested by the ublk_support_*() queue helpers */
	unsigned long flags;
	/* io descriptor array, indexed by tag (see ublk_get_iod()) */
	struct ublksrv_io_desc *io_cmd_buf;

	bool force_abort;
	bool canceling;
	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
	spinlock_t		cancel_lock;
	/* owning device */
	struct ublk_device *dev;
	u32 nr_io_ready;

	/*
	 * For supporting UBLK_F_BATCH_IO only.
	 *
	 * Inflight ublk request tag is saved in this fifo
	 *
	 * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(),
	 * so lock is required for storing request tag to fifo
	 *
	 * Make sure just one reader for fetching request from task work
	 * function to ublk server, so no need to grab the lock in reader
	 * side.
	 *
	 * Batch I/O State Management:
	 *
	 * The batch I/O system uses implicit state management based on the
	 * combination of three key variables below.
	 *
	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
	 *   No fetch commands available, events queue in evts_fifo
	 *
	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
	 *   Fetch commands available but none processing events
	 *
	 * - ACTIVE: active_fcmd
	 *   One fetch command actively processing events from evts_fifo
	 *
	 * Key Invariants:
	 * - At most one active_fcmd at any time (single reader)
	 * - active_fcmd is always from fcmd_head list when non-NULL
	 * - evts_fifo can be read locklessly by the single active reader
	 * - All state transitions require evts_lock protection
	 * - Multiple writers to evts_fifo require lock protection
	 */
	struct {
		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
		spinlock_t evts_lock;

		/* List of fetch commands available to process events */
		struct list_head fcmd_head;

		/* Currently active fetch command (NULL = none active) */
		struct ublk_batch_fetch_cmd  *active_fcmd;
	}____cacheline_aligned_in_smp;

	/* per-tag io slots, indexed by request tag */
	struct ublk_io ios[] __counted_by(q_depth);
};
299 
/* Maple tree value: maps a PFN range to buffer location */
struct ublk_buf_range {
	/* index of the buffer this range belongs to */
	unsigned short buf_index;
	unsigned short flags;
	unsigned int base_offset;	/* byte offset within buffer */
};
306 
/* Top-level state of one ublk device */
struct ublk_device {
	/* block-layer disk; capacity, read-only state and nr_zones are set on it */
	struct gendisk		*ub_disk;

	/* device info; dev_info.flags holds the UBLK_F_* feature bits */
	struct ublksrv_ctrl_dev_info	dev_info;

	struct blk_mq_tag_set	tag_set;

	struct cdev		cdev;
	struct device		cdev_dev;

/* bit numbers used in ->state */
#define UB_STATE_OPEN		0
#define UB_STATE_USED		1
#define UB_STATE_DELETED	2
	unsigned long		state;
	int			ub_number;

	struct mutex		mutex;

	spinlock_t		lock;
	struct mm_struct	*mm;

	/* device parameters, checked by ublk_validate_params() */
	struct ublk_params	params;

	struct completion	completion;
	u32			nr_queue_ready;
	bool 			unprivileged_daemons;
	struct mutex cancel_mutex;
	bool canceling;
	pid_t 	ublksrv_tgid;
	struct delayed_work	exit_work;
	struct work_struct	partition_scan_work;

	bool			block_open; /* protected by open_mutex */

	/* shared memory zero copy */
	struct maple_tree	buf_tree;
	struct ida		buf_ida;

	/* per-queue state pointers (flexible array) */
	struct ublk_queue       *queues[];
};
347 
/* header of ublk_params: the leading length and type-mask fields */
struct ublk_params_header {
	__u32	len;
	__u32	types;
};
353 
354 static void ublk_io_release(void *priv);
355 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
356 static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
357 				  u32 *buf_idx, u32 *buf_off);
358 static void ublk_buf_cleanup(struct ublk_device *ub);
359 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
360 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
361 		u16 q_id, u16 tag, struct ublk_io *io);
362 static void ublk_batch_dispatch(struct ublk_queue *ubq,
363 				const struct ublk_batch_io_data *data,
364 				struct ublk_batch_fetch_cmd *fcmd);
365 
366 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
367 {
368 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
369 }
370 
371 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
372 {
373 	return ubq->flags & UBLK_F_BATCH_IO;
374 }
375 
376 static inline void ublk_io_lock(struct ublk_io *io)
377 {
378 	spin_lock(&io->lock);
379 }
380 
381 static inline void ublk_io_unlock(struct ublk_io *io)
382 {
383 	spin_unlock(&io->lock);
384 }
385 
386 /* Initialize the event queue */
387 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
388 				    int numa_node)
389 {
390 	spin_lock_init(&q->evts_lock);
391 	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
392 }
393 
394 /* Check if event queue is empty */
395 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
396 {
397 	return kfifo_is_empty(&q->evts_fifo);
398 }
399 
/*
 * Free the event fifo of @q. The fifo is expected to be drained already;
 * a one-time warning is emitted if it still holds entries.
 */
static inline void ublk_io_evts_deinit(struct ublk_queue *q)
{
	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
	kfifo_free(&q->evts_fifo);
}
405 
406 static inline struct ublksrv_io_desc *
407 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
408 {
409 	return &ubq->io_cmd_buf[tag];
410 }
411 
412 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
413 {
414 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
415 }
416 
417 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
418 {
419 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
420 }
421 
422 static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
423 {
424 	return ubq->flags & UBLK_F_SHMEM_ZC;
425 }
426 
427 static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
428 					unsigned int tag)
429 {
430 	return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
431 }
432 
433 static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
434 {
435 	return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
436 }
437 
438 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
439 {
440 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
441 }
442 
443 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
444 {
445 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
446 }
447 
448 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
449 {
450 	return ubq->flags & UBLK_F_USER_COPY;
451 }
452 
453 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
454 {
455 	return ub->dev_info.flags & UBLK_F_USER_COPY;
456 }
457 
458 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
459 {
460 	return ub->dev_info.flags & UBLK_F_ZONED;
461 }
462 
463 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
464 {
465 	return ubq->flags & UBLK_F_ZONED;
466 }
467 
468 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
469 {
470 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
471 }
472 
473 static inline unsigned int ublk_req_build_flags(struct request *req)
474 {
475 	unsigned flags = 0;
476 
477 	if (req->cmd_flags & REQ_FAILFAST_DEV)
478 		flags |= UBLK_IO_F_FAILFAST_DEV;
479 
480 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
481 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
482 
483 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
484 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
485 
486 	if (req->cmd_flags & REQ_META)
487 		flags |= UBLK_IO_F_META;
488 
489 	if (req->cmd_flags & REQ_FUA)
490 		flags |= UBLK_IO_F_FUA;
491 
492 	if (req->cmd_flags & REQ_NOUNMAP)
493 		flags |= UBLK_IO_F_NOUNMAP;
494 
495 	if (req->cmd_flags & REQ_SWAP)
496 		flags |= UBLK_IO_F_SWAP;
497 
498 	if (blk_integrity_rq(req))
499 		flags |= UBLK_IO_F_INTEGRITY;
500 
501 	return flags;
502 }
503 
504 static void ublk_init_iod(struct ublk_queue *ubq, struct request *req,
505 			  uint8_t ublk_op, uint32_t nr_sectors,
506 			  uint64_t start_sector)
507 {
508 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
509 	struct ublk_io *io = &ubq->ios[req->tag];
510 
511 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
512 	iod->nr_sectors = nr_sectors;
513 	iod->start_sector = start_sector;
514 
515 	/* Try shmem zero-copy match before setting addr */
516 	if (ublk_support_shmem_zc(ubq) && blk_rq_has_data(req)) {
517 		u32 buf_idx, buf_off;
518 
519 		if (ublk_try_buf_match(ubq->dev, req, &buf_idx, &buf_off)) {
520 			iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
521 			iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
522 			return;
523 		}
524 	}
525 
526 	iod->addr = io->buf.addr;
527 }
528 
529 #ifdef CONFIG_BLK_DEV_ZONED
530 
/*
 * Side-band description of a driver-private zoned request; looked up by
 * request pointer when the io descriptor is built.
 */
struct ublk_zoned_report_desc {
	/* start sector passed on to the io descriptor */
	__u64 sector;
	/* UBLK_IO_OP_* code; only UBLK_IO_OP_REPORT_ZONES is accepted */
	__u32 operation;
	/* number of zones requested, passed on as the descriptor's nr_sectors */
	__u32 nr_zones;
};
536 
/* Report descriptors, keyed by the request pointer cast to unsigned long */
static DEFINE_XARRAY(ublk_zoned_report_descs);
538 
539 static int ublk_zoned_insert_report_desc(const struct request *req,
540 		struct ublk_zoned_report_desc *desc)
541 {
542 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
543 			    desc, GFP_KERNEL);
544 }
545 
546 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
547 		const struct request *req)
548 {
549 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
550 }
551 
552 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
553 		const struct request *req)
554 {
555 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
556 }
557 
558 static int ublk_get_nr_zones(const struct ublk_device *ub)
559 {
560 	const struct ublk_param_basic *p = &ub->params.basic;
561 
562 	/* Zone size is a power of 2 */
563 	return p->dev_sectors >> ilog2(p->chunk_sectors);
564 }
565 
566 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
567 {
568 	return blk_revalidate_disk_zones(ub->ub_disk);
569 }
570 
571 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
572 {
573 	const struct ublk_param_zoned *p = &ub->params.zoned;
574 	int nr_zones;
575 
576 	if (!ublk_dev_is_zoned(ub))
577 		return -EINVAL;
578 
579 	if (!p->max_zone_append_sectors)
580 		return -EINVAL;
581 
582 	nr_zones = ublk_get_nr_zones(ub);
583 
584 	if (p->max_active_zones > nr_zones)
585 		return -EINVAL;
586 
587 	if (p->max_open_zones > nr_zones)
588 		return -EINVAL;
589 
590 	return 0;
591 }
592 
593 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
594 {
595 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
596 }
597 
598 /* Based on virtblk_alloc_report_buffer */
599 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
600 				      unsigned int nr_zones, size_t *buflen)
601 {
602 	struct request_queue *q = ublk->ub_disk->queue;
603 	size_t bufsize;
604 	void *buf;
605 
606 	nr_zones = min_t(unsigned int, nr_zones,
607 			 ublk->ub_disk->nr_zones);
608 
609 	bufsize = nr_zones * sizeof(struct blk_zone);
610 	bufsize =
611 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
612 
613 	while (bufsize >= sizeof(struct blk_zone)) {
614 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
615 		if (buf) {
616 			*buflen = bufsize;
617 			return buf;
618 		}
619 		bufsize >>= 1;
620 	}
621 
622 	*buflen = 0;
623 	return NULL;
624 }
625 
626 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
627 		      unsigned int nr_zones, struct blk_report_zones_args *args)
628 {
629 	struct ublk_device *ub = disk->private_data;
630 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
631 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
632 	unsigned int done_zones = 0;
633 	unsigned int max_zones_per_request;
634 	int ret;
635 	struct blk_zone *buffer;
636 	size_t buffer_length;
637 
638 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
639 			 nr_zones);
640 
641 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
642 	if (!buffer)
643 		return -ENOMEM;
644 
645 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
646 
647 	while (done_zones < nr_zones) {
648 		unsigned int remaining_zones = nr_zones - done_zones;
649 		unsigned int zones_in_request =
650 			min_t(unsigned int, remaining_zones, max_zones_per_request);
651 		struct request *req;
652 		struct ublk_zoned_report_desc desc;
653 		blk_status_t status;
654 
655 		memset(buffer, 0, buffer_length);
656 
657 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
658 		if (IS_ERR(req)) {
659 			ret = PTR_ERR(req);
660 			goto out;
661 		}
662 
663 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
664 		desc.sector = sector;
665 		desc.nr_zones = zones_in_request;
666 		ret = ublk_zoned_insert_report_desc(req, &desc);
667 		if (ret)
668 			goto free_req;
669 
670 		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
671 		if (ret)
672 			goto erase_desc;
673 
674 		status = blk_execute_rq(req, 0);
675 		ret = blk_status_to_errno(status);
676 erase_desc:
677 		ublk_zoned_erase_report_desc(req);
678 free_req:
679 		blk_mq_free_request(req);
680 		if (ret)
681 			goto out;
682 
683 		for (unsigned int i = 0; i < zones_in_request; i++) {
684 			struct blk_zone *zone = buffer + i;
685 
686 			/* A zero length zone means no more zones in this response */
687 			if (!zone->len)
688 				break;
689 
690 			ret = disk_report_zone(disk, zone, i, args);
691 			if (ret)
692 				goto out;
693 
694 			done_zones++;
695 			sector += zone_size_sectors;
696 
697 		}
698 	}
699 
700 	ret = done_zones;
701 
702 out:
703 	kvfree(buffer);
704 	return ret;
705 }
706 
707 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
708 					 struct request *req)
709 {
710 	struct ublk_zoned_report_desc *desc;
711 	u32 ublk_op;
712 
713 	switch (req_op(req)) {
714 	case REQ_OP_ZONE_OPEN:
715 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
716 		break;
717 	case REQ_OP_ZONE_CLOSE:
718 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
719 		break;
720 	case REQ_OP_ZONE_FINISH:
721 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
722 		break;
723 	case REQ_OP_ZONE_RESET:
724 		ublk_op = UBLK_IO_OP_ZONE_RESET;
725 		break;
726 	case REQ_OP_ZONE_APPEND:
727 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
728 		break;
729 	case REQ_OP_ZONE_RESET_ALL:
730 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
731 		break;
732 	case REQ_OP_DRV_IN:
733 		desc = ublk_zoned_get_report_desc(req);
734 		if (!desc)
735 			return BLK_STS_IOERR;
736 		ublk_op = desc->operation;
737 		switch (ublk_op) {
738 		case UBLK_IO_OP_REPORT_ZONES:
739 			ublk_init_iod(ubq, req, ublk_op, desc->nr_zones,
740 				      desc->sector);
741 			return BLK_STS_OK;
742 		default:
743 			return BLK_STS_IOERR;
744 		}
745 	case REQ_OP_DRV_OUT:
746 		/* We do not support drv_out */
747 		return BLK_STS_NOTSUPP;
748 	default:
749 		return BLK_STS_IOERR;
750 	}
751 
752 	ublk_init_iod(ubq, req, ublk_op, blk_rq_sectors(req), blk_rq_pos(req));
753 	return BLK_STS_OK;
754 }
755 
756 #else
757 
/*
 * Stubs used when CONFIG_BLK_DEV_ZONED is disabled: no report_zones
 * callback, zoned parameters are rejected, and zoned requests are
 * reported as unsupported.
 */
#define ublk_report_zones (NULL)

/* Zoned parameters cannot be accepted without zoned block device support */
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

/* Nothing to apply without zoned block device support */
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
}

/* Nothing to revalidate; always succeeds */
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}

/* Zoned requests are never set up in this configuration */
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	return BLK_STS_NOTSUPP;
}
779 
780 #endif
781 
782 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
783 				      bool need_map, struct io_comp_batch *iob);
784 
/* Character device number and class ("ublk-char") of the ublk char devices */
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
	.name = "ublk-char",
};

/*
 * Device index allocator.
 * NOTE(review): ublk_idr_lock presumably guards ublk_index_idr - confirm
 * against the users, which are outside this chunk.
 */
static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

/* Control-path mutex; unprivileged_ublks_added is documented as protected by it */
static DEFINE_MUTEX(ublk_ctl_mutex);
795 
796 static struct ublk_batch_fetch_cmd *
797 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
798 {
799 	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
800 
801 	if (fcmd) {
802 		fcmd->cmd = cmd;
803 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
804 	}
805 	return fcmd;
806 }
807 
/* Free a fetch command context allocated by ublk_batch_alloc_fcmd(). */
static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
{
	kfree(fcmd);
}
812 
/*
 * Clear the queue's active fetch command. WRITE_ONCE() is used for the
 * store; the caller visible in this chunk holds evts_lock around it.
 */
static void __ublk_release_fcmd(struct ublk_queue *ubq)
{
	WRITE_ONCE(ubq->active_fcmd, NULL);
}
817 
/*
 * Nothing can move on, so clear ->active_fcmd, and the caller should stop
 * dispatching
 *
 * Under evts_lock, @fcmd is unlinked from its list and the active slot is
 * cleared (warning once if @fcmd was not the active command). After the
 * lock is dropped the uring_cmd is completed with @res and @fcmd is freed,
 * so @fcmd must not be used afterwards.
 */
static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
					const struct ublk_batch_io_data *data,
					struct ublk_batch_fetch_cmd *fcmd,
					int res)
{
	spin_lock(&ubq->evts_lock);
	list_del_init(&fcmd->node);
	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
	__ublk_release_fcmd(ubq);
	spin_unlock(&ubq->evts_lock);

	/* complete and free outside the lock */
	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
	ublk_batch_free_fcmd(fcmd);
}
836 
837 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
838 				     struct io_br_sel *sel,
839 				     unsigned int issue_flags)
840 {
841 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
842 		return -ENOBUFS;
843 	return 0;
844 }
845 
846 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
847 				       void __user *buf, const u16 *tag_buf,
848 				       unsigned int len)
849 {
850 	if (copy_to_user(buf, tag_buf, len))
851 		return -EFAULT;
852 	return len;
853 }
854 
/* Upper bound on ublk devices: the size of the minor number space */
#define UBLK_MAX_UBLKS UBLK_MINORS

/*
 * Max unprivileged ublk devices allowed to add
 *
 * It can be extended to one per-user limit in future or even controlled
 * by cgroup.
 */
static unsigned int unprivileged_ublks_max = 64;
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */

/* misc device object; initialized outside this chunk */
static struct miscdevice ublk_misc;
867 
868 static inline unsigned ublk_pos_to_hwq(loff_t pos)
869 {
870 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
871 		UBLK_QID_BITS_MASK;
872 }
873 
874 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
875 {
876 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
877 }
878 
879 static inline unsigned ublk_pos_to_tag(loff_t pos)
880 {
881 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
882 		UBLK_TAG_BITS_MASK;
883 }
884 
885 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
886 {
887 	const struct ublk_param_basic *p = &ub->params.basic;
888 
889 	if (p->attrs & UBLK_ATTR_READ_ONLY)
890 		set_disk_ro(ub->ub_disk, true);
891 
892 	set_capacity(ub->ub_disk, p->dev_sectors);
893 }
894 
895 static int ublk_integrity_flags(u32 flags)
896 {
897 	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
898 
899 	if (flags & LBMD_PI_CAP_INTEGRITY) {
900 		flags &= ~LBMD_PI_CAP_INTEGRITY;
901 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
902 	}
903 	if (flags & LBMD_PI_CAP_REFTAG) {
904 		flags &= ~LBMD_PI_CAP_REFTAG;
905 		ret_flags |= BLK_INTEGRITY_REF_TAG;
906 	}
907 	return flags ? -EINVAL : ret_flags;
908 }
909 
910 static int ublk_integrity_pi_tuple_size(u8 csum_type)
911 {
912 	switch (csum_type) {
913 	case LBMD_PI_CSUM_NONE:
914 		return 0;
915 	case LBMD_PI_CSUM_IP:
916 	case LBMD_PI_CSUM_CRC16_T10DIF:
917 		return 8;
918 	case LBMD_PI_CSUM_CRC64_NVME:
919 		return 16;
920 	default:
921 		return -EINVAL;
922 	}
923 }
924 
925 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
926 {
927 	switch (csum_type) {
928 	case LBMD_PI_CSUM_NONE:
929 		return BLK_INTEGRITY_CSUM_NONE;
930 	case LBMD_PI_CSUM_IP:
931 		return BLK_INTEGRITY_CSUM_IP;
932 	case LBMD_PI_CSUM_CRC16_T10DIF:
933 		return BLK_INTEGRITY_CSUM_CRC;
934 	case LBMD_PI_CSUM_CRC64_NVME:
935 		return BLK_INTEGRITY_CSUM_CRC64;
936 	default:
937 		WARN_ON_ONCE(1);
938 		return BLK_INTEGRITY_CSUM_NONE;
939 	}
940 }
941 
942 static int ublk_validate_params(const struct ublk_device *ub)
943 {
944 	/* basic param is the only one which must be set */
945 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
946 		const struct ublk_param_basic *p = &ub->params.basic;
947 
948 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
949 			return -EINVAL;
950 
951 		/*
952 		 * 256M is a reasonable upper bound for physical block size,
953 		 * io_min and io_opt; it aligns with the maximum physical
954 		 * block size possible in NVMe.
955 		 */
956 		if (p->physical_bs_shift > ilog2(SZ_256M))
957 			return -EINVAL;
958 
959 		if (p->io_min_shift > ilog2(SZ_256M))
960 			return -EINVAL;
961 
962 		if (p->io_opt_shift > ilog2(SZ_256M))
963 			return -EINVAL;
964 
965 		if (p->logical_bs_shift > p->physical_bs_shift)
966 			return -EINVAL;
967 
968 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
969 			return -EINVAL;
970 
971 		if (p->max_sectors < PAGE_SECTORS)
972 			return -EINVAL;
973 
974 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
975 			return -EINVAL;
976 	} else
977 		return -EINVAL;
978 
979 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
980 		const struct ublk_param_discard *p = &ub->params.discard;
981 
982 		/* So far, only support single segment discard */
983 		if (p->max_discard_sectors && p->max_discard_segments != 1)
984 			return -EINVAL;
985 
986 		if (!p->discard_granularity)
987 			return -EINVAL;
988 	}
989 
990 	/* dev_t is read-only */
991 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
992 		return -EINVAL;
993 
994 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
995 		return ublk_dev_param_zoned_validate(ub);
996 	else if (ublk_dev_is_zoned(ub))
997 		return -EINVAL;
998 
999 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
1000 		const struct ublk_param_dma_align *p = &ub->params.dma;
1001 
1002 		if (p->alignment >= PAGE_SIZE)
1003 			return -EINVAL;
1004 
1005 		if (!is_power_of_2(p->alignment + 1))
1006 			return -EINVAL;
1007 	}
1008 
1009 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
1010 		const struct ublk_param_segment *p = &ub->params.seg;
1011 
1012 		if (!is_power_of_2(p->seg_boundary_mask + 1))
1013 			return -EINVAL;
1014 
1015 		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
1016 			return -EINVAL;
1017 		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
1018 			return -EINVAL;
1019 	}
1020 
1021 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
1022 		const struct ublk_param_integrity *p = &ub->params.integrity;
1023 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
1024 		int flags = ublk_integrity_flags(p->flags);
1025 
1026 		if (!ublk_dev_support_integrity(ub))
1027 			return -EINVAL;
1028 		if (flags < 0)
1029 			return flags;
1030 		if (pi_tuple_size < 0)
1031 			return pi_tuple_size;
1032 		if (!p->metadata_size)
1033 			return -EINVAL;
1034 		if (p->csum_type == LBMD_PI_CSUM_NONE &&
1035 		    p->flags & LBMD_PI_CAP_REFTAG)
1036 			return -EINVAL;
1037 		if (p->pi_offset + pi_tuple_size > p->metadata_size)
1038 			return -EINVAL;
1039 		if (p->interval_exp < SECTOR_SHIFT ||
1040 		    p->interval_exp > ub->params.basic.logical_bs_shift)
1041 			return -EINVAL;
1042 	}
1043 
1044 	return 0;
1045 }
1046 
1047 static void ublk_apply_params(struct ublk_device *ub)
1048 {
1049 	ublk_dev_param_basic_apply(ub);
1050 
1051 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
1052 		ublk_dev_param_zoned_apply(ub);
1053 }
1054 
1055 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
1056 {
1057 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
1058 		!ublk_support_auto_buf_reg(ubq);
1059 }
1060 
1061 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
1062 {
1063 	return !ublk_dev_support_user_copy(ub) &&
1064 	       !ublk_dev_support_zero_copy(ub) &&
1065 	       !ublk_dev_support_auto_buf_reg(ub);
1066 }
1067 
1068 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
1069 {
1070 	/*
1071 	 * read()/write() is involved in user copy, so request reference
1072 	 * has to be grabbed
1073 	 *
1074 	 * for zero copy, request buffer need to be registered to io_uring
1075 	 * buffer table, so reference is needed
1076 	 *
1077 	 * For auto buffer register, ublk server still may issue
1078 	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
1079 	 * so reference is required too.
1080 	 */
1081 	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
1082 		ublk_support_auto_buf_reg(ubq);
1083 }
1084 
1085 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
1086 {
1087 	return ublk_dev_support_user_copy(ub) ||
1088 	       ublk_dev_support_zero_copy(ub) ||
1089 	       ublk_dev_support_auto_buf_reg(ub);
1090 }
1091 
1092 /*
1093  * ublk IO Reference Counting Design
1094  * ==================================
1095  *
1096  * For user-copy and zero-copy modes, ublk uses a split reference model with
1097  * two counters that together track IO lifetime:
1098  *
1099  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
1100  *   - io->task_registered_buffers: count of buffers registered on the IO task
1101  *
1102  * Key Invariant:
1103  * --------------
1104  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1105  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1106  * when no active references exist. After IO completion, both counters become
1107  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1108  * task_registered_buffers are 0.
1109  *
1110  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1111  * exit to determine if all references have been released.
1112  *
1113  * Why Split Counters:
1114  * -------------------
1115  * Buffers registered on the IO daemon task can use the lightweight
1116  * task_registered_buffers counter (simple increment/decrement) instead of
1117  * atomic refcount operations. The ublk_io_release() callback checks if
1118  * current == io->task to decide which counter to update.
1119  *
1120  * This optimization only applies before IO completion. At completion,
1121  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1122  * After that, all subsequent buffer unregistrations must use the atomic ref
1123  * since they may be releasing the last reference.
1124  *
1125  * Reference Lifecycle:
1126  * --------------------
1127  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1128  *
1129  * 2. During IO processing:
1130  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1131  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1132  *    - Buffer unregister callback (ublk_io_release):
1133  *      * If on-task: task_registered_buffers--
1134  *      * If off-task: ref-- via ublk_put_req_ref()
1135  *
1136  * 3. ublk_sub_req_ref() at IO completion:
1137  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1138  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1139  *    - This effectively collapses task_registered_buffers into the atomic ref,
1140  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1141  *      buffers that were already counted
1142  *
1143  * Example (zero-copy, register on-task, unregister off-task):
1144  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1145  *   - Register buffer on-task: task_registered_buffers = 1
1146  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1147  *   - Completion via ublk_sub_req_ref():
1148  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1149  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1150  *
1151  * Example (auto buffer registration):
1152  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1153  *
1154  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1155  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1156  *   - Completion via ublk_sub_req_ref():
1157  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1158  *
1159  * Example (zero-copy, ublk server killed):
1160  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1161  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1162  *
1163  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1164  *   - Register buffer on-task: task_registered_buffers = 1
1165  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1166  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1167  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
 *   - Sum equals UBLK_REFCOUNT_INIT, so both counters are zeroed by
1169  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1170  *     and abort pending requests
1171  *
1172  * Batch IO Special Case:
1173  * ----------------------
1174  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1175  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1176  * task_registered_buffers counter still tracks registered buffers for the
1177  * invariant check, even though the callback doesn't decrement it.
1178  *
1179  * Note: updating task_registered_buffers is protected by io->lock.
1180  */
1181 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1182 		struct ublk_io *io)
1183 {
1184 	if (ublk_need_req_ref(ubq))
1185 		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1186 }
1187 
1188 static inline bool ublk_get_req_ref(struct ublk_io *io)
1189 {
1190 	return refcount_inc_not_zero(&io->ref);
1191 }
1192 
1193 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1194 {
1195 	if (!refcount_dec_and_test(&io->ref))
1196 		return;
1197 
1198 	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1199 	__ublk_complete_rq(req, io, false, NULL);
1200 }
1201 
1202 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1203 {
1204 	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1205 
1206 	io->task_registered_buffers = 0;
1207 	return refcount_sub_and_test(sub_refs, &io->ref);
1208 }
1209 
1210 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1211 {
1212 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1213 }
1214 
1215 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1216 {
1217 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1218 }
1219 
1220 /* Called in slow path only, keep it noinline for trace purpose */
1221 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1222 {
1223 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1224 		return ub;
1225 	return NULL;
1226 }
1227 
1228 /* Called in slow path only, keep it noinline for trace purpose */
1229 static noinline void ublk_put_device(struct ublk_device *ub)
1230 {
1231 	put_device(&ub->cdev_dev);
1232 }
1233 
1234 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1235 		int qid)
1236 {
1237 	return dev->queues[qid];
1238 }
1239 
1240 static inline struct ublksrv_io_desc *
1241 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1242 {
1243 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
1244 }
1245 
1246 static inline int __ublk_queue_cmd_buf_size(int depth)
1247 {
1248 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1249 }
1250 
1251 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1252 {
1253 	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1254 }
1255 
1256 static int ublk_max_cmd_buf_size(void)
1257 {
1258 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1259 }
1260 
1261 /*
1262  * Should I/O outstanding to the ublk server when it exits be reissued?
1263  * If not, outstanding I/O will get errors.
1264  */
1265 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1266 {
1267 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1268 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1269 }
1270 
1271 /*
1272  * Should I/O issued while there is no ublk server queue? If not, I/O
1273  * issued while there is no ublk server will get errors.
1274  */
1275 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1276 {
1277 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1278 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1279 }
1280 
1281 /*
1282  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1283  * of the device flags for smaller cache footprint - better for fast
1284  * paths.
1285  */
1286 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1287 {
1288 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1289 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1290 }
1291 
1292 /*
1293  * Should ublk devices be stopped (i.e. no recovery possible) when the
1294  * ublk server exits? If not, devices can be used again by a future
1295  * incarnation of a ublk server via the start_recovery/end_recovery
1296  * commands.
1297  */
1298 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1299 {
1300 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1301 }
1302 
1303 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1304 {
1305 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1306 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1307 }
1308 
1309 static void ublk_free_disk(struct gendisk *disk)
1310 {
1311 	struct ublk_device *ub = disk->private_data;
1312 
1313 	clear_bit(UB_STATE_USED, &ub->state);
1314 	ublk_put_device(ub);
1315 }
1316 
1317 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1318 		unsigned int *owner_gid)
1319 {
1320 	kuid_t uid;
1321 	kgid_t gid;
1322 
1323 	current_uid_gid(&uid, &gid);
1324 
1325 	*owner_uid = from_kuid(&init_user_ns, uid);
1326 	*owner_gid = from_kgid(&init_user_ns, gid);
1327 }
1328 
1329 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1330 {
1331 	struct ublk_device *ub = disk->private_data;
1332 
1333 	if (capable(CAP_SYS_ADMIN))
1334 		return 0;
1335 
1336 	/*
1337 	 * If it is one unprivileged device, only owner can open
1338 	 * the disk. Otherwise it could be one trap made by one
1339 	 * evil user who grants this disk's privileges to other
1340 	 * users deliberately.
1341 	 *
1342 	 * This way is reasonable too given anyone can create
1343 	 * unprivileged device, and no need other's grant.
1344 	 */
1345 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1346 		unsigned int curr_uid, curr_gid;
1347 
1348 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1349 
1350 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1351 				ub->dev_info.owner_gid)
1352 			return -EPERM;
1353 	}
1354 
1355 	if (ub->block_open)
1356 		return -ENXIO;
1357 
1358 	return 0;
1359 }
1360 
1361 static const struct block_device_operations ub_fops = {
1362 	.owner =	THIS_MODULE,
1363 	.open =		ublk_open,
1364 	.free_disk =	ublk_free_disk,
1365 	.report_zones =	ublk_report_zones,
1366 };
1367 
1368 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1369 				struct iov_iter *uiter, int dir, size_t *done)
1370 {
1371 	unsigned len;
1372 	void *bv_buf;
1373 	size_t copied;
1374 
1375 	if (*offset >= bv->bv_len) {
1376 		*offset -= bv->bv_len;
1377 		return true;
1378 	}
1379 
1380 	len = bv->bv_len - *offset;
1381 	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1382 	/*
1383 	 * Bio pages may originate from slab caches without a usercopy region
1384 	 * (e.g. jbd2 frozen metadata buffers).  This is the same data that
1385 	 * the loop driver writes to its backing file — no exposure risk.
1386 	 * The bvec length is always trusted, so the size check in
1387 	 * check_copy_size() is not needed either.  Use the unchecked
1388 	 * helpers to avoid false positives on slab pages.
1389 	 */
1390 	if (dir == ITER_DEST)
1391 		copied = _copy_to_iter(bv_buf, len, uiter);
1392 	else
1393 		copied = _copy_from_iter(bv_buf, len, uiter);
1394 
1395 	kunmap_local(bv_buf);
1396 
1397 	*done += copied;
1398 	if (copied < len)
1399 		return false;
1400 
1401 	*offset = 0;
1402 	return true;
1403 }
1404 
1405 /*
1406  * Copy data between request pages and io_iter, and 'offset'
1407  * is the start point of linear offset of request.
1408  */
1409 static size_t ublk_copy_user_pages(const struct request *req,
1410 		unsigned offset, struct iov_iter *uiter, int dir)
1411 {
1412 	struct req_iterator iter;
1413 	struct bio_vec bv;
1414 	size_t done = 0;
1415 
1416 	rq_for_each_segment(bv, req, iter) {
1417 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1418 			break;
1419 	}
1420 	return done;
1421 }
1422 
1423 #ifdef CONFIG_BLK_DEV_INTEGRITY
1424 static size_t ublk_copy_user_integrity(const struct request *req,
1425 		unsigned offset, struct iov_iter *uiter, int dir)
1426 {
1427 	size_t done = 0;
1428 	struct bio *bio = req->bio;
1429 	struct bvec_iter iter;
1430 	struct bio_vec iv;
1431 
1432 	if (!blk_integrity_rq(req))
1433 		return 0;
1434 
1435 	bio_for_each_integrity_vec(iv, bio, iter) {
1436 		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1437 			break;
1438 	}
1439 
1440 	return done;
1441 }
1442 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1443 static size_t ublk_copy_user_integrity(const struct request *req,
1444 		unsigned offset, struct iov_iter *uiter, int dir)
1445 {
1446 	return 0;
1447 }
1448 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1449 
1450 static inline bool ublk_need_map_req(const struct request *req)
1451 {
1452 	return blk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1453 }
1454 
1455 static inline bool ublk_need_unmap_req(const struct request *req)
1456 {
1457 	return blk_rq_has_data(req) &&
1458 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1459 }
1460 
1461 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1462 				const struct request *req,
1463 				const struct ublk_io *io)
1464 {
1465 	const unsigned int rq_bytes = blk_rq_bytes(req);
1466 
1467 	if (!ublk_need_map_io(ubq))
1468 		return rq_bytes;
1469 
1470 	/*
1471 	 * no zero copy, we delay copy WRITE request data into ublksrv
1472 	 * context and the big benefit is that pinning pages in current
1473 	 * context is pretty fast, see ublk_pin_user_pages
1474 	 */
1475 	if (ublk_need_map_req(req)) {
1476 		struct iov_iter iter;
1477 		const int dir = ITER_DEST;
1478 
1479 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1480 		return ublk_copy_user_pages(req, 0, &iter, dir);
1481 	}
1482 	return rq_bytes;
1483 }
1484 
1485 static unsigned int ublk_unmap_io(bool need_map,
1486 		const struct request *req,
1487 		const struct ublk_io *io)
1488 {
1489 	const unsigned int rq_bytes = blk_rq_bytes(req);
1490 
1491 	if (!need_map)
1492 		return rq_bytes;
1493 
1494 	if (ublk_need_unmap_req(req)) {
1495 		struct iov_iter iter;
1496 		const int dir = ITER_SOURCE;
1497 
1498 		WARN_ON_ONCE(io->res > rq_bytes);
1499 
1500 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1501 		return ublk_copy_user_pages(req, 0, &iter, dir);
1502 	}
1503 	return rq_bytes;
1504 }
1505 
1506 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1507 {
1508 	u32 ublk_op;
1509 
1510 	switch (req_op(req)) {
1511 	case REQ_OP_READ:
1512 		ublk_op = UBLK_IO_OP_READ;
1513 		break;
1514 	case REQ_OP_WRITE:
1515 		ublk_op = UBLK_IO_OP_WRITE;
1516 		break;
1517 	case REQ_OP_FLUSH:
1518 		ublk_op = UBLK_IO_OP_FLUSH;
1519 		break;
1520 	case REQ_OP_DISCARD:
1521 		ublk_op = UBLK_IO_OP_DISCARD;
1522 		break;
1523 	case REQ_OP_WRITE_ZEROES:
1524 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1525 		break;
1526 	default:
1527 		if (ublk_queue_is_zoned(ubq))
1528 			return ublk_setup_iod_zoned(ubq, req);
1529 		return BLK_STS_IOERR;
1530 	}
1531 
1532 	ublk_init_iod(ubq, req, ublk_op, blk_rq_sectors(req), blk_rq_pos(req));
1533 	return BLK_STS_OK;
1534 }
1535 
1536 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1537 		struct io_uring_cmd *ioucmd)
1538 {
1539 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1540 }
1541 
1542 static void ublk_end_request(struct request *req, blk_status_t error)
1543 {
1544 	local_bh_disable();
1545 	blk_mq_end_request(req, error);
1546 	local_bh_enable();
1547 }
1548 
1549 /* todo: handle partial completion */
1550 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1551 				      bool need_map, struct io_comp_batch *iob)
1552 {
1553 	unsigned int unmapped_bytes;
1554 	blk_status_t res = BLK_STS_OK;
1555 	bool requeue;
1556 
1557 	/* failed read IO if nothing is read */
1558 	if (!io->res && req_op(req) == REQ_OP_READ)
1559 		io->res = -EIO;
1560 
1561 	if (io->res < 0) {
1562 		res = errno_to_blk_status(io->res);
1563 		goto exit;
1564 	}
1565 
1566 	/*
1567 	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them
1568 	 * directly.
1569 	 *
1570 	 * Both the two needn't unmap.
1571 	 */
1572 	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1573 	    req_op(req) != REQ_OP_DRV_IN)
1574 		goto exit;
1575 
1576 	/* shmem zero copy: no data to unmap, pages already shared */
1577 	if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
1578 		goto exit;
1579 
1580 	/* for READ request, writing data in iod->addr to rq buffers */
1581 	unmapped_bytes = ublk_unmap_io(need_map, req, io);
1582 
1583 	/*
1584 	 * Extremely impossible since we got data filled in just before
1585 	 *
1586 	 * Re-read simply for this unlikely case.
1587 	 */
1588 	if (unlikely(unmapped_bytes < io->res))
1589 		io->res = unmapped_bytes;
1590 
1591 	/*
1592 	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1593 	 * happens off this path, then that will prevent ublk's blkdev_release()
1594 	 * from being called on current's task work, see fput() implementation.
1595 	 *
1596 	 * Otherwise, ublk server may not provide forward progress in case of
1597 	 * reading the partition table from bdev_open() with disk->open_mutex
1598 	 * held, and causes dead lock as we could already be holding
1599 	 * disk->open_mutex here.
1600 	 *
1601 	 * Preferably we would not be doing IO with a mutex held that is also
1602 	 * used for release, but this work-around will suffice for now.
1603 	 */
1604 	local_bh_disable();
1605 	requeue = blk_update_request(req, BLK_STS_OK, io->res);
1606 	local_bh_enable();
1607 	if (requeue)
1608 		blk_mq_requeue_request(req, true);
1609 	else if (likely(!blk_should_fake_timeout(req->q))) {
1610 		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1611 			return;
1612 		__blk_mq_end_request(req, BLK_STS_OK);
1613 	}
1614 
1615 	return;
1616 exit:
1617 	ublk_end_request(req, res);
1618 }
1619 
1620 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1621 						     struct request *req)
1622 {
1623 	/* read cmd first because req will overwrite it */
1624 	struct io_uring_cmd *cmd = io->cmd;
1625 
1626 	/* mark this cmd owned by ublksrv */
1627 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1628 
1629 	/*
1630 	 * clear ACTIVE since we are done with this sqe/cmd slot
1631 	 * We can only accept io cmd in case of being not active.
1632 	 */
1633 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1634 
1635 	io->req = req;
1636 	return cmd;
1637 }
1638 
/*
 * Hand @io / @req over to the ublk server and complete the slot's uring
 * command with @res, which tells ublksrv one io request is coming.
 */
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
				 int res, unsigned issue_flags)
{
	struct io_uring_cmd *fetched;

	fetched = __ublk_prep_compl_io_cmd(io, req);
	io_uring_cmd_done(fetched, res, issue_flags);
}
1647 
1648 #define UBLK_REQUEUE_DELAY_MS	3
1649 
1650 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1651 		struct request *rq)
1652 {
1653 	/* We cannot process this rq so just requeue it. */
1654 	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1655 		blk_mq_requeue_request(rq, false);
1656 	else
1657 		ublk_end_request(rq, BLK_STS_IOERR);
1658 }
1659 
1660 static void
1661 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1662 {
1663 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1664 
1665 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1666 }
1667 
/* Outcome of ublk_auto_buf_register() */
enum auto_buf_reg_res {
	/* registration failed, the request has been ended with an error */
	AUTO_BUF_REG_FAIL,
	/* registration failed, UBLK_IO_F_NEED_REG_BUF was set instead */
	AUTO_BUF_REG_FALLBACK,
	/* the buffer was registered */
	AUTO_BUF_REG_OK,
};
1673 
1674 /*
1675  * Setup io state after auto buffer registration.
1676  *
1677  * Must be called after ublk_auto_buf_register() is done.
1678  * Caller must hold io->lock in batch context.
1679  */
1680 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1681 				   struct request *req, struct ublk_io *io,
1682 				   struct io_uring_cmd *cmd,
1683 				   enum auto_buf_reg_res res)
1684 {
1685 	if (res == AUTO_BUF_REG_OK) {
1686 		io->task_registered_buffers = 1;
1687 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1688 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1689 	}
1690 	ublk_init_req_ref(ubq, io);
1691 	__ublk_prep_compl_io_cmd(io, req);
1692 }
1693 
1694 /* Register request bvec to io_uring for auto buffer registration. */
1695 static enum auto_buf_reg_res
1696 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1697 		       struct ublk_io *io, struct io_uring_cmd *cmd,
1698 		       unsigned int issue_flags)
1699 {
1700 	int ret;
1701 
1702 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1703 				      io->buf.auto_reg.index, issue_flags);
1704 	if (ret) {
1705 		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1706 			ublk_auto_buf_reg_fallback(ubq, req->tag);
1707 			return AUTO_BUF_REG_FALLBACK;
1708 		}
1709 		ublk_end_request(req, BLK_STS_IOERR);
1710 		return AUTO_BUF_REG_FAIL;
1711 	}
1712 
1713 	return AUTO_BUF_REG_OK;
1714 }
1715 
1716 /*
1717  * Dispatch IO to userspace with auto buffer registration.
1718  *
1719  * Only called in non-batch context from task work, io->lock not held.
1720  */
1721 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1722 				   struct request *req, struct ublk_io *io,
1723 				   struct io_uring_cmd *cmd,
1724 				   unsigned int issue_flags)
1725 {
1726 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1727 			issue_flags);
1728 
1729 	if (res != AUTO_BUF_REG_FAIL) {
1730 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1731 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1732 	}
1733 }
1734 
1735 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1736 			  struct ublk_io *io)
1737 {
1738 	unsigned mapped_bytes;
1739 
1740 	/* shmem zero copy: skip data copy, pages already shared */
1741 	if (ublk_iod_is_shmem_zc(ubq, req->tag))
1742 		return true;
1743 
1744 	mapped_bytes = ublk_map_io(ubq, req, io);
1745 
1746 	/* partially mapped, update io descriptor */
1747 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1748 		/*
1749 		 * Nothing mapped, retry until we succeed.
1750 		 *
1751 		 * We may never succeed in mapping any bytes here because
1752 		 * of OOM. TODO: reserve one buffer with single page pinned
1753 		 * for providing forward progress guarantee.
1754 		 */
1755 		if (unlikely(!mapped_bytes)) {
1756 			blk_mq_requeue_request(req, false);
1757 			blk_mq_delay_kick_requeue_list(req->q,
1758 					UBLK_REQUEUE_DELAY_MS);
1759 			return false;
1760 		}
1761 
1762 		ublk_get_iod(ubq, req->tag)->nr_sectors =
1763 			mapped_bytes >> 9;
1764 	}
1765 
1766 	return true;
1767 }
1768 
1769 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1770 {
1771 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1772 	int tag = req->tag;
1773 	struct ublk_io *io = &ubq->ios[tag];
1774 
1775 	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1776 			__func__, ubq->q_id, req->tag, io->flags,
1777 			ublk_get_iod(ubq, req->tag)->addr);
1778 
1779 	/*
1780 	 * Task is exiting if either:
1781 	 *
1782 	 * (1) current != io->task.
1783 	 * io_uring_cmd_complete_in_task() tries to run task_work
1784 	 * in a workqueue if cmd's task is PF_EXITING.
1785 	 *
1786 	 * (2) current->flags & PF_EXITING.
1787 	 */
1788 	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1789 		__ublk_abort_rq(ubq, req);
1790 		return;
1791 	}
1792 
1793 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1794 		/*
1795 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1796 		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1797 		 * and notify it.
1798 		 */
1799 		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1800 		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1801 				__func__, ubq->q_id, req->tag, io->flags);
1802 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1803 				     issue_flags);
1804 		return;
1805 	}
1806 
1807 	if (!ublk_start_io(ubq, req, io))
1808 		return;
1809 
1810 	if (ublk_support_auto_buf_reg(ubq) && blk_rq_has_data(req)) {
1811 		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1812 	} else {
1813 		ublk_init_req_ref(ubq, io);
1814 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1815 	}
1816 }
1817 
1818 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1819 				       const struct ublk_batch_io_data *data,
1820 				       unsigned short tag)
1821 {
1822 	struct ublk_device *ub = data->ub;
1823 	struct ublk_io *io = &ubq->ios[tag];
1824 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1825 	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1826 	struct io_uring_cmd *cmd = data->cmd;
1827 
1828 	if (!ublk_start_io(ubq, req, io))
1829 		return false;
1830 
1831 	if (ublk_support_auto_buf_reg(ubq) && blk_rq_has_data(req)) {
1832 		res = ublk_auto_buf_register(ubq, req, io, cmd,
1833 				data->issue_flags);
1834 
1835 		if (res == AUTO_BUF_REG_FAIL)
1836 			return false;
1837 	}
1838 
1839 	ublk_io_lock(io);
1840 	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1841 	ublk_io_unlock(io);
1842 
1843 	return true;
1844 }
1845 
1846 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1847 				     const struct ublk_batch_io_data *data,
1848 				     unsigned short *tag_buf,
1849 				     unsigned int len)
1850 {
1851 	bool has_unused = false;
1852 	unsigned int i;
1853 
1854 	for (i = 0; i < len; i++) {
1855 		unsigned short tag = tag_buf[i];
1856 
1857 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1858 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1859 			has_unused = true;
1860 		}
1861 	}
1862 
1863 	return has_unused;
1864 }
1865 
1866 /*
1867  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1868  * Returns the new length after filtering.
1869  */
1870 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1871 					    unsigned int len)
1872 {
1873 	unsigned int i, j;
1874 
1875 	for (i = 0, j = 0; i < len; i++) {
1876 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1877 			if (i != j)
1878 				tag_buf[j] = tag_buf[i];
1879 			j++;
1880 		}
1881 	}
1882 
1883 	return j;
1884 }
1885 
1886 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1887 		const struct ublk_batch_io_data *data,
1888 		unsigned short *tag_buf, size_t len, int ret)
1889 {
1890 	int i, res;
1891 
1892 	/*
1893 	 * Undo prep state for all IOs since userspace never received them.
1894 	 * This restores IOs to pre-prepared state so they can be cleanly
1895 	 * re-prepared when tags are pulled from FIFO again.
1896 	 */
1897 	for (i = 0; i < len; i++) {
1898 		struct ublk_io *io = &ubq->ios[tag_buf[i]];
1899 		int index = -1;
1900 
1901 		ublk_io_lock(io);
1902 		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1903 			index = io->buf.auto_reg.index;
1904 		io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1905 		io->flags |= UBLK_IO_FLAG_ACTIVE;
1906 		ublk_io_unlock(io);
1907 
1908 		if (index != -1)
1909 			io_buffer_unregister_bvec(data->cmd, index,
1910 					data->issue_flags);
1911 	}
1912 
1913 	res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1914 		tag_buf, len, &ubq->evts_lock);
1915 
1916 	pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1917 			"tags(%d %zu) ret %d\n", __func__, res, len,
1918 			ret);
1919 }
1920 
1921 #define MAX_NR_TAG 128
1922 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1923 				 const struct ublk_batch_io_data *data,
1924 				 struct ublk_batch_fetch_cmd *fcmd)
1925 {
1926 	const unsigned int tag_sz = sizeof(unsigned short);
1927 	unsigned short tag_buf[MAX_NR_TAG];
1928 	struct io_br_sel sel;
1929 	size_t len = 0;
1930 	bool needs_filter;
1931 	int ret;
1932 
1933 	WARN_ON_ONCE(data->cmd != fcmd->cmd);
1934 
1935 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1936 					 data->issue_flags);
1937 	if (sel.val < 0)
1938 		return sel.val;
1939 	if (!sel.addr)
1940 		return -ENOBUFS;
1941 
1942 	/* single reader needn't lock and sizeof(kfifo element) is 2 bytes */
1943 	len = min(len, sizeof(tag_buf)) / tag_sz;
1944 	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1945 
1946 	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1947 	/* Filter out unused tags before posting to userspace */
1948 	if (unlikely(needs_filter)) {
1949 		int new_len = ublk_filter_unused_tags(tag_buf, len);
1950 
1951 		/* return actual length if all are failed or requeued */
1952 		if (!new_len) {
1953 			/* release the selected buffer */
1954 			sel.val = 0;
1955 			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1956 						&sel, data->issue_flags));
1957 			return len;
1958 		}
1959 		len = new_len;
1960 	}
1961 
1962 	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1963 	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1964 	if (unlikely(ret < 0))
1965 		ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1966 	return ret;
1967 }
1968 
1969 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1970 		struct ublk_queue *ubq)
1971 {
1972 	struct ublk_batch_fetch_cmd *fcmd;
1973 
1974 	lockdep_assert_held(&ubq->evts_lock);
1975 
1976 	/*
1977 	 * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd.
1978 	 *
1979 	 * The pair is the smp_mb() in ublk_batch_dispatch().
1980 	 *
1981 	 * If ubq->active_fcmd is observed as non-NULL, the new added tags
1982 	 * can be visisible in ublk_batch_dispatch() with the barrier pairing.
1983 	 */
1984 	smp_mb();
1985 	if (READ_ONCE(ubq->active_fcmd)) {
1986 		fcmd = NULL;
1987 	} else {
1988 		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1989 				struct ublk_batch_fetch_cmd, node);
1990 		WRITE_ONCE(ubq->active_fcmd, fcmd);
1991 	}
1992 	return fcmd;
1993 }
1994 
1995 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1996 {
1997 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1998 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1999 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2000 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2001 	struct ublk_batch_io_data data = {
2002 		.ub = pdu->ubq->dev,
2003 		.cmd = fcmd->cmd,
2004 		.issue_flags = issue_flags,
2005 	};
2006 
2007 	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
2008 
2009 	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
2010 }
2011 
2012 static void
2013 ublk_batch_dispatch(struct ublk_queue *ubq,
2014 		    const struct ublk_batch_io_data *data,
2015 		    struct ublk_batch_fetch_cmd *fcmd)
2016 {
2017 	struct ublk_batch_fetch_cmd *new_fcmd;
2018 	unsigned tried = 0;
2019 	int ret = 0;
2020 
2021 again:
2022 	while (!ublk_io_evts_empty(ubq)) {
2023 		ret = __ublk_batch_dispatch(ubq, data, fcmd);
2024 		if (ret <= 0)
2025 			break;
2026 	}
2027 
2028 	if (ret < 0) {
2029 		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
2030 		return;
2031 	}
2032 
2033 	__ublk_release_fcmd(ubq);
2034 	/*
2035 	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
2036 	 * checking ubq->evts_fifo.
2037 	 *
2038 	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
2039 	 */
2040 	smp_mb();
2041 	if (likely(ublk_io_evts_empty(ubq)))
2042 		return;
2043 
2044 	spin_lock(&ubq->evts_lock);
2045 	new_fcmd = __ublk_acquire_fcmd(ubq);
2046 	spin_unlock(&ubq->evts_lock);
2047 
2048 	if (!new_fcmd)
2049 		return;
2050 
2051 	/* Avoid lockup by allowing to handle at most 32 batches */
2052 	if (new_fcmd == fcmd && tried++ < 32)
2053 		goto again;
2054 
2055 	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
2056 }
2057 
2058 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2059 {
2060 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2061 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2062 	struct ublk_queue *ubq = pdu->ubq;
2063 
2064 	ublk_dispatch_req(ubq, pdu->req);
2065 }
2066 
2067 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2068 {
2069 	unsigned short tag = rq->tag;
2070 	struct ublk_batch_fetch_cmd *fcmd = NULL;
2071 
2072 	spin_lock(&ubq->evts_lock);
2073 	kfifo_put(&ubq->evts_fifo, tag);
2074 	if (last)
2075 		fcmd = __ublk_acquire_fcmd(ubq);
2076 	spin_unlock(&ubq->evts_lock);
2077 
2078 	if (fcmd)
2079 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2080 }
2081 
2082 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2083 {
2084 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2085 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2086 
2087 	pdu->req = rq;
2088 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2089 }
2090 
2091 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2092 {
2093 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2094 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2095 	struct request *rq = pdu->req_list;
2096 	struct request *next;
2097 
2098 	do {
2099 		next = rq->rq_next;
2100 		rq->rq_next = NULL;
2101 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2102 		rq = next;
2103 	} while (rq);
2104 }
2105 
2106 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2107 {
2108 	struct io_uring_cmd *cmd = io->cmd;
2109 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2110 
2111 	pdu->req_list = rq_list_peek(l);
2112 	rq_list_init(l);
2113 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2114 }
2115 
2116 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2117 {
2118 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2119 	pid_t tgid = ubq->dev->ublksrv_tgid;
2120 	struct task_struct *p;
2121 	struct pid *pid;
2122 
2123 	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2124 		return BLK_EH_RESET_TIMER;
2125 
2126 	if (unlikely(!tgid))
2127 		return BLK_EH_RESET_TIMER;
2128 
2129 	rcu_read_lock();
2130 	pid = find_vpid(tgid);
2131 	p = pid_task(pid, PIDTYPE_PID);
2132 	if (p)
2133 		send_sig(SIGKILL, p, 0);
2134 	rcu_read_unlock();
2135 	return BLK_EH_DONE;
2136 }
2137 
2138 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2139 				  bool check_cancel)
2140 {
2141 	blk_status_t res;
2142 
2143 	if (unlikely(READ_ONCE(ubq->fail_io)))
2144 		return BLK_STS_TARGET;
2145 
2146 	/* With recovery feature enabled, force_abort is set in
2147 	 * ublk_stop_dev() before calling del_gendisk(). We have to
2148 	 * abort all requeued and new rqs here to let del_gendisk()
2149 	 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
2150 	 * to avoid UAF on io_uring ctx.
2151 	 *
2152 	 * Note: force_abort is guaranteed to be seen because it is set
2153 	 * before request queue is unqiuesced.
2154 	 */
2155 	if (ublk_nosrv_should_queue_io(ubq) &&
2156 	    unlikely(READ_ONCE(ubq->force_abort)))
2157 		return BLK_STS_IOERR;
2158 
2159 	if (check_cancel && unlikely(ubq->canceling))
2160 		return BLK_STS_IOERR;
2161 
2162 	/* fill iod to slot in io cmd buffer */
2163 	res = ublk_setup_iod(ubq, rq);
2164 	if (unlikely(res != BLK_STS_OK))
2165 		return BLK_STS_IOERR;
2166 
2167 	blk_mq_start_request(rq);
2168 	return BLK_STS_OK;
2169 }
2170 
2171 /*
2172  * Common helper for queue_rq that handles request preparation and
2173  * cancellation checks. Returns status and sets should_queue to indicate
2174  * whether the caller should proceed with queuing the request.
2175  */
2176 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2177 						   struct request *rq,
2178 						   bool *should_queue)
2179 {
2180 	blk_status_t res;
2181 
2182 	res = ublk_prep_req(ubq, rq, false);
2183 	if (res != BLK_STS_OK) {
2184 		*should_queue = false;
2185 		return res;
2186 	}
2187 
2188 	/*
2189 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2190 	 * is dealt with, otherwise this request may not be failed in case
2191 	 * of recovery, and cause hang when deleting disk
2192 	 */
2193 	if (unlikely(ubq->canceling)) {
2194 		*should_queue = false;
2195 		__ublk_abort_rq(ubq, rq);
2196 		return BLK_STS_OK;
2197 	}
2198 
2199 	*should_queue = true;
2200 	return BLK_STS_OK;
2201 }
2202 
2203 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2204 		const struct blk_mq_queue_data *bd)
2205 {
2206 	struct ublk_queue *ubq = hctx->driver_data;
2207 	struct request *rq = bd->rq;
2208 	bool should_queue;
2209 	blk_status_t res;
2210 
2211 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2212 	if (!should_queue)
2213 		return res;
2214 
2215 	ublk_queue_cmd(ubq, rq);
2216 	return BLK_STS_OK;
2217 }
2218 
2219 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2220 		const struct blk_mq_queue_data *bd)
2221 {
2222 	struct ublk_queue *ubq = hctx->driver_data;
2223 	struct request *rq = bd->rq;
2224 	bool should_queue;
2225 	blk_status_t res;
2226 
2227 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2228 	if (!should_queue)
2229 		return res;
2230 
2231 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2232 	return BLK_STS_OK;
2233 }
2234 
2235 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2236 					     const struct ublk_io *io2)
2237 {
2238 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2239 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2240 		(io->task == io2->task);
2241 }
2242 
2243 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2244 {
2245 	struct ublk_queue *ubq = hctx->driver_data;
2246 	struct ublk_batch_fetch_cmd *fcmd;
2247 
2248 	spin_lock(&ubq->evts_lock);
2249 	fcmd = __ublk_acquire_fcmd(ubq);
2250 	spin_unlock(&ubq->evts_lock);
2251 
2252 	if (fcmd)
2253 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2254 }
2255 
2256 static void ublk_queue_rqs(struct rq_list *rqlist)
2257 {
2258 	struct rq_list requeue_list = { };
2259 	struct rq_list submit_list = { };
2260 	struct ublk_io *io = NULL;
2261 	struct request *req;
2262 
2263 	while ((req = rq_list_pop(rqlist))) {
2264 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2265 		struct ublk_io *this_io = &this_q->ios[req->tag];
2266 
2267 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2268 			rq_list_add_tail(&requeue_list, req);
2269 			continue;
2270 		}
2271 
2272 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
2273 				!rq_list_empty(&submit_list))
2274 			ublk_queue_cmd_list(io, &submit_list);
2275 		io = this_io;
2276 		rq_list_add_tail(&submit_list, req);
2277 	}
2278 
2279 	if (!rq_list_empty(&submit_list))
2280 		ublk_queue_cmd_list(io, &submit_list);
2281 	*rqlist = requeue_list;
2282 }
2283 
2284 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2285 {
2286 	unsigned short tags[MAX_NR_TAG];
2287 	struct ublk_batch_fetch_cmd *fcmd;
2288 	struct request *rq;
2289 	unsigned cnt = 0;
2290 
2291 	spin_lock(&ubq->evts_lock);
2292 	rq_list_for_each(l, rq) {
2293 		tags[cnt++] = (unsigned short)rq->tag;
2294 		if (cnt >= MAX_NR_TAG) {
2295 			kfifo_in(&ubq->evts_fifo, tags, cnt);
2296 			cnt = 0;
2297 		}
2298 	}
2299 	if (cnt)
2300 		kfifo_in(&ubq->evts_fifo, tags, cnt);
2301 	fcmd = __ublk_acquire_fcmd(ubq);
2302 	spin_unlock(&ubq->evts_lock);
2303 
2304 	rq_list_init(l);
2305 	if (fcmd)
2306 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2307 }
2308 
2309 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2310 {
2311 	struct rq_list requeue_list = { };
2312 	struct rq_list submit_list = { };
2313 	struct ublk_queue *ubq = NULL;
2314 	struct request *req;
2315 
2316 	while ((req = rq_list_pop(rqlist))) {
2317 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2318 
2319 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2320 			rq_list_add_tail(&requeue_list, req);
2321 			continue;
2322 		}
2323 
2324 		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2325 			ublk_batch_queue_cmd_list(ubq, &submit_list);
2326 		ubq = this_q;
2327 		rq_list_add_tail(&submit_list, req);
2328 	}
2329 
2330 	if (!rq_list_empty(&submit_list))
2331 		ublk_batch_queue_cmd_list(ubq, &submit_list);
2332 	*rqlist = requeue_list;
2333 }
2334 
2335 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2336 		unsigned int hctx_idx)
2337 {
2338 	struct ublk_device *ub = driver_data;
2339 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2340 
2341 	hctx->driver_data = ubq;
2342 	return 0;
2343 }
2344 
2345 static const struct blk_mq_ops ublk_mq_ops = {
2346 	.queue_rq       = ublk_queue_rq,
2347 	.queue_rqs      = ublk_queue_rqs,
2348 	.init_hctx	= ublk_init_hctx,
2349 	.timeout	= ublk_timeout,
2350 };
2351 
2352 static const struct blk_mq_ops ublk_batch_mq_ops = {
2353 	.commit_rqs	= ublk_commit_rqs,
2354 	.queue_rq       = ublk_batch_queue_rq,
2355 	.queue_rqs      = ublk_batch_queue_rqs,
2356 	.init_hctx	= ublk_init_hctx,
2357 	.timeout	= ublk_timeout,
2358 };
2359 
2360 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2361 {
2362 	int i;
2363 
2364 	ubq->nr_io_ready = 0;
2365 
2366 	for (i = 0; i < ubq->q_depth; i++) {
2367 		struct ublk_io *io = &ubq->ios[i];
2368 
2369 		/*
2370 		 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
2371 		 * io->cmd
2372 		 */
2373 		io->flags &= UBLK_IO_FLAG_CANCELED;
2374 		io->cmd = NULL;
2375 		io->buf.addr = 0;
2376 
2377 		/*
2378 		 * old task is PF_EXITING, put it now
2379 		 *
2380 		 * It could be NULL in case of closing one quiesced
2381 		 * device.
2382 		 */
2383 		if (io->task) {
2384 			put_task_struct(io->task);
2385 			io->task = NULL;
2386 		}
2387 
2388 		WARN_ON_ONCE(refcount_read(&io->ref));
2389 		WARN_ON_ONCE(io->task_registered_buffers);
2390 	}
2391 }
2392 
2393 static int ublk_ch_open(struct inode *inode, struct file *filp)
2394 {
2395 	struct ublk_device *ub = container_of(inode->i_cdev,
2396 			struct ublk_device, cdev);
2397 
2398 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2399 		return -EBUSY;
2400 	filp->private_data = ub;
2401 	ub->ublksrv_tgid = current->tgid;
2402 	return 0;
2403 }
2404 
2405 static void ublk_reset_ch_dev(struct ublk_device *ub)
2406 {
2407 	int i;
2408 
2409 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2410 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2411 
2412 		/* Sync with ublk_cancel_cmd() */
2413 		spin_lock(&ubq->cancel_lock);
2414 		ublk_queue_reinit(ub, ubq);
2415 		spin_unlock(&ubq->cancel_lock);
2416 	}
2417 
2418 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2419 	ub->mm = NULL;
2420 	ub->nr_queue_ready = 0;
2421 	ub->unprivileged_daemons = false;
2422 	ub->ublksrv_tgid = -1;
2423 }
2424 
2425 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2426 {
2427 	struct gendisk *disk;
2428 
2429 	spin_lock(&ub->lock);
2430 	disk = ub->ub_disk;
2431 	if (disk)
2432 		get_device(disk_to_dev(disk));
2433 	spin_unlock(&ub->lock);
2434 
2435 	return disk;
2436 }
2437 
/* Drop the reference taken by ublk_get_disk(); a NULL @disk is ignored. */
static void ublk_put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
2443 
2444 static void ublk_partition_scan_work(struct work_struct *work)
2445 {
2446 	struct ublk_device *ub =
2447 		container_of(work, struct ublk_device, partition_scan_work);
2448 	/* Hold disk reference to prevent UAF during concurrent teardown */
2449 	struct gendisk *disk = ublk_get_disk(ub);
2450 
2451 	if (!disk)
2452 		return;
2453 
2454 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2455 					     &disk->state)))
2456 		goto out;
2457 
2458 	mutex_lock(&disk->open_mutex);
2459 	bdev_disk_changed(disk, false);
2460 	mutex_unlock(&disk->open_mutex);
2461 out:
2462 	ublk_put_disk(disk);
2463 }
2464 
2465 /*
2466  * Use this function to ensure that ->canceling is consistently set for
2467  * the device and all queues. Do not set these flags directly.
2468  *
2469  * Caller must ensure that:
2470  * - cancel_mutex is held. This ensures that there is no concurrent
2471  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2472  * - there are no concurrent reads of ubq->canceling from the queue_rq
2473  *   path. This can be done by quiescing the queue, or through other
2474  *   means.
2475  */
2476 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2477 	__must_hold(&ub->cancel_mutex)
2478 {
2479 	int i;
2480 
2481 	ub->canceling = canceling;
2482 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2483 		ublk_get_queue(ub, i)->canceling = canceling;
2484 }
2485 
2486 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2487 {
2488 	int i, j;
2489 
2490 	if (!ublk_dev_need_req_ref(ub))
2491 		return false;
2492 
2493 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2494 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2495 
2496 		for (j = 0; j < ubq->q_depth; j++) {
2497 			struct ublk_io *io = &ubq->ios[j];
2498 			unsigned int refs = refcount_read(&io->ref) +
2499 				io->task_registered_buffers;
2500 
2501 			/*
2502 			 * UBLK_REFCOUNT_INIT or zero means no active
2503 			 * reference
2504 			 */
2505 			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2506 				return true;
2507 
2508 			/* reset to zero if the io hasn't active references */
2509 			refcount_set(&io->ref, 0);
2510 			io->task_registered_buffers = 0;
2511 		}
2512 	}
2513 	return false;
2514 }
2515 
2516 static void ublk_ch_release_work_fn(struct work_struct *work)
2517 {
2518 	struct ublk_device *ub =
2519 		container_of(work, struct ublk_device, exit_work.work);
2520 	struct gendisk *disk;
2521 	int i;
2522 
2523 	/*
2524 	 * For zero-copy and auto buffer register modes, I/O references
2525 	 * might not be dropped naturally when the daemon is killed, but
2526 	 * io_uring guarantees that registered bvec kernel buffers are
2527 	 * unregistered finally when freeing io_uring context, then the
2528 	 * active references are dropped.
2529 	 *
2530 	 * Wait until active references are dropped for avoiding use-after-free
2531 	 *
2532 	 * registered buffer may be unregistered in io_ring's release hander,
2533 	 * so have to wait by scheduling work function for avoiding the two
2534 	 * file release dependency.
2535 	 */
2536 	if (ublk_check_and_reset_active_ref(ub)) {
2537 		schedule_delayed_work(&ub->exit_work, 1);
2538 		return;
2539 	}
2540 
2541 	/*
2542 	 * disk isn't attached yet, either device isn't live, or it has
2543 	 * been removed already, so we needn't to do anything
2544 	 */
2545 	disk = ublk_get_disk(ub);
2546 	if (!disk)
2547 		goto out;
2548 
2549 	/*
2550 	 * All uring_cmd are done now, so abort any request outstanding to
2551 	 * the ublk server
2552 	 *
2553 	 * This can be done in lockless way because ublk server has been
2554 	 * gone
2555 	 *
2556 	 * More importantly, we have to provide forward progress guarantee
2557 	 * without holding ub->mutex, otherwise control task grabbing
2558 	 * ub->mutex triggers deadlock
2559 	 *
2560 	 * All requests may be inflight, so ->canceling may not be set, set
2561 	 * it now.
2562 	 */
2563 	mutex_lock(&ub->cancel_mutex);
2564 	ublk_set_canceling(ub, true);
2565 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2566 		ublk_abort_queue(ub, ublk_get_queue(ub, i));
2567 	mutex_unlock(&ub->cancel_mutex);
2568 	blk_mq_kick_requeue_list(disk->queue);
2569 
2570 	/*
2571 	 * All infligh requests have been completed or requeued and any new
2572 	 * request will be failed or requeued via `->canceling` now, so it is
2573 	 * fine to grab ub->mutex now.
2574 	 */
2575 	mutex_lock(&ub->mutex);
2576 
2577 	/* double check after grabbing lock */
2578 	if (!ub->ub_disk)
2579 		goto unlock;
2580 
2581 	/*
2582 	 * Transition the device to the nosrv state. What exactly this
2583 	 * means depends on the recovery flags
2584 	 */
2585 	if (ublk_nosrv_should_stop_dev(ub)) {
2586 		/*
2587 		 * Allow any pending/future I/O to pass through quickly
2588 		 * with an error. This is needed because del_gendisk
2589 		 * waits for all pending I/O to complete
2590 		 */
2591 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2592 			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2593 
2594 		ublk_stop_dev_unlocked(ub);
2595 	} else {
2596 		if (ublk_nosrv_dev_should_queue_io(ub)) {
2597 			/* ->canceling is set and all requests are aborted */
2598 			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2599 		} else {
2600 			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2601 			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2602 				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2603 		}
2604 	}
2605 unlock:
2606 	mutex_unlock(&ub->mutex);
2607 	ublk_put_disk(disk);
2608 
2609 	/* all uring_cmd has been done now, reset device & ubq */
2610 	ublk_reset_ch_dev(ub);
2611 out:
2612 	clear_bit(UB_STATE_OPEN, &ub->state);
2613 
2614 	/* put the reference grabbed in ublk_ch_release() */
2615 	ublk_put_device(ub);
2616 }
2617 
2618 static int ublk_ch_release(struct inode *inode, struct file *filp)
2619 {
2620 	struct ublk_device *ub = filp->private_data;
2621 
2622 	/*
2623 	 * Grab ublk device reference, so it won't be gone until we are
2624 	 * really released from work function.
2625 	 */
2626 	ublk_get_device(ub);
2627 
2628 	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2629 	schedule_delayed_work(&ub->exit_work, 0);
2630 	return 0;
2631 }
2632 
2633 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2634 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2635 {
2636 	struct ublk_device *ub = filp->private_data;
2637 	size_t sz = vma->vm_end - vma->vm_start;
2638 	unsigned max_sz = ublk_max_cmd_buf_size();
2639 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2640 	int q_id, ret = 0;
2641 
2642 	spin_lock(&ub->lock);
2643 	if (!ub->mm)
2644 		ub->mm = current->mm;
2645 	if (current->mm != ub->mm)
2646 		ret = -EINVAL;
2647 	spin_unlock(&ub->lock);
2648 
2649 	if (ret)
2650 		return ret;
2651 
2652 	if (vma->vm_flags & VM_WRITE)
2653 		return -EPERM;
2654 
2655 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2656 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2657 		return -EINVAL;
2658 
2659 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2660 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2661 			__func__, q_id, current->pid, vma->vm_start,
2662 			phys_off, (unsigned long)sz);
2663 
2664 	if (sz != ublk_queue_cmd_buf_size(ub))
2665 		return -EINVAL;
2666 
2667 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2668 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2669 }
2670 
2671 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2672 		struct request *req)
2673 {
2674 	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2675 			io->flags & UBLK_IO_FLAG_ACTIVE);
2676 
2677 	if (ublk_nosrv_should_reissue_outstanding(ub))
2678 		blk_mq_requeue_request(req, false);
2679 	else {
2680 		io->res = -EIO;
2681 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2682 	}
2683 }
2684 
2685 /*
2686  * Request tag may just be filled to event kfifo, not get chance to
2687  * dispatch, abort these requests too
2688  */
2689 static void ublk_abort_batch_queue(struct ublk_device *ub,
2690 				   struct ublk_queue *ubq)
2691 {
2692 	unsigned short tag;
2693 
2694 	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2695 		struct request *req = blk_mq_tag_to_rq(
2696 				ub->tag_set.tags[ubq->q_id], tag);
2697 
2698 		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2699 			__ublk_fail_req(ub, &ubq->ios[tag], req);
2700 	}
2701 }
2702 
2703 /*
2704  * Called from ublk char device release handler, when any uring_cmd is
2705  * done, meantime request queue is "quiesced" since all inflight requests
2706  * can't be completed because ublk server is dead.
2707  *
2708  * So no one can hold our request IO reference any more, simply ignore the
2709  * reference, and complete the request immediately
2710  */
2711 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2712 {
2713 	int i;
2714 
2715 	for (i = 0; i < ubq->q_depth; i++) {
2716 		struct ublk_io *io = &ubq->ios[i];
2717 
2718 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2719 			__ublk_fail_req(ub, io, io->req);
2720 	}
2721 
2722 	if (ublk_support_batch_io(ubq))
2723 		ublk_abort_batch_queue(ub, ubq);
2724 }
2725 
2726 static void ublk_start_cancel(struct ublk_device *ub)
2727 {
2728 	struct gendisk *disk = ublk_get_disk(ub);
2729 
2730 	mutex_lock(&ub->cancel_mutex);
2731 	if (ub->canceling)
2732 		goto out;
2733 
2734 	if (disk) {
2735 		/*
2736 		 * Quiesce to serialize with ublk_queue_rq(), ensuring
2737 		 * ubq->canceling is visible when the queue resumes.
2738 		 */
2739 		blk_mq_quiesce_queue(disk->queue);
2740 		ublk_set_canceling(ub, true);
2741 		blk_mq_unquiesce_queue(disk->queue);
2742 	} else {
2743 		/*
2744 		 * Disk not yet allocated by ublk_ctrl_start_dev(), so
2745 		 * there is no request queue and ublk_queue_rq() cannot
2746 		 * be running.  Just set the flag; if start_dev proceeds
2747 		 * later, new I/O will see canceling and be aborted.
2748 		 */
2749 		ublk_set_canceling(ub, true);
2750 	}
2751 out:
2752 	mutex_unlock(&ub->cancel_mutex);
2753 	ublk_put_disk(disk);
2754 }
2755 
2756 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2757 		unsigned int issue_flags)
2758 {
2759 	struct ublk_io *io = &ubq->ios[tag];
2760 	struct ublk_device *ub = ubq->dev;
2761 	struct io_uring_cmd *cmd = NULL;
2762 	struct request *req;
2763 	bool done;
2764 
2765 	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2766 		return;
2767 
2768 	/*
2769 	 * Don't try to cancel this command if the request is started for
2770 	 * avoiding race between io_uring_cmd_done() and
2771 	 * io_uring_cmd_complete_in_task().
2772 	 *
2773 	 * Either the started request will be aborted via __ublk_abort_rq(),
2774 	 * then this uring_cmd is canceled next time, or it will be done in
2775 	 * task work function ublk_dispatch_req() because io_uring guarantees
2776 	 * that ublk_dispatch_req() is always called
2777 	 */
2778 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2779 	if (req && blk_mq_request_started(req) && req->tag == tag)
2780 		return;
2781 
2782 	spin_lock(&ubq->cancel_lock);
2783 	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2784 	if (!done) {
2785 		io->flags |= UBLK_IO_FLAG_CANCELED;
2786 		cmd = io->cmd;
2787 		io->cmd = NULL;
2788 	}
2789 	spin_unlock(&ubq->cancel_lock);
2790 
2791 	if (!done && cmd)
2792 		io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, issue_flags);
2793 }
2794 
2795 /*
2796  * Cancel a batch fetch command if it hasn't been claimed by another path.
2797  *
2798  * An fcmd can only be cancelled if:
2799  * 1. It's not the active_fcmd (which is currently being processed)
2800  * 2. It's still on the list (!list_empty check) - once removed from the list,
2801  *    the fcmd is considered claimed and will be freed by whoever removed it
2802  *
2803  * Use list_del_init() so subsequent list_empty() checks work correctly.
2804  */
2805 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2806 				  struct ublk_batch_fetch_cmd *fcmd,
2807 				  unsigned int issue_flags)
2808 {
2809 	bool done;
2810 
2811 	spin_lock(&ubq->evts_lock);
2812 	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2813 	if (done)
2814 		list_del_init(&fcmd->node);
2815 	spin_unlock(&ubq->evts_lock);
2816 
2817 	if (done) {
2818 		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2819 		ublk_batch_free_fcmd(fcmd);
2820 	}
2821 }
2822 
2823 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2824 {
2825 	struct ublk_batch_fetch_cmd *fcmd;
2826 	LIST_HEAD(fcmd_list);
2827 
2828 	spin_lock(&ubq->evts_lock);
2829 	ubq->force_abort = true;
2830 	list_splice_init(&ubq->fcmd_head, &fcmd_list);
2831 	fcmd = READ_ONCE(ubq->active_fcmd);
2832 	if (fcmd)
2833 		list_move(&fcmd->node, &ubq->fcmd_head);
2834 	spin_unlock(&ubq->evts_lock);
2835 
2836 	while (!list_empty(&fcmd_list)) {
2837 		fcmd = list_first_entry(&fcmd_list,
2838 				struct ublk_batch_fetch_cmd, node);
2839 		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2840 	}
2841 }
2842 
2843 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2844 				 unsigned int issue_flags)
2845 {
2846 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2847 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2848 	struct ublk_queue *ubq = pdu->ubq;
2849 
2850 	ublk_start_cancel(ubq->dev);
2851 
2852 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2853 }
2854 
2855 /*
2856  * The ublk char device won't be closed when calling cancel fn, so both
2857  * ublk device and queue are guaranteed to be live
2858  *
2859  * Two-stage cancel:
2860  *
2861  * - make every active uring_cmd done in ->cancel_fn()
2862  *
2863  * - aborting inflight ublk IO requests in ublk char device release handler,
2864  *   which depends on 1st stage because device can only be closed iff all
2865  *   uring_cmd are done
2866  *
2867  * Do _not_ try to acquire ub->mutex before all inflight requests are
2868  * aborted, otherwise deadlock may be caused.
2869  */
2870 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2871 		unsigned int issue_flags)
2872 {
2873 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2874 	struct ublk_queue *ubq = pdu->ubq;
2875 	struct task_struct *task;
2876 	struct ublk_io *io;
2877 
2878 	if (WARN_ON_ONCE(!ubq))
2879 		return;
2880 
2881 	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2882 		return;
2883 
2884 	task = io_uring_cmd_get_task(cmd);
2885 	io = &ubq->ios[pdu->tag];
2886 	if (WARN_ON_ONCE(task && task != io->task))
2887 		return;
2888 
2889 	ublk_start_cancel(ubq->dev);
2890 
2891 	WARN_ON_ONCE(io->cmd != cmd);
2892 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2893 }
2894 
2895 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2896 {
2897 	return ubq->nr_io_ready == ubq->q_depth;
2898 }
2899 
2900 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2901 {
2902 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2903 }
2904 
2905 static void ublk_cancel_queue(struct ublk_queue *ubq)
2906 {
2907 	int i;
2908 
2909 	if (ublk_support_batch_io(ubq)) {
2910 		ublk_batch_cancel_queue(ubq);
2911 		return;
2912 	}
2913 
2914 	for (i = 0; i < ubq->q_depth; i++)
2915 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2916 }
2917 
2918 /* Cancel all pending commands, must be called after del_gendisk() returns */
2919 static void ublk_cancel_dev(struct ublk_device *ub)
2920 {
2921 	int i;
2922 
2923 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2924 		ublk_cancel_queue(ublk_get_queue(ub, i));
2925 }
2926 
2927 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2928 {
2929 	bool *idle = data;
2930 
2931 	if (blk_mq_request_started(rq)) {
2932 		*idle = false;
2933 		return false;
2934 	}
2935 	return true;
2936 }
2937 
2938 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2939 {
2940 	bool idle;
2941 
2942 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2943 	while (true) {
2944 		idle = true;
2945 		blk_mq_tagset_busy_iter(&ub->tag_set,
2946 				ublk_check_inflight_rq, &idle);
2947 		if (idle)
2948 			break;
2949 		msleep(UBLK_REQUEUE_DELAY_MS);
2950 	}
2951 }
2952 
2953 static void ublk_force_abort_dev(struct ublk_device *ub)
2954 {
2955 	int i;
2956 
2957 	pr_devel("%s: force abort ub: dev_id %d state %s\n",
2958 			__func__, ub->dev_info.dev_id,
2959 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
2960 			"LIVE" : "QUIESCED");
2961 	blk_mq_quiesce_queue(ub->ub_disk->queue);
2962 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2963 		ublk_wait_tagset_rqs_idle(ub);
2964 
2965 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2966 		ublk_get_queue(ub, i)->force_abort = true;
2967 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2968 	/* We may have requeued some rqs in ublk_quiesce_queue() */
2969 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2970 }
2971 
2972 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2973 {
2974 	struct gendisk *disk;
2975 
2976 	/* Sync with ublk_abort_queue() by holding the lock */
2977 	spin_lock(&ub->lock);
2978 	disk = ub->ub_disk;
2979 	ub->dev_info.state = UBLK_S_DEV_DEAD;
2980 	ub->dev_info.ublksrv_pid = -1;
2981 	ub->ub_disk = NULL;
2982 	spin_unlock(&ub->lock);
2983 
2984 	return disk;
2985 }
2986 
2987 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2988 	__must_hold(&ub->mutex)
2989 {
2990 	struct gendisk *disk;
2991 
2992 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2993 		return;
2994 
2995 	if (ublk_nosrv_dev_should_queue_io(ub))
2996 		ublk_force_abort_dev(ub);
2997 	del_gendisk(ub->ub_disk);
2998 	disk = ublk_detach_disk(ub);
2999 	put_disk(disk);
3000 }
3001 
3002 static void ublk_stop_dev(struct ublk_device *ub)
3003 {
3004 	mutex_lock(&ub->mutex);
3005 	ublk_stop_dev_unlocked(ub);
3006 	mutex_unlock(&ub->mutex);
3007 	cancel_work_sync(&ub->partition_scan_work);
3008 	ublk_cancel_dev(ub);
3009 }
3010 
3011 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
3012 {
3013 	/* UBLK_IO_FLAG_CANCELED can be cleared now */
3014 	spin_lock(&ubq->cancel_lock);
3015 	io->flags &= ~UBLK_IO_FLAG_CANCELED;
3016 	spin_unlock(&ubq->cancel_lock);
3017 }
3018 
3019 /* reset per-queue io flags */
3020 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
3021 {
3022 	spin_lock(&ubq->cancel_lock);
3023 	ubq->canceling = false;
3024 	spin_unlock(&ubq->cancel_lock);
3025 	ubq->fail_io = false;
3026 }
3027 
3028 /* device can only be started after all IOs are ready */
3029 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
3030 	struct ublk_io *io)
3031 	__must_hold(&ub->mutex)
3032 {
3033 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
3034 
3035 	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
3036 		ub->unprivileged_daemons = true;
3037 
3038 	ubq->nr_io_ready++;
3039 	ublk_reset_io_flags(ubq, io);
3040 
3041 	/* Check if this specific queue is now fully ready */
3042 	if (ublk_queue_ready(ubq)) {
3043 		ub->nr_queue_ready++;
3044 
3045 		/*
3046 		 * Reset queue flags as soon as this queue is ready.
3047 		 * This clears the canceling flag, allowing batch FETCH commands
3048 		 * to succeed during recovery without waiting for all queues.
3049 		 */
3050 		ublk_queue_reset_io_flags(ubq);
3051 	}
3052 
3053 	/* Check if all queues are ready */
3054 	if (ublk_dev_ready(ub)) {
3055 		/*
3056 		 * All queues ready - clear device-level canceling flag
3057 		 * and complete the recovery/initialization.
3058 		 */
3059 		mutex_lock(&ub->cancel_mutex);
3060 		ub->canceling = false;
3061 		mutex_unlock(&ub->cancel_mutex);
3062 		complete_all(&ub->completion);
3063 	}
3064 }
3065 
3066 static inline int ublk_check_cmd_op(u32 cmd_op)
3067 {
3068 	u32 ioc_type = _IOC_TYPE(cmd_op);
3069 
3070 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3071 		return -EOPNOTSUPP;
3072 
3073 	if (ioc_type != 'u' && ioc_type != 0)
3074 		return -EOPNOTSUPP;
3075 
3076 	return 0;
3077 }
3078 
3079 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3080 {
3081 	struct ublk_auto_buf_reg buf;
3082 
3083 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3084 
3085 	if (buf.reserved0 || buf.reserved1)
3086 		return -EINVAL;
3087 
3088 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3089 		return -EINVAL;
3090 	io->buf.auto_reg = buf;
3091 	return 0;
3092 }
3093 
3094 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3095 				    struct io_uring_cmd *cmd,
3096 				    u16 *buf_idx)
3097 {
3098 	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3099 		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3100 
3101 		/*
3102 		 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3103 		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
3104 		 * `io_ring_ctx`.
3105 		 *
3106 		 * If this uring_cmd's io_ring_ctx isn't same with the
3107 		 * one for registering the buffer, it is ublk server's
3108 		 * responsibility for unregistering the buffer, otherwise
3109 		 * this ublk request gets stuck.
3110 		 */
3111 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3112 			*buf_idx = io->buf.auto_reg.index;
3113 	}
3114 }
3115 
3116 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3117 				    struct io_uring_cmd *cmd,
3118 				    u16 *buf_idx)
3119 {
3120 	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3121 	return ublk_set_auto_buf_reg(io, cmd);
3122 }
3123 
3124 /* Once we return, `io->req` can't be used any more */
3125 static inline struct request *
3126 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3127 {
3128 	struct request *req = io->req;
3129 
3130 	io->cmd = cmd;
3131 	io->flags |= UBLK_IO_FLAG_ACTIVE;
3132 	/* now this cmd slot is owned by ublk driver */
3133 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3134 
3135 	return req;
3136 }
3137 
3138 static inline int
3139 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3140 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3141 		   u16 *buf_idx)
3142 {
3143 	if (ublk_dev_support_auto_buf_reg(ub))
3144 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3145 
3146 	io->buf.addr = buf_addr;
3147 	return 0;
3148 }
3149 
3150 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3151 				    unsigned int issue_flags,
3152 				    struct ublk_queue *ubq, unsigned int tag)
3153 {
3154 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3155 
3156 	/*
3157 	 * Safe to refer to @ubq since ublk_queue won't be died until its
3158 	 * commands are completed
3159 	 */
3160 	pdu->ubq = ubq;
3161 	pdu->tag = tag;
3162 	io_uring_cmd_mark_cancelable(cmd, issue_flags);
3163 }
3164 
3165 static void ublk_io_release(void *priv)
3166 {
3167 	struct request *rq = priv;
3168 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3169 	struct ublk_io *io = &ubq->ios[rq->tag];
3170 
3171 	/*
3172 	 * task_registered_buffers may be 0 if buffers were registered off task
3173 	 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3174 	 */
3175 	if (current == io->task && io->task_registered_buffers)
3176 		io->task_registered_buffers--;
3177 	else
3178 		ublk_put_req_ref(io, rq);
3179 }
3180 
3181 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3182 				struct ublk_device *ub,
3183 				u16 q_id, u16 tag,
3184 				struct ublk_io *io,
3185 				unsigned int index, unsigned int issue_flags)
3186 {
3187 	struct request *req;
3188 	int ret;
3189 
3190 	if (!ublk_dev_support_zero_copy(ub))
3191 		return -EINVAL;
3192 
3193 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3194 	if (!req)
3195 		return -EINVAL;
3196 
3197 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3198 				      issue_flags);
3199 	if (ret) {
3200 		ublk_put_req_ref(io, req);
3201 		return ret;
3202 	}
3203 
3204 	return 0;
3205 }
3206 
3207 static int
3208 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3209 			    struct ublk_device *ub,
3210 			    u16 q_id, u16 tag, struct ublk_io *io,
3211 			    unsigned index, unsigned issue_flags)
3212 {
3213 	unsigned new_registered_buffers;
3214 	struct request *req = io->req;
3215 	int ret;
3216 
3217 	/*
3218 	 * Ensure there are still references for ublk_sub_req_ref() to release.
3219 	 * If not, fall back on the thread-safe buffer registration.
3220 	 */
3221 	new_registered_buffers = io->task_registered_buffers + 1;
3222 	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3223 		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3224 					    issue_flags);
3225 
3226 	if (!ublk_dev_support_zero_copy(ub) || !blk_rq_has_data(req))
3227 		return -EINVAL;
3228 
3229 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3230 				      issue_flags);
3231 	if (ret)
3232 		return ret;
3233 
3234 	io->task_registered_buffers = new_registered_buffers;
3235 	return 0;
3236 }
3237 
3238 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3239 				  const struct ublk_device *ub,
3240 				  unsigned int index, unsigned int issue_flags)
3241 {
3242 	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3243 		return -EINVAL;
3244 
3245 	return io_buffer_unregister_bvec(cmd, index, issue_flags);
3246 }
3247 
3248 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3249 {
3250 	if (ublk_dev_need_map_io(ub)) {
3251 		/*
3252 		 * FETCH_RQ has to provide IO buffer if NEED GET
3253 		 * DATA is not enabled
3254 		 */
3255 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3256 			return -EINVAL;
3257 	} else if (buf_addr) {
3258 		/* User copy requires addr to be unset */
3259 		return -EINVAL;
3260 	}
3261 	return 0;
3262 }
3263 
3264 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3265 			struct ublk_io *io, u16 q_id)
3266 {
3267 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3268 	if (ublk_dev_ready(ub))
3269 		return -EBUSY;
3270 
3271 	/* allow each command to be FETCHed at most once */
3272 	if (io->flags & UBLK_IO_FLAG_ACTIVE)
3273 		return -EINVAL;
3274 
3275 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3276 
3277 	ublk_fill_io_cmd(io, cmd);
3278 
3279 	if (ublk_dev_support_batch_io(ub))
3280 		WRITE_ONCE(io->task, NULL);
3281 	else
3282 		WRITE_ONCE(io->task, get_task_struct(current));
3283 
3284 	return 0;
3285 }
3286 
3287 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3288 		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
3289 {
3290 	int ret;
3291 
3292 	/*
3293 	 * When handling FETCH command for setting up ublk uring queue,
3294 	 * ub->mutex is the innermost lock, and we won't block for handling
3295 	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3296 	 */
3297 	mutex_lock(&ub->mutex);
3298 	ret = __ublk_fetch(cmd, ub, io, q_id);
3299 	if (!ret)
3300 		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3301 	if (!ret)
3302 		ublk_mark_io_ready(ub, q_id, io);
3303 	mutex_unlock(&ub->mutex);
3304 	return ret;
3305 }
3306 
3307 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3308 				       struct ublk_io *io, __u64 buf_addr)
3309 {
3310 	struct request *req = io->req;
3311 
3312 	if (ublk_dev_need_map_io(ub)) {
3313 		/*
3314 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3315 		 * NEED GET DATA is not enabled or it is Read IO.
3316 		 */
3317 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3318 					req_op(req) == REQ_OP_READ))
3319 			return -EINVAL;
3320 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3321 		/*
3322 		 * User copy requires addr to be unset when command is
3323 		 * not zone append
3324 		 */
3325 		return -EINVAL;
3326 	}
3327 
3328 	return 0;
3329 }
3330 
3331 static bool ublk_need_complete_req(const struct ublk_device *ub,
3332 				   struct ublk_io *io)
3333 {
3334 	if (ublk_dev_need_req_ref(ub))
3335 		return ublk_sub_req_ref(io);
3336 	return true;
3337 }
3338 
3339 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3340 			  struct request *req)
3341 {
3342 	/*
3343 	 * We have handled UBLK_IO_NEED_GET_DATA command,
3344 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3345 	 * do the copy work.
3346 	 */
3347 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3348 	/* update iod->addr because ublksrv may have passed a new io buffer */
3349 	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3350 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3351 			__func__, ubq->q_id, req->tag, io->flags,
3352 			ublk_get_iod(ubq, req->tag)->addr);
3353 
3354 	return ublk_start_io(ubq, req, io);
3355 }
3356 
3357 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3358 		unsigned int issue_flags)
3359 {
3360 	/* May point to userspace-mapped memory */
3361 	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3362 							       struct ublksrv_io_cmd);
3363 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3364 	struct ublk_device *ub = cmd->file->private_data;
3365 	struct ublk_queue *ubq;
3366 	struct ublk_io *io = NULL;
3367 	u32 cmd_op = cmd->cmd_op;
3368 	u16 q_id = READ_ONCE(ub_src->q_id);
3369 	u16 tag = READ_ONCE(ub_src->tag);
3370 	s32 result = READ_ONCE(ub_src->result);
3371 	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3372 	struct request *req;
3373 	int ret;
3374 	bool compl;
3375 
3376 	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3377 
3378 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3379 			__func__, cmd->cmd_op, q_id, tag, result);
3380 
3381 	ret = ublk_check_cmd_op(cmd_op);
3382 	if (ret)
3383 		goto out;
3384 
3385 	/*
3386 	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3387 	 * so no need to validate the q_id, tag, or task
3388 	 */
3389 	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3390 		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3391 
3392 	ret = -EINVAL;
3393 	if (q_id >= ub->dev_info.nr_hw_queues)
3394 		goto out;
3395 
3396 	ubq = ublk_get_queue(ub, q_id);
3397 
3398 	if (tag >= ub->dev_info.queue_depth)
3399 		goto out;
3400 
3401 	io = &ubq->ios[tag];
3402 	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3403 	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3404 		ret = ublk_check_fetch_buf(ub, addr);
3405 		if (ret)
3406 			goto out;
3407 		ret = ublk_fetch(cmd, ub, io, addr, q_id);
3408 		if (ret)
3409 			goto out;
3410 
3411 		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3412 		return -EIOCBQUEUED;
3413 	}
3414 
3415 	if (READ_ONCE(io->task) != current) {
3416 		/*
3417 		 * ublk_register_io_buf() accesses only the io's refcount,
3418 		 * so can be handled on any task
3419 		 */
3420 		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3421 			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3422 						    addr, issue_flags);
3423 
3424 		goto out;
3425 	}
3426 
3427 	/* there is pending io cmd, something must be wrong */
3428 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3429 		ret = -EBUSY;
3430 		goto out;
3431 	}
3432 
3433 	/*
3434 	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3435 	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
3436 	 */
3437 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3438 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3439 		goto out;
3440 
3441 	switch (_IOC_NR(cmd_op)) {
3442 	case UBLK_IO_REGISTER_IO_BUF:
3443 		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3444 						   issue_flags);
3445 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
3446 		ret = ublk_check_commit_and_fetch(ub, io, addr);
3447 		if (ret)
3448 			goto out;
3449 		io->res = result;
3450 		req = ublk_fill_io_cmd(io, cmd);
3451 		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3452 		if (buf_idx != UBLK_INVALID_BUF_IDX)
3453 			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3454 		compl = ublk_need_complete_req(ub, io);
3455 
3456 		if (req_op(req) == REQ_OP_ZONE_APPEND)
3457 			req->__sector = addr;
3458 		if (compl)
3459 			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3460 
3461 		if (ret)
3462 			goto out;
3463 		break;
3464 	case UBLK_IO_NEED_GET_DATA:
3465 		/*
3466 		 * ublk_get_data() may fail and fallback to requeue, so keep
3467 		 * uring_cmd active first and prepare for handling new requeued
3468 		 * request
3469 		 */
3470 		req = ublk_fill_io_cmd(io, cmd);
3471 		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3472 		WARN_ON_ONCE(ret);
3473 		if (likely(ublk_get_data(ubq, io, req))) {
3474 			__ublk_prep_compl_io_cmd(io, req);
3475 			return UBLK_IO_RES_OK;
3476 		}
3477 		break;
3478 	default:
3479 		goto out;
3480 	}
3481 	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3482 	return -EIOCBQUEUED;
3483 
3484  out:
3485 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3486 			__func__, cmd_op, tag, ret, io ? io->flags : 0);
3487 	return ret;
3488 }
3489 
3490 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3491 		u16 q_id, u16 tag, struct ublk_io *io)
3492 {
3493 	struct request *req;
3494 
3495 	/*
3496 	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3497 	 * which would overwrite it with io->cmd
3498 	 */
3499 	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3500 	if (!req)
3501 		return NULL;
3502 
3503 	if (!ublk_get_req_ref(io))
3504 		return NULL;
3505 
3506 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3507 		goto fail_put;
3508 
3509 	if (!blk_rq_has_data(req))
3510 		goto fail_put;
3511 
3512 	return req;
3513 fail_put:
3514 	ublk_put_req_ref(io, req);
3515 	return NULL;
3516 }
3517 
3518 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3519 {
3520 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3521 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3522 	int ret = -ECANCELED;
3523 
3524 	if (!tw.cancel)
3525 		ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3526 	if (ret != -EIOCBQUEUED)
3527 		io_uring_cmd_done(cmd, ret, issue_flags);
3528 }
3529 
3530 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3531 {
3532 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3533 		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3534 		return 0;
3535 	}
3536 
3537 	/* well-implemented server won't run into unlocked */
3538 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3539 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3540 		return -EIOCBQUEUED;
3541 	}
3542 
3543 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
3544 }
3545 
3546 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3547 					const struct ublk_elem_header *elem)
3548 {
3549 	const void *buf = elem;
3550 
3551 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3552 		return *(const __u64 *)(buf + sizeof(*elem));
3553 	return 0;
3554 }
3555 
3556 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3557 					const struct ublk_elem_header *elem)
3558 {
3559 	const void *buf = elem;
3560 
3561 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3562 		return *(const __u64 *)(buf + sizeof(*elem) +
3563 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3564 	return -1;
3565 }
3566 
3567 static struct ublk_auto_buf_reg
3568 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3569 			const struct ublk_elem_header *elem)
3570 {
3571 	struct ublk_auto_buf_reg reg = {
3572 		.index = elem->buf_index,
3573 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3574 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3575 	};
3576 
3577 	return reg;
3578 }
3579 
3580 /*
3581  * 48 can hold any type of buffer element(8, 16 and 24 bytes) because
3582  * it is the least common multiple(LCM) of 8, 16 and 24
3583  */
3584 #define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
3585 struct ublk_batch_io_iter {
3586 	void __user *uaddr;
3587 	unsigned done, total;
3588 	unsigned char elem_bytes;
3589 	/* copy to this buffer from user space */
3590 	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3591 };
3592 
3593 static inline int
3594 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3595 		    struct ublk_batch_io_iter *iter,
3596 		    const struct ublk_batch_io_data *data,
3597 		    unsigned bytes,
3598 		    int (*cb)(struct ublk_queue *q,
3599 			    const struct ublk_batch_io_data *data,
3600 			    const struct ublk_elem_header *elem))
3601 {
3602 	unsigned int i;
3603 	int ret = 0;
3604 
3605 	for (i = 0; i < bytes; i += iter->elem_bytes) {
3606 		const struct ublk_elem_header *elem =
3607 			(const struct ublk_elem_header *)&iter->buf[i];
3608 
3609 		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3610 			ret = -EINVAL;
3611 			break;
3612 		}
3613 
3614 		ret = cb(ubq, data, elem);
3615 		if (unlikely(ret))
3616 			break;
3617 	}
3618 
3619 	iter->done += i;
3620 	return ret;
3621 }
3622 
3623 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3624 			     const struct ublk_batch_io_data *data,
3625 			     int (*cb)(struct ublk_queue *q,
3626 				     const struct ublk_batch_io_data *data,
3627 				     const struct ublk_elem_header *elem))
3628 {
3629 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3630 	int ret = 0;
3631 
3632 	while (iter->done < iter->total) {
3633 		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3634 
3635 		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3636 			pr_warn("ublk%d: read batch cmd buffer failed\n",
3637 					data->ub->dev_info.dev_id);
3638 			return -EFAULT;
3639 		}
3640 
3641 		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3642 		if (ret)
3643 			return ret;
3644 	}
3645 	return 0;
3646 }
3647 
3648 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3649 				const struct ublk_batch_io_data *data,
3650 				const struct ublk_elem_header *elem)
3651 {
3652 	struct ublk_io *io = &ubq->ios[elem->tag];
3653 
3654 	/*
3655 	 * If queue was ready before this decrement, it won't be anymore,
3656 	 * so we need to decrement the queue ready count and restore the
3657 	 * canceling flag to prevent new requests from being queued.
3658 	 */
3659 	if (ublk_queue_ready(ubq)) {
3660 		data->ub->nr_queue_ready--;
3661 		spin_lock(&ubq->cancel_lock);
3662 		ubq->canceling = true;
3663 		spin_unlock(&ubq->cancel_lock);
3664 	}
3665 	ubq->nr_io_ready--;
3666 
3667 	ublk_io_lock(io);
3668 	io->flags = 0;
3669 	ublk_io_unlock(io);
3670 	return 0;
3671 }
3672 
3673 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3674 				       const struct ublk_batch_io_data *data)
3675 {
3676 	int ret;
3677 
3678 	/* Re-process only what we've already processed, starting from beginning */
3679 	iter->total = iter->done;
3680 	iter->done = 0;
3681 
3682 	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3683 	WARN_ON_ONCE(ret);
3684 }
3685 
3686 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3687 			      const struct ublk_batch_io_data *data,
3688 			      const struct ublk_elem_header *elem)
3689 {
3690 	struct ublk_io *io = &ubq->ios[elem->tag];
3691 	const struct ublk_batch_io *uc = &data->header;
3692 	union ublk_io_buf buf = { 0 };
3693 	int ret;
3694 
3695 	if (ublk_dev_support_auto_buf_reg(data->ub))
3696 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3697 	else if (ublk_dev_need_map_io(data->ub)) {
3698 		buf.addr = ublk_batch_buf_addr(uc, elem);
3699 
3700 		ret = ublk_check_fetch_buf(data->ub, buf.addr);
3701 		if (ret)
3702 			return ret;
3703 	}
3704 
3705 	ublk_io_lock(io);
3706 	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3707 	if (!ret)
3708 		io->buf = buf;
3709 	ublk_io_unlock(io);
3710 
3711 	if (!ret)
3712 		ublk_mark_io_ready(data->ub, ubq->q_id, io);
3713 
3714 	return ret;
3715 }
3716 
3717 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3718 {
3719 	const struct ublk_batch_io *uc = &data->header;
3720 	struct io_uring_cmd *cmd = data->cmd;
3721 	struct ublk_batch_io_iter iter = {
3722 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3723 		.total = uc->nr_elem * uc->elem_bytes,
3724 		.elem_bytes = uc->elem_bytes,
3725 	};
3726 	int ret;
3727 
3728 	mutex_lock(&data->ub->mutex);
3729 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3730 
3731 	if (ret && iter.done)
3732 		ublk_batch_revert_prep_cmd(&iter, data);
3733 	mutex_unlock(&data->ub->mutex);
3734 	return ret;
3735 }
3736 
3737 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3738 				      struct ublk_io *io,
3739 				      union ublk_io_buf *buf)
3740 {
3741 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3742 		return -EBUSY;
3743 
3744 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3745 	if (ublk_need_map_io(ubq) && !buf->addr)
3746 		return -EINVAL;
3747 	return 0;
3748 }
3749 
3750 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3751 				const struct ublk_batch_io_data *data,
3752 				const struct ublk_elem_header *elem)
3753 {
3754 	struct ublk_io *io = &ubq->ios[elem->tag];
3755 	const struct ublk_batch_io *uc = &data->header;
3756 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3757 	union ublk_io_buf buf = { 0 };
3758 	struct request *req = NULL;
3759 	bool auto_reg = false;
3760 	bool compl = false;
3761 	int ret;
3762 
3763 	if (ublk_dev_support_auto_buf_reg(data->ub)) {
3764 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3765 		auto_reg = true;
3766 	} else if (ublk_dev_need_map_io(data->ub))
3767 		buf.addr = ublk_batch_buf_addr(uc, elem);
3768 
3769 	ublk_io_lock(io);
3770 	ret = ublk_batch_commit_io_check(ubq, io, &buf);
3771 	if (!ret) {
3772 		io->res = elem->result;
3773 		io->buf = buf;
3774 		req = ublk_fill_io_cmd(io, data->cmd);
3775 
3776 		if (auto_reg)
3777 			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3778 		compl = ublk_need_complete_req(data->ub, io);
3779 	}
3780 	ublk_io_unlock(io);
3781 
3782 	if (unlikely(ret)) {
3783 		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3784 			__func__, data->ub->dev_info.dev_id, ubq->q_id,
3785 			elem->tag, ret);
3786 		return ret;
3787 	}
3788 
3789 	if (buf_idx != UBLK_INVALID_BUF_IDX)
3790 		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3791 	if (req_op(req) == REQ_OP_ZONE_APPEND)
3792 		req->__sector = ublk_batch_zone_lba(uc, elem);
3793 	if (compl)
3794 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3795 	return 0;
3796 }
3797 
3798 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3799 {
3800 	const struct ublk_batch_io *uc = &data->header;
3801 	struct io_uring_cmd *cmd = data->cmd;
3802 	struct ublk_batch_io_iter iter = {
3803 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3804 		.total = uc->nr_elem * uc->elem_bytes,
3805 		.elem_bytes = uc->elem_bytes,
3806 	};
3807 	DEFINE_IO_COMP_BATCH(iob);
3808 	int ret;
3809 
3810 	data->iob = &iob;
3811 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3812 
3813 	if (iob.complete)
3814 		iob.complete(&iob);
3815 
3816 	return iter.done == 0 ? ret : iter.done;
3817 }
3818 
3819 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3820 {
3821 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
3822 
3823 	if (uc->flags & ~UBLK_BATCH_F_ALL)
3824 		return -EINVAL;
3825 
3826 	/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
3827 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3828 			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3829 		return -EINVAL;
3830 
3831 	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3832 		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3833 	if (uc->elem_bytes != elem_bytes)
3834 		return -EINVAL;
3835 	return 0;
3836 }
3837 
3838 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3839 {
3840 	const struct ublk_batch_io *uc = &data->header;
3841 
3842 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3843 		return -EINVAL;
3844 
3845 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
3846 		return -E2BIG;
3847 
3848 	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3849 			!ublk_dev_is_zoned(data->ub))
3850 		return -EINVAL;
3851 
3852 	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3853 			!ublk_dev_need_map_io(data->ub))
3854 		return -EINVAL;
3855 
3856 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3857 			!ublk_dev_support_auto_buf_reg(data->ub))
3858 		return -EINVAL;
3859 
3860 	return ublk_check_batch_cmd_flags(uc);
3861 }
3862 
3863 static int ublk_batch_attach(struct ublk_queue *ubq,
3864 			     struct ublk_batch_io_data *data,
3865 			     struct ublk_batch_fetch_cmd *fcmd)
3866 {
3867 	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3868 	bool free = false;
3869 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3870 
3871 	spin_lock(&ubq->evts_lock);
3872 	if (unlikely(ubq->force_abort || ubq->canceling)) {
3873 		free = true;
3874 	} else {
3875 		list_add_tail(&fcmd->node, &ubq->fcmd_head);
3876 		new_fcmd = __ublk_acquire_fcmd(ubq);
3877 	}
3878 	spin_unlock(&ubq->evts_lock);
3879 
3880 	if (unlikely(free)) {
3881 		ublk_batch_free_fcmd(fcmd);
3882 		return -ENODEV;
3883 	}
3884 
3885 	pdu->ubq = ubq;
3886 	pdu->fcmd = fcmd;
3887 	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3888 
3889 	if (!new_fcmd)
3890 		goto out;
3891 
3892 	/*
3893 	 * If the two fetch commands are originated from same io_ring_ctx,
3894 	 * run batch dispatch directly. Otherwise, schedule task work for
3895 	 * doing it.
3896 	 */
3897 	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3898 			io_uring_cmd_ctx_handle(fcmd->cmd)) {
3899 		data->cmd = new_fcmd->cmd;
3900 		ublk_batch_dispatch(ubq, data, new_fcmd);
3901 	} else {
3902 		io_uring_cmd_complete_in_task(new_fcmd->cmd,
3903 				ublk_batch_tw_cb);
3904 	}
3905 out:
3906 	return -EIOCBQUEUED;
3907 }
3908 
3909 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3910 {
3911 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3912 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3913 
3914 	if (!fcmd)
3915 		return -ENOMEM;
3916 
3917 	return ublk_batch_attach(ubq, data, fcmd);
3918 }
3919 
3920 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3921 {
3922 	const struct ublk_batch_io *uc = &data->header;
3923 
3924 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3925 		return -EINVAL;
3926 
3927 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3928 		return -EINVAL;
3929 
3930 	if (uc->elem_bytes != sizeof(__u16))
3931 		return -EINVAL;
3932 
3933 	if (uc->flags != 0)
3934 		return -EINVAL;
3935 
3936 	return 0;
3937 }
3938 
3939 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3940 				     unsigned int issue_flags)
3941 {
3942 	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3943 							       struct ublksrv_io_cmd);
3944 	struct ublk_device *ub = cmd->file->private_data;
3945 	unsigned tag = READ_ONCE(ub_cmd->tag);
3946 	unsigned q_id = READ_ONCE(ub_cmd->q_id);
3947 	unsigned index = READ_ONCE(ub_cmd->addr);
3948 	struct ublk_queue *ubq;
3949 	struct ublk_io *io;
3950 
3951 	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3952 		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3953 
3954 	if (q_id >= ub->dev_info.nr_hw_queues)
3955 		return -EINVAL;
3956 
3957 	if (tag >= ub->dev_info.queue_depth)
3958 		return -EINVAL;
3959 
3960 	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3961 		return -EOPNOTSUPP;
3962 
3963 	ubq = ublk_get_queue(ub, q_id);
3964 	io = &ubq->ios[tag];
3965 	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3966 			issue_flags);
3967 }
3968 
3969 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3970 				       unsigned int issue_flags)
3971 {
3972 	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3973 							  struct ublk_batch_io);
3974 	struct ublk_device *ub = cmd->file->private_data;
3975 	struct ublk_batch_io_data data = {
3976 		.ub  = ub,
3977 		.cmd = cmd,
3978 		.header = (struct ublk_batch_io) {
3979 			.q_id = READ_ONCE(uc->q_id),
3980 			.flags = READ_ONCE(uc->flags),
3981 			.nr_elem = READ_ONCE(uc->nr_elem),
3982 			.elem_bytes = READ_ONCE(uc->elem_bytes),
3983 		},
3984 		.issue_flags = issue_flags,
3985 	};
3986 	u32 cmd_op = cmd->cmd_op;
3987 	int ret = -EINVAL;
3988 
3989 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3990 		ublk_batch_cancel_fn(cmd, issue_flags);
3991 		return 0;
3992 	}
3993 
3994 	switch (cmd_op) {
3995 	case UBLK_U_IO_PREP_IO_CMDS:
3996 		ret = ublk_check_batch_cmd(&data);
3997 		if (ret)
3998 			goto out;
3999 		ret = ublk_handle_batch_prep_cmd(&data);
4000 		break;
4001 	case UBLK_U_IO_COMMIT_IO_CMDS:
4002 		ret = ublk_check_batch_cmd(&data);
4003 		if (ret)
4004 			goto out;
4005 		ret = ublk_handle_batch_commit_cmd(&data);
4006 		break;
4007 	case UBLK_U_IO_FETCH_IO_CMDS:
4008 		ret = ublk_validate_batch_fetch_cmd(&data);
4009 		if (ret)
4010 			goto out;
4011 		ret = ublk_handle_batch_fetch_cmd(&data);
4012 		break;
4013 	default:
4014 		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
4015 		break;
4016 	}
4017 out:
4018 	return ret;
4019 }
4020 
4021 static inline bool ublk_check_ubuf_dir(const struct request *req,
4022 		int ubuf_dir)
4023 {
4024 	/* copy ubuf to request pages */
4025 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
4026 	    ubuf_dir == ITER_SOURCE)
4027 		return true;
4028 
4029 	/* copy request pages to ubuf */
4030 	if ((req_op(req) == REQ_OP_WRITE ||
4031 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
4032 	    ubuf_dir == ITER_DEST)
4033 		return true;
4034 
4035 	return false;
4036 }
4037 
4038 static ssize_t
4039 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
4040 {
4041 	struct ublk_device *ub = iocb->ki_filp->private_data;
4042 	struct ublk_queue *ubq;
4043 	struct request *req;
4044 	struct ublk_io *io;
4045 	unsigned data_len;
4046 	bool is_integrity;
4047 	bool on_daemon;
4048 	size_t buf_off;
4049 	u16 tag, q_id;
4050 	ssize_t ret;
4051 
4052 	if (!user_backed_iter(iter))
4053 		return -EACCES;
4054 
4055 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
4056 		return -EACCES;
4057 
4058 	tag = ublk_pos_to_tag(iocb->ki_pos);
4059 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
4060 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
4061 	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
4062 
4063 	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
4064 		return -EINVAL;
4065 
4066 	if (q_id >= ub->dev_info.nr_hw_queues)
4067 		return -EINVAL;
4068 
4069 	ubq = ublk_get_queue(ub, q_id);
4070 	if (!ublk_dev_support_user_copy(ub))
4071 		return -EACCES;
4072 
4073 	if (tag >= ub->dev_info.queue_depth)
4074 		return -EINVAL;
4075 
4076 	io = &ubq->ios[tag];
4077 	on_daemon = current == READ_ONCE(io->task);
4078 	if (on_daemon) {
4079 		/* On daemon, io can't be completed concurrently, so skip ref */
4080 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
4081 			return -EINVAL;
4082 
4083 		req = io->req;
4084 		if (!blk_rq_has_data(req))
4085 			return -EINVAL;
4086 	} else {
4087 		req = __ublk_check_and_get_req(ub, q_id, tag, io);
4088 		if (!req)
4089 			return -EINVAL;
4090 	}
4091 
4092 	if (is_integrity) {
4093 		struct blk_integrity *bi = &req->q->limits.integrity;
4094 
4095 		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
4096 	} else {
4097 		data_len = blk_rq_bytes(req);
4098 	}
4099 	if (buf_off > data_len) {
4100 		ret = -EINVAL;
4101 		goto out;
4102 	}
4103 
4104 	if (!ublk_check_ubuf_dir(req, dir)) {
4105 		ret = -EACCES;
4106 		goto out;
4107 	}
4108 
4109 	if (is_integrity)
4110 		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4111 	else
4112 		ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4113 
4114 out:
4115 	if (!on_daemon)
4116 		ublk_put_req_ref(io, req);
4117 	return ret;
4118 }
4119 
4120 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4121 {
4122 	return ublk_user_copy(iocb, to, ITER_DEST);
4123 }
4124 
4125 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4126 {
4127 	return ublk_user_copy(iocb, from, ITER_SOURCE);
4128 }
4129 
4130 static const struct file_operations ublk_ch_fops = {
4131 	.owner = THIS_MODULE,
4132 	.open = ublk_ch_open,
4133 	.release = ublk_ch_release,
4134 	.read_iter = ublk_ch_read_iter,
4135 	.write_iter = ublk_ch_write_iter,
4136 	.uring_cmd = ublk_ch_uring_cmd,
4137 	.mmap = ublk_ch_mmap,
4138 };
4139 
4140 static const struct file_operations ublk_ch_batch_io_fops = {
4141 	.owner = THIS_MODULE,
4142 	.open = ublk_ch_open,
4143 	.release = ublk_ch_release,
4144 	.read_iter = ublk_ch_read_iter,
4145 	.write_iter = ublk_ch_write_iter,
4146 	.uring_cmd = ublk_ch_batch_io_uring_cmd,
4147 	.mmap = ublk_ch_mmap,
4148 };
4149 
4150 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4151 {
4152 	int size, i;
4153 
4154 	size = ublk_queue_cmd_buf_size(ub);
4155 
4156 	for (i = 0; i < ubq->q_depth; i++) {
4157 		struct ublk_io *io = &ubq->ios[i];
4158 		if (io->task)
4159 			put_task_struct(io->task);
4160 		WARN_ON_ONCE(refcount_read(&io->ref));
4161 		WARN_ON_ONCE(io->task_registered_buffers);
4162 	}
4163 
4164 	if (ubq->io_cmd_buf)
4165 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4166 
4167 	if (ublk_dev_support_batch_io(ub))
4168 		ublk_io_evts_deinit(ubq);
4169 
4170 	kvfree(ubq);
4171 }
4172 
4173 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4174 {
4175 	struct ublk_queue *ubq = ub->queues[q_id];
4176 
4177 	if (!ubq)
4178 		return;
4179 
4180 	__ublk_deinit_queue(ub, ubq);
4181 	ub->queues[q_id] = NULL;
4182 }
4183 
4184 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4185 {
4186 	unsigned int cpu;
4187 
4188 	/* Find first CPU mapped to this queue */
4189 	for_each_possible_cpu(cpu) {
4190 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4191 			return cpu_to_node(cpu);
4192 	}
4193 
4194 	return NUMA_NO_NODE;
4195 }
4196 
4197 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4198 {
4199 	int depth = ub->dev_info.queue_depth;
4200 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4201 	struct ublk_queue *ubq;
4202 	struct page *page;
4203 	int numa_node;
4204 	int size, i, ret;
4205 
4206 	/* Determine NUMA node based on queue's CPU affinity */
4207 	numa_node = ublk_get_queue_numa_node(ub, q_id);
4208 
4209 	/* Allocate queue structure on local NUMA node */
4210 	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4211 			    numa_node);
4212 	if (!ubq)
4213 		return -ENOMEM;
4214 
4215 	spin_lock_init(&ubq->cancel_lock);
4216 	ubq->flags = ub->dev_info.flags;
4217 	ubq->q_id = q_id;
4218 	ubq->q_depth = depth;
4219 	size = ublk_queue_cmd_buf_size(ub);
4220 
4221 	/* Allocate I/O command buffer on local NUMA node */
4222 	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4223 	if (!page) {
4224 		kvfree(ubq);
4225 		return -ENOMEM;
4226 	}
4227 	ubq->io_cmd_buf = page_address(page);
4228 
4229 	for (i = 0; i < ubq->q_depth; i++)
4230 		spin_lock_init(&ubq->ios[i].lock);
4231 
4232 	if (ublk_dev_support_batch_io(ub)) {
4233 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4234 		if (ret)
4235 			goto fail;
4236 		INIT_LIST_HEAD(&ubq->fcmd_head);
4237 	}
4238 	ub->queues[q_id] = ubq;
4239 	ubq->dev = ub;
4240 
4241 	return 0;
4242 fail:
4243 	__ublk_deinit_queue(ub, ubq);
4244 	return ret;
4245 }
4246 
4247 static void ublk_deinit_queues(struct ublk_device *ub)
4248 {
4249 	int i;
4250 
4251 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4252 		ublk_deinit_queue(ub, i);
4253 }
4254 
4255 static int ublk_init_queues(struct ublk_device *ub)
4256 {
4257 	int i, ret;
4258 
4259 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4260 		ret = ublk_init_queue(ub, i);
4261 		if (ret)
4262 			goto fail;
4263 	}
4264 
4265 	init_completion(&ub->completion);
4266 	return 0;
4267 
4268  fail:
4269 	ublk_deinit_queues(ub);
4270 	return ret;
4271 }
4272 
4273 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4274 {
4275 	int i = idx;
4276 	int err;
4277 
4278 	spin_lock(&ublk_idr_lock);
4279 	/* allocate id, if @id >= 0, we're requesting that specific id */
4280 	if (i >= 0) {
4281 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4282 		if (err == -ENOSPC)
4283 			err = -EEXIST;
4284 	} else {
4285 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4286 				GFP_NOWAIT);
4287 	}
4288 	spin_unlock(&ublk_idr_lock);
4289 
4290 	if (err >= 0)
4291 		ub->ub_number = err;
4292 
4293 	return err;
4294 }
4295 
4296 static void ublk_free_dev_number(struct ublk_device *ub)
4297 {
4298 	spin_lock(&ublk_idr_lock);
4299 	idr_remove(&ublk_index_idr, ub->ub_number);
4300 	wake_up_all(&ublk_idr_wq);
4301 	spin_unlock(&ublk_idr_lock);
4302 }
4303 
4304 static void ublk_cdev_rel(struct device *dev)
4305 {
4306 	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4307 
4308 	ublk_buf_cleanup(ub);
4309 	blk_mq_free_tag_set(&ub->tag_set);
4310 	ublk_deinit_queues(ub);
4311 	ublk_free_dev_number(ub);
4312 	mutex_destroy(&ub->mutex);
4313 	mutex_destroy(&ub->cancel_mutex);
4314 	kfree(ub);
4315 }
4316 
4317 static int ublk_add_chdev(struct ublk_device *ub)
4318 {
4319 	struct device *dev = &ub->cdev_dev;
4320 	int minor = ub->ub_number;
4321 	int ret;
4322 
4323 	dev->parent = ublk_misc.this_device;
4324 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4325 	dev->class = &ublk_chr_class;
4326 	dev->release = ublk_cdev_rel;
4327 	device_initialize(dev);
4328 
4329 	ret = dev_set_name(dev, "ublkc%d", minor);
4330 	if (ret)
4331 		goto fail;
4332 
4333 	if (ublk_dev_support_batch_io(ub))
4334 		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4335 	else
4336 		cdev_init(&ub->cdev, &ublk_ch_fops);
4337 	ret = cdev_device_add(&ub->cdev, dev);
4338 	if (ret)
4339 		goto fail;
4340 
4341 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4342 		unprivileged_ublks_added++;
4343 	return 0;
4344  fail:
4345 	put_device(dev);
4346 	return ret;
4347 }
4348 
4349 /* align max io buffer size with PAGE_SIZE */
4350 static void ublk_align_max_io_size(struct ublk_device *ub)
4351 {
4352 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4353 
4354 	ub->dev_info.max_io_buf_bytes =
4355 		round_down(max_io_bytes, PAGE_SIZE);
4356 }
4357 
4358 static int ublk_add_tag_set(struct ublk_device *ub)
4359 {
4360 	if (ublk_dev_support_batch_io(ub))
4361 		ub->tag_set.ops = &ublk_batch_mq_ops;
4362 	else
4363 		ub->tag_set.ops = &ublk_mq_ops;
4364 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4365 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4366 	ub->tag_set.numa_node = NUMA_NO_NODE;
4367 	ub->tag_set.driver_data = ub;
4368 	return blk_mq_alloc_tag_set(&ub->tag_set);
4369 }
4370 
4371 static void ublk_remove(struct ublk_device *ub)
4372 {
4373 	bool unprivileged;
4374 
4375 	ublk_stop_dev(ub);
4376 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
4377 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4378 	ublk_put_device(ub);
4379 
4380 	if (unprivileged)
4381 		unprivileged_ublks_added--;
4382 }
4383 
4384 static struct ublk_device *ublk_get_device_from_id(int idx)
4385 {
4386 	struct ublk_device *ub = NULL;
4387 
4388 	if (idx < 0)
4389 		return NULL;
4390 
4391 	spin_lock(&ublk_idr_lock);
4392 	ub = idr_find(&ublk_index_idr, idx);
4393 	if (ub)
4394 		ub = ublk_get_device(ub);
4395 	spin_unlock(&ublk_idr_lock);
4396 
4397 	return ub;
4398 }
4399 
4400 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4401 {
4402 	rcu_read_lock();
4403 	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4404 	rcu_read_unlock();
4405 
4406 	return ub->ublksrv_tgid == ublksrv_pid;
4407 }
4408 
/*
 * Handle the start-device control command.
 *
 * Builds the queue limits from the parameters stored in ub->params, waits
 * for ub->completion, checks the daemon pid passed in header->data[0]
 * against ub->ublksrv_tgid, then allocates the gendisk and adds it.
 *
 * Returns 0 on success, or:
 *   -EINVAL     non-positive pid, basic params not set, pid mismatch, or
 *               device not ready
 *   -EOPNOTSUPP zoned params given but CONFIG_BLK_DEV_ZONED is disabled
 *   -EINTR      the wait for ub->completion was interrupted
 *   -EEXIST     device is already live or has been used before
 *   other       error from blk_mq_alloc_disk(),
 *               ublk_revalidate_disk_zones() or add_disk()
 */
static int ublk_ctrl_start_dev(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	const struct ublk_param_basic *p = &ub->params.basic;
	int ublksrv_pid = (int)header->data[0];
	struct queue_limits lim = {
		.logical_block_size	= 1 << p->logical_bs_shift,
		.physical_block_size	= 1 << p->physical_bs_shift,
		.io_min			= 1 << p->io_min_shift,
		.io_opt			= 1 << p->io_opt_shift,
		.max_hw_sectors		= p->max_sectors,
		.chunk_sectors		= p->chunk_sectors,
		.virt_boundary_mask	= p->virt_boundary_mask,
		.max_segments		= USHRT_MAX,
		.max_segment_size	= UINT_MAX,
		.dma_alignment		= 3,
	};
	struct gendisk *disk;
	int ret = -EINVAL;

	if (ublksrv_pid <= 0)
		return -EINVAL;
	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
		return -EINVAL;

	/* Optional parameter groups override or extend the basic limits */
	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *pd = &ub->params.discard;

		lim.discard_alignment = pd->discard_alignment;
		lim.discard_granularity = pd->discard_granularity;
		lim.max_hw_discard_sectors = pd->max_discard_sectors;
		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
		lim.max_discard_segments = pd->max_discard_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
		/* note: shadows the outer 'p' (basic params) in this scope */
		const struct ublk_param_zoned *p = &ub->params.zoned;

		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
			return -EOPNOTSUPP;

		lim.features |= BLK_FEAT_ZONED;
		lim.max_active_zones = p->max_active_zones;
		lim.max_open_zones =  p->max_open_zones;
		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
	}

	/* FUA is only advertised together with a volatile write cache */
	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
		lim.features |= BLK_FEAT_WRITE_CACHE;
		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
			lim.features |= BLK_FEAT_FUA;
	}

	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
		lim.features |= BLK_FEAT_ROTATIONAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
		lim.dma_alignment = ub->params.dma.alignment;

	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
		lim.max_segment_size = ub->params.seg.max_segment_size;
		lim.max_segments = ub->params.seg.max_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
		/* note: shadows the outer 'p' (basic params) in this scope */
		const struct ublk_param_integrity *p = &ub->params.integrity;
		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);

		/* zero max_integrity_segments falls back to USHRT_MAX */
		lim.max_integrity_segments =
			p->max_integrity_segments ?: USHRT_MAX;
		lim.integrity = (struct blk_integrity) {
			.flags = ublk_integrity_flags(p->flags),
			.csum_type = ublk_integrity_csum_type(p->csum_type),
			.metadata_size = p->metadata_size,
			.pi_offset = p->pi_offset,
			.interval_exp = p->interval_exp,
			.tag_size = p->tag_size,
			.pi_tuple_size = pi_tuple_size,
		};
	}

	if (wait_for_completion_interruptible(&ub->completion) != 0)
		return -EINTR;

	if (!ublk_validate_user_pid(ub, ublksrv_pid))
		return -EINVAL;

	mutex_lock(&ub->mutex);
	/* device may become not ready in case of F_BATCH */
	if (!ublk_dev_ready(ub)) {
		ret = -EINVAL;
		goto out_unlock;
	}
	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
	    test_bit(UB_STATE_USED, &ub->state)) {
		ret = -EEXIST;
		goto out_unlock;
	}

	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_unlock;
	}
	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
	disk->fops = &ub_fops;
	disk->private_data = ub;

	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
	ub->ub_disk = disk;

	ublk_apply_params(ub);

	/*
	 * Suppress partition scan to avoid potential IO hang.
	 *
	 * If ublk server error occurs during partition scan, the IO may
	 * wait while holding ub->mutex, which can deadlock with other
	 * operations that need the mutex. Defer partition scan to async
	 * work.
	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
	 * permanently.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);

	/* this reference is dropped again on the error path below */
	ublk_get_device(ub);
	ub->dev_info.state = UBLK_S_DEV_LIVE;

	if (ublk_dev_is_zoned(ub)) {
		ret = ublk_revalidate_disk_zones(ub);
		if (ret)
			goto out_put_cdev;
	}

	ret = add_disk(disk);
	if (ret)
		goto out_put_cdev;

	set_bit(UB_STATE_USED, &ub->state);

	/* Skip partition scan if disabled by user */
	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
		/* Not clear for unprivileged daemons, see comment above */
		if (!ub->unprivileged_daemons)
			clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
	} else {
		/* Schedule async partition scan for trusted daemons */
		if (!ub->unprivileged_daemons)
			schedule_work(&ub->partition_scan_work);
	}

out_put_cdev:
	/* reached on success too; ret == 0 makes both cleanups no-ops */
	if (ret) {
		ublk_detach_disk(ub);
		ublk_put_device(ub);
	}
	if (ret)
		put_disk(disk);
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}
4572 
4573 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4574 		const struct ublksrv_ctrl_cmd *header)
4575 {
4576 	void __user *argp = (void __user *)(unsigned long)header->addr;
4577 	cpumask_var_t cpumask;
4578 	unsigned long queue;
4579 	unsigned int retlen;
4580 	unsigned int i;
4581 	int ret;
4582 
4583 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4584 		return -EINVAL;
4585 	if (header->len & (sizeof(unsigned long)-1))
4586 		return -EINVAL;
4587 	if (!header->addr)
4588 		return -EINVAL;
4589 
4590 	queue = header->data[0];
4591 	if (queue >= ub->dev_info.nr_hw_queues)
4592 		return -EINVAL;
4593 
4594 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4595 		return -ENOMEM;
4596 
4597 	for_each_possible_cpu(i) {
4598 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4599 			cpumask_set_cpu(i, cpumask);
4600 	}
4601 
4602 	ret = -EFAULT;
4603 	retlen = min_t(unsigned short, header->len, cpumask_size());
4604 	if (copy_to_user(argp, cpumask, retlen))
4605 		goto out_free_cpumask;
4606 	if (retlen != header->len &&
4607 	    clear_user(argp + retlen, header->len - retlen))
4608 		goto out_free_cpumask;
4609 
4610 	ret = 0;
4611 out_free_cpumask:
4612 	free_cpumask_var(cpumask);
4613 	return ret;
4614 }
4615 
4616 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4617 {
4618 	pr_devel("%s: dev id %d flags %llx\n", __func__,
4619 			info->dev_id, info->flags);
4620 	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4621 			info->nr_hw_queues, info->queue_depth);
4622 }
4623 
/*
 * Handle the add-device control command.
 *
 * Reads a struct ublksrv_ctrl_dev_info from the user buffer described by
 * @header, validates and adjusts the requested flags, allocates the
 * ublk_device with its device number, tag set and queues, writes the
 * resulting dev_info back to userspace, and finally registers the char
 * device.
 *
 * Everything after the flag validation runs under ublk_ctl_mutex.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  malformed header, out-of-range queue parameters, invalid
 *            flag combination, or dev id mismatch / too large
 *   -EFAULT  user buffer cannot be read or written
 *   -EPERM   caller lacks CAP_SYS_ADMIN and did not request
 *            UBLK_F_UNPRIVILEGED_DEV
 *   -EACCES  limit on unprivileged devices reached
 *   -ENOMEM  allocation failure
 *   other    error from mutex_lock_killable(), ublk_alloc_dev_number(),
 *            ublk_add_tag_set(), ublk_init_queues() or ublk_add_chdev()
 */
static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
			__func__, header->queue_id);
		return -EINVAL;
	}

	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
		return -EINVAL;

	/*
	 * Callers with CAP_SYS_ADMIN always get a privileged device; others
	 * must explicitly ask for an unprivileged one.
	 */
	if (capable(CAP_SYS_ADMIN))
		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
		return -EPERM;

	/* forbid nonsense combinations of recovery flags */
	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
	case 0:
	case UBLK_F_USER_RECOVERY:
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
		break;
	default:
		pr_warn("%s: invalid recovery flags %llx\n", __func__,
			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
		return -EINVAL;
	}

	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
		return -EINVAL;
	}

	/*
	 * unprivileged device can't be trusted, but RECOVERY and
	 * RECOVERY_REISSUE still may hang error handling, so can't
	 * support recovery features for unprivileged ublk now
	 *
	 * TODO: provide forward progress for RECOVERY handler, so that
	 * unprivileged device can benefit from it
	 */
	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
				UBLK_F_USER_RECOVERY);

		/*
		 * For USER_COPY, we depends on userspace to fill request
		 * buffer by pwrite() to ublk char device, which can't be
		 * used for unprivileged device
		 *
		 * Same with zero copy or auto buffer register.
		 */
		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
					UBLK_F_AUTO_BUF_REG))
			return -EINVAL;
	}

	/* User copy is required to access integrity buffer */
	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
		return -EINVAL;

	/* the created device is always owned by current user */
	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);

	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
			__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	/* U32_MAX is exempt from the range check */
	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
		pr_warn("%s: dev id is too large. Max supported is %d\n",
			__func__, UBLK_MAX_UBLKS - 1);
		return -EINVAL;
	}

	ublk_dump_dev_info(&info);

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ret = -EACCES;
	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
	    unprivileged_ublks_added >= unprivileged_ublks_max)
		goto out_unlock;

	ret = -ENOMEM;
	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->lock);
	mutex_init(&ub->cancel_mutex);
	mt_init(&ub->buf_tree);
	ida_init(&ub->buf_ida);
	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);

	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * 64bit flags will be copied back to userspace as feature
	 * negotiation result, so have to clear flags which driver
	 * doesn't support yet, then userspace can get correct flags
	 * (features) to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	/* these features are set regardless of what userspace requested */
	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
		UBLK_F_URING_CMD_COMP_IN_TASK |
		UBLK_F_PER_IO_DAEMON |
		UBLK_F_BUF_REG_OFF_DAEMON |
		UBLK_F_SAFE_STOP_DEV;

	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;

	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
				UBLK_F_AUTO_BUF_REG))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/*
	 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
	 * returning write_append_lba, which is only allowed in case of
	 * user copy or zero copy
	 */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	/* never use more hardware queues than there are CPU ids */
	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_tag_set;

	/* report the negotiated dev_info back before exposing the device */
	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_deinit_queues;

	/*
	 * Add the char dev so that ublksrv daemon can be setup.
	 * ublk_add_chdev() will cleanup everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}
4819 
4820 static inline bool ublk_idr_freed(int id)
4821 {
4822 	void *ptr;
4823 
4824 	spin_lock(&ublk_idr_lock);
4825 	ptr = idr_find(&ublk_index_idr, id);
4826 	spin_unlock(&ublk_idr_lock);
4827 
4828 	return ptr == NULL;
4829 }
4830 
4831 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4832 {
4833 	struct ublk_device *ub = *p_ub;
4834 	int idx = ub->ub_number;
4835 	int ret;
4836 
4837 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4838 	if (ret)
4839 		return ret;
4840 
4841 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4842 		ublk_remove(ub);
4843 		set_bit(UB_STATE_DELETED, &ub->state);
4844 	}
4845 
4846 	/* Mark the reference as consumed */
4847 	*p_ub = NULL;
4848 	ublk_put_device(ub);
4849 	mutex_unlock(&ublk_ctl_mutex);
4850 
4851 	/*
4852 	 * Wait until the idr is removed, then it can be reused after
4853 	 * DEL_DEV command is returned.
4854 	 *
4855 	 * If we returns because of user interrupt, future delete command
4856 	 * may come:
4857 	 *
4858 	 * - the device number isn't freed, this device won't or needn't
4859 	 *   be deleted again, since UB_STATE_DELETED is set, and device
4860 	 *   will be released after the last reference is dropped
4861 	 *
4862 	 * - the device number is freed already, we will not find this
4863 	 *   device via ublk_get_device_from_id()
4864 	 */
4865 	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4866 		return -EINTR;
4867 	return 0;
4868 }
4869 
4870 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4871 				      const struct ublksrv_ctrl_cmd *header)
4872 {
4873 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4874 			__func__, cmd_op, header->dev_id, header->queue_id,
4875 			header->data[0], header->addr, header->len);
4876 }
4877 
/* Control-command entry point for stopping a device; wraps ublk_stop_dev() */
static void ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
	return;
}
4882 
4883 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4884 {
4885 	struct gendisk *disk;
4886 	int ret = 0;
4887 
4888 	disk = ublk_get_disk(ub);
4889 	if (!disk)
4890 		return -ENODEV;
4891 
4892 	mutex_lock(&disk->open_mutex);
4893 	if (disk_openers(disk) > 0) {
4894 		ret = -EBUSY;
4895 		goto unlock;
4896 	}
4897 	ub->block_open = true;
4898 	/* release open_mutex as del_gendisk() will reacquire it */
4899 	mutex_unlock(&disk->open_mutex);
4900 
4901 	ublk_ctrl_stop_dev(ub);
4902 	goto out;
4903 
4904 unlock:
4905 	mutex_unlock(&disk->open_mutex);
4906 out:
4907 	ublk_put_disk(disk);
4908 	return ret;
4909 }
4910 
4911 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4912 		const struct ublksrv_ctrl_cmd *header)
4913 {
4914 	struct task_struct *p;
4915 	struct pid *pid;
4916 	struct ublksrv_ctrl_dev_info dev_info;
4917 	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4918 	void __user *argp = (void __user *)(unsigned long)header->addr;
4919 
4920 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4921 		return -EINVAL;
4922 
4923 	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4924 	dev_info.ublksrv_pid = -1;
4925 
4926 	if (init_ublksrv_tgid > 0) {
4927 		rcu_read_lock();
4928 		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4929 		p = pid_task(pid, PIDTYPE_TGID);
4930 		if (p) {
4931 			int vnr = task_tgid_vnr(p);
4932 
4933 			if (vnr)
4934 				dev_info.ublksrv_pid = vnr;
4935 		}
4936 		rcu_read_unlock();
4937 	}
4938 
4939 	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4940 		return -EFAULT;
4941 
4942 	return 0;
4943 }
4944 
4945 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4946 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4947 {
4948 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4949 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4950 
4951 	if (ub->ub_disk) {
4952 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4953 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4954 	} else {
4955 		ub->params.devt.disk_major = 0;
4956 		ub->params.devt.disk_minor = 0;
4957 	}
4958 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4959 }
4960 
4961 static int ublk_ctrl_get_params(struct ublk_device *ub,
4962 		const struct ublksrv_ctrl_cmd *header)
4963 {
4964 	void __user *argp = (void __user *)(unsigned long)header->addr;
4965 	struct ublk_params_header ph;
4966 	int ret;
4967 
4968 	if (header->len <= sizeof(ph) || !header->addr)
4969 		return -EINVAL;
4970 
4971 	if (copy_from_user(&ph, argp, sizeof(ph)))
4972 		return -EFAULT;
4973 
4974 	if (ph.len > header->len || !ph.len)
4975 		return -EINVAL;
4976 
4977 	if (ph.len > sizeof(struct ublk_params))
4978 		ph.len = sizeof(struct ublk_params);
4979 
4980 	mutex_lock(&ub->mutex);
4981 	ublk_ctrl_fill_params_devt(ub);
4982 	if (copy_to_user(argp, &ub->params, ph.len))
4983 		ret = -EFAULT;
4984 	else
4985 		ret = 0;
4986 	mutex_unlock(&ub->mutex);
4987 
4988 	return ret;
4989 }
4990 
4991 static int ublk_ctrl_set_params(struct ublk_device *ub,
4992 		const struct ublksrv_ctrl_cmd *header)
4993 {
4994 	void __user *argp = (void __user *)(unsigned long)header->addr;
4995 	struct ublk_params_header ph;
4996 	int ret = -EFAULT;
4997 
4998 	if (header->len <= sizeof(ph) || !header->addr)
4999 		return -EINVAL;
5000 
5001 	if (copy_from_user(&ph, argp, sizeof(ph)))
5002 		return -EFAULT;
5003 
5004 	if (ph.len > header->len || !ph.len || !ph.types)
5005 		return -EINVAL;
5006 
5007 	if (ph.len > sizeof(struct ublk_params))
5008 		ph.len = sizeof(struct ublk_params);
5009 
5010 	mutex_lock(&ub->mutex);
5011 	if (test_bit(UB_STATE_USED, &ub->state)) {
5012 		/*
5013 		 * Parameters can only be changed when device hasn't
5014 		 * been started yet
5015 		 */
5016 		ret = -EACCES;
5017 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
5018 		/* zero out partial copy so no stale params survive */
5019 		memset(&ub->params, 0, sizeof(ub->params));
5020 		ret = -EFAULT;
5021 	} else {
5022 		/* clear all we don't support yet */
5023 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
5024 		ret = ublk_validate_params(ub);
5025 		if (ret)
5026 			memset(&ub->params, 0, sizeof(ub->params));
5027 	}
5028 	mutex_unlock(&ub->mutex);
5029 
5030 	return ret;
5031 }
5032 
5033 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
5034 {
5035 	int ret = -EINVAL;
5036 
5037 	mutex_lock(&ub->mutex);
5038 	if (ublk_nosrv_should_stop_dev(ub))
5039 		goto out_unlock;
5040 	/*
5041 	 * START_RECOVERY is only allowd after:
5042 	 *
5043 	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
5044 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
5045 	 *     released.
5046 	 *
5047 	 * and one of the following holds
5048 	 *
5049 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
5050 	 *     (a)has quiesced request queue
5051 	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
5052 	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
5053 	 *     (d)has completed/camceled all ioucmds owned by ther dying process
5054 	 *
5055 	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
5056 	 *     quiesced, but all I/O is being immediately errored
5057 	 */
5058 	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
5059 		ret = -EBUSY;
5060 		goto out_unlock;
5061 	}
5062 	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
5063 	init_completion(&ub->completion);
5064 	ret = 0;
5065  out_unlock:
5066 	mutex_unlock(&ub->mutex);
5067 	return ret;
5068 }
5069 
5070 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
5071 		const struct ublksrv_ctrl_cmd *header)
5072 {
5073 	int ublksrv_pid = (int)header->data[0];
5074 	int ret = -EINVAL;
5075 
5076 	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
5077 		 header->dev_id);
5078 
5079 	if (wait_for_completion_interruptible(&ub->completion))
5080 		return -EINTR;
5081 
5082 	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
5083 		 header->dev_id);
5084 
5085 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
5086 		return -EINVAL;
5087 
5088 	mutex_lock(&ub->mutex);
5089 	if (ublk_nosrv_should_stop_dev(ub))
5090 		goto out_unlock;
5091 
5092 	if (!ublk_dev_in_recoverable_state(ub)) {
5093 		ret = -EBUSY;
5094 		goto out_unlock;
5095 	}
5096 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
5097 	ub->dev_info.state = UBLK_S_DEV_LIVE;
5098 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
5099 			__func__, ublksrv_pid, header->dev_id);
5100 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
5101 	ret = 0;
5102  out_unlock:
5103 	mutex_unlock(&ub->mutex);
5104 	return ret;
5105 }
5106 
5107 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5108 {
5109 	void __user *argp = (void __user *)(unsigned long)header->addr;
5110 	u64 features = UBLK_F_ALL;
5111 
5112 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
5113 		return -EINVAL;
5114 
5115 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5116 		return -EFAULT;
5117 
5118 	return 0;
5119 }
5120 
5121 static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5122 {
5123 	struct ublk_param_basic *p = &ub->params.basic;
5124 	u64 new_size = header->data[0];
5125 	int ret = 0;
5126 
5127 	mutex_lock(&ub->mutex);
5128 	if (!ub->ub_disk) {
5129 		ret = -ENODEV;
5130 		goto out;
5131 	}
5132 	p->dev_sectors = new_size;
5133 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5134 out:
5135 	mutex_unlock(&ub->mutex);
5136 	return ret;
5137 }
5138 
/* Cursor passed to ublk_count_busy_req() through blk_mq_tagset_busy_iter() */
struct count_busy {
	/* queue whose requests are being counted */
	const struct ublk_queue *ubq;
	/* number of matching requests seen so far */
	unsigned int nr_busy;
};
5143 
5144 static bool ublk_count_busy_req(struct request *rq, void *data)
5145 {
5146 	struct count_busy *idle = data;
5147 
5148 	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5149 		idle->nr_busy += 1;
5150 	return true;
5151 }
5152 
5153 /* uring_cmd is guaranteed to be active if the associated request is idle */
5154 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5155 {
5156 	struct count_busy data = {
5157 		.ubq = ubq,
5158 	};
5159 
5160 	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5161 	return data.nr_busy < ubq->q_depth;
5162 }
5163 
5164 /* Wait until each hw queue has at least one idle IO */
5165 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5166 				 unsigned int timeout_ms)
5167 {
5168 	unsigned int elapsed = 0;
5169 	int ret;
5170 
5171 	/*
5172 	 * For UBLK_F_BATCH_IO ublk server can get notified with existing
5173 	 * or new fetch command, so needn't wait any more
5174 	 */
5175 	if (ublk_dev_support_batch_io(ub))
5176 		return 0;
5177 
5178 	while (elapsed < timeout_ms && !signal_pending(current)) {
5179 		unsigned int queues_cancelable = 0;
5180 		int i;
5181 
5182 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5183 			struct ublk_queue *ubq = ublk_get_queue(ub, i);
5184 
5185 			queues_cancelable += !!ubq_has_idle_io(ubq);
5186 		}
5187 
5188 		/*
5189 		 * Each queue needs at least one active command for
5190 		 * notifying ublk server
5191 		 */
5192 		if (queues_cancelable == ub->dev_info.nr_hw_queues)
5193 			break;
5194 
5195 		msleep(UBLK_REQUEUE_DELAY_MS);
5196 		elapsed += UBLK_REQUEUE_DELAY_MS;
5197 	}
5198 
5199 	if (signal_pending(current))
5200 		ret = -EINTR;
5201 	else if (elapsed >= timeout_ms)
5202 		ret = -EBUSY;
5203 	else
5204 		ret = 0;
5205 
5206 	return ret;
5207 }
5208 
5209 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5210 				 const struct ublksrv_ctrl_cmd *header)
5211 {
5212 	/* zero means wait forever */
5213 	u64 timeout_ms = header->data[0];
5214 	struct gendisk *disk;
5215 	int ret = -ENODEV;
5216 
5217 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5218 		return -EOPNOTSUPP;
5219 
5220 	mutex_lock(&ub->mutex);
5221 	disk = ublk_get_disk(ub);
5222 	if (!disk)
5223 		goto unlock;
5224 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5225 		goto put_disk;
5226 
5227 	ret = 0;
5228 	/* already in expected state */
5229 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5230 		goto put_disk;
5231 
5232 	/* Mark the device as canceling */
5233 	mutex_lock(&ub->cancel_mutex);
5234 	blk_mq_quiesce_queue(disk->queue);
5235 	ublk_set_canceling(ub, true);
5236 	blk_mq_unquiesce_queue(disk->queue);
5237 	mutex_unlock(&ub->cancel_mutex);
5238 
5239 	if (!timeout_ms)
5240 		timeout_ms = UINT_MAX;
5241 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5242 
5243 put_disk:
5244 	ublk_put_disk(disk);
5245 unlock:
5246 	mutex_unlock(&ub->mutex);
5247 
5248 	/* Cancel pending uring_cmd */
5249 	if (!ret)
5250 		ublk_cancel_dev(ub);
5251 	return ret;
5252 }
5253 
5254 /*
5255  * All control commands are sent via /dev/ublk-control, so we have to check
5256  * the destination device's permission
5257  */
5258 static int ublk_char_dev_permission(struct ublk_device *ub,
5259 		const char *dev_path, int mask)
5260 {
5261 	int err;
5262 	struct path path;
5263 	struct kstat stat;
5264 
5265 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5266 	if (err)
5267 		return err;
5268 
5269 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5270 	if (err)
5271 		goto exit;
5272 
5273 	err = -EPERM;
5274 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5275 		goto exit;
5276 
5277 	err = inode_permission(&nop_mnt_idmap,
5278 			d_backing_inode(path.dentry), mask);
5279 exit:
5280 	path_put(&path);
5281 	return err;
5282 }
5283 
5284 /*
5285  * Lock for maple tree modification: acquire ub->mutex, then freeze queue
5286  * if device is started. If device is not yet started, only mutex is
5287  * needed since no I/O path can access the tree.
5288  *
5289  * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5290  * already holds ub->mutex when calling del_gendisk() which freezes the queue.
5291 */
5292 static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
5293 {
5294 	unsigned int memflags = 0;
5295 
5296 	mutex_lock(&ub->mutex);
5297 	if (ub->ub_disk)
5298 		memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
5299 
5300 	return memflags;
5301 }
5302 
5303 static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
5304 {
5305 	if (ub->ub_disk)
5306 		blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
5307 	mutex_unlock(&ub->mutex);
5308 }
5309 
5310 /* Erase coalesced PFN ranges from the maple tree matching buf_index */
5311 static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
5312 {
5313 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5314 	struct ublk_buf_range *range;
5315 
5316 	mas_lock(&mas);
5317 	mas_for_each(&mas, range, ULONG_MAX) {
5318 		if (range->buf_index == buf_index) {
5319 			mas_erase(&mas);
5320 			kfree(range);
5321 		}
5322 	}
5323 	mas_unlock(&mas);
5324 }
5325 
5326 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
5327 			       struct page **pages, unsigned long nr_pages,
5328 			       int index, unsigned short flags)
5329 {
5330 	unsigned long i;
5331 	int ret;
5332 
5333 	for (i = 0; i < nr_pages; i++) {
5334 		unsigned long pfn = page_to_pfn(pages[i]);
5335 		unsigned long start = i;
5336 		struct ublk_buf_range *range;
5337 
5338 		/* Find run of consecutive PFNs */
5339 		while (i + 1 < nr_pages &&
5340 		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
5341 			i++;
5342 
5343 		range = kzalloc(sizeof(*range), GFP_KERNEL);
5344 		if (!range) {
5345 			ret = -ENOMEM;
5346 			goto unwind;
5347 		}
5348 		range->buf_index = index;
5349 		range->flags = flags;
5350 		range->base_offset = start << PAGE_SHIFT;
5351 
5352 		ret = mtree_insert_range(&ub->buf_tree, pfn,
5353 					 pfn + (i - start),
5354 					 range, GFP_KERNEL);
5355 		if (ret) {
5356 			kfree(range);
5357 			goto unwind;
5358 		}
5359 	}
5360 	return 0;
5361 
5362 unwind:
5363 	ublk_buf_erase_ranges(ub, index);
5364 	return ret;
5365 }
5366 
5367 /*
5368  * Register a shared memory buffer for zero-copy I/O.
5369  * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
5370  * internally. Returns buffer index (>= 0) on success.
5371  */
5372 static int ublk_ctrl_reg_buf(struct ublk_device *ub,
5373 			     struct ublksrv_ctrl_cmd *header)
5374 {
5375 	void __user *argp = (void __user *)(unsigned long)header->addr;
5376 	struct ublk_shmem_buf_reg buf_reg;
5377 	unsigned long nr_pages;
5378 	struct page **pages = NULL;
5379 	unsigned int gup_flags;
5380 	unsigned int memflags;
5381 	long pinned;
5382 	int index;
5383 	int ret;
5384 
5385 	if (!ublk_dev_support_shmem_zc(ub))
5386 		return -EOPNOTSUPP;
5387 
5388 	memset(&buf_reg, 0, sizeof(buf_reg));
5389 	if (copy_from_user(&buf_reg, argp,
5390 			   min_t(size_t, header->len, sizeof(buf_reg))))
5391 		return -EFAULT;
5392 
5393 	if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
5394 		return -EINVAL;
5395 
5396 	if (buf_reg.reserved)
5397 		return -EINVAL;
5398 
5399 	if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
5400 	    !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
5401 		return -EINVAL;
5402 
5403 	nr_pages = buf_reg.len >> PAGE_SHIFT;
5404 
5405 	/* Pin pages before any locks (may sleep) */
5406 	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
5407 	if (!pages)
5408 		return -ENOMEM;
5409 
5410 	gup_flags = FOLL_LONGTERM;
5411 	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
5412 		gup_flags |= FOLL_WRITE;
5413 
5414 	pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
5415 	if (pinned < 0) {
5416 		ret = pinned;
5417 		goto err_free_pages;
5418 	}
5419 	if (pinned != nr_pages) {
5420 		ret = -EFAULT;
5421 		goto err_unpin;
5422 	}
5423 
5424 	memflags = ublk_lock_buf_tree(ub);
5425 
5426 	index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
5427 	if (index < 0) {
5428 		ret = index;
5429 		goto err_unlock;
5430 	}
5431 
5432 	ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
5433 	if (ret) {
5434 		ida_free(&ub->buf_ida, index);
5435 		goto err_unlock;
5436 	}
5437 
5438 	ublk_unlock_buf_tree(ub, memflags);
5439 	kvfree(pages);
5440 	return index;
5441 
5442 err_unlock:
5443 	ublk_unlock_buf_tree(ub, memflags);
5444 err_unpin:
5445 	unpin_user_pages(pages, pinned);
5446 err_free_pages:
5447 	kvfree(pages);
5448 	return ret;
5449 }
5450 
5451 static void ublk_unpin_range_pages(unsigned long base_pfn,
5452 				   unsigned long nr_pages)
5453 {
5454 #define UBLK_UNPIN_BATCH	32
5455 	struct page *pages[UBLK_UNPIN_BATCH];
5456 	unsigned long off;
5457 
5458 	for (off = 0; off < nr_pages; ) {
5459 		unsigned int batch = min_t(unsigned long,
5460 					   nr_pages - off, UBLK_UNPIN_BATCH);
5461 		unsigned int j;
5462 
5463 		for (j = 0; j < batch; j++)
5464 			pages[j] = pfn_to_page(base_pfn + off + j);
5465 		unpin_user_pages(pages, batch);
5466 		off += batch;
5467 	}
5468 }
5469 
5470 /*
5471  * Inner loop: erase up to UBLK_REMOVE_BATCH matching ranges under
5472  * mas_lock, collecting them into an xarray. Then drop the lock and
5473  * unpin pages + free ranges outside spinlock context.
5474  *
5475  * Returns true if the tree walk completed, false if more ranges remain.
5476  * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value().
5477  */
5478 #define UBLK_REMOVE_BATCH	64
5479 
5480 static bool __ublk_shmem_remove_ranges(struct ublk_device *ub,
5481 					int buf_index, int *ret)
5482 {
5483 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5484 	struct ublk_buf_range *range;
5485 	struct xarray to_unpin;
5486 	unsigned long idx;
5487 	unsigned int count = 0;
5488 	bool done = false;
5489 	void *entry;
5490 
5491 	xa_init(&to_unpin);
5492 
5493 	mas_lock(&mas);
5494 	mas_for_each(&mas, range, ULONG_MAX) {
5495 		unsigned long nr;
5496 
5497 		if (buf_index >= 0 && range->buf_index != buf_index)
5498 			continue;
5499 
5500 		*ret = 0;
5501 		nr = mas.last - mas.index + 1;
5502 		if (xa_err(xa_store(&to_unpin, mas.index,
5503 				    xa_mk_value(nr), GFP_ATOMIC)))
5504 			goto unlock;
5505 		mas_erase(&mas);
5506 		kfree(range);
5507 		if (++count >= UBLK_REMOVE_BATCH)
5508 			goto unlock;
5509 	}
5510 	done = true;
5511 unlock:
5512 	mas_unlock(&mas);
5513 
5514 	xa_for_each(&to_unpin, idx, entry)
5515 		ublk_unpin_range_pages(idx, xa_to_value(entry));
5516 	xa_destroy(&to_unpin);
5517 
5518 	return done;
5519 }
5520 
5521 /*
5522  * Remove ranges from the maple tree matching buf_index, unpin pages
5523  * and free range structs. If buf_index < 0, remove all ranges.
5524  * Processes ranges in batches to avoid holding the maple tree spinlock
5525  * across potentially expensive page unpinning.
5526  */
5527 static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index)
5528 {
5529 	int ret = -ENOENT;
5530 
5531 	while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret))
5532 		cond_resched();
5533 	return ret;
5534 }
5535 
5536 static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
5537 			       struct ublksrv_ctrl_cmd *header)
5538 {
5539 	int index = (int)header->data[0];
5540 	unsigned int memflags;
5541 	int ret;
5542 
5543 	if (!ublk_dev_support_shmem_zc(ub))
5544 		return -EOPNOTSUPP;
5545 
5546 	if (index < 0 || index > USHRT_MAX)
5547 		return -EINVAL;
5548 
5549 	memflags = ublk_lock_buf_tree(ub);
5550 
5551 	ret = ublk_shmem_remove_ranges(ub, index);
5552 	if (!ret)
5553 		ida_free(&ub->buf_ida, index);
5554 
5555 	ublk_unlock_buf_tree(ub, memflags);
5556 	return ret;
5557 }
5558 
5559 static void ublk_buf_cleanup(struct ublk_device *ub)
5560 {
5561 	ublk_shmem_remove_ranges(ub, -1);
5562 	mtree_destroy(&ub->buf_tree);
5563 	ida_destroy(&ub->buf_ida);
5564 }
5565 
5566 /* Check if request pages match a registered shared memory buffer */
5567 static bool ublk_try_buf_match(struct ublk_device *ub,
5568 				   struct request *rq,
5569 				   u32 *buf_idx, u32 *buf_off)
5570 {
5571 	struct req_iterator iter;
5572 	struct bio_vec bv;
5573 	int index = -1;
5574 	unsigned long expected_offset = 0;
5575 	bool first = true;
5576 
5577 	rq_for_each_bvec(bv, rq, iter) {
5578 		unsigned long pfn = page_to_pfn(bv.bv_page);
5579 		unsigned long end_pfn = pfn +
5580 			((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
5581 		struct ublk_buf_range *range;
5582 		unsigned long off;
5583 		MA_STATE(mas, &ub->buf_tree, pfn, pfn);
5584 
5585 		range = mas_walk(&mas);
5586 		if (!range)
5587 			return false;
5588 
5589 		/* verify all pages in this bvec fall within the range */
5590 		if (end_pfn > mas.last)
5591 			return false;
5592 
5593 		off = range->base_offset +
5594 			(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
5595 
5596 		if (first) {
5597 			/* Read-only buffer can't serve READ (kernel writes) */
5598 			if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
5599 			    req_op(rq) != REQ_OP_WRITE)
5600 				return false;
5601 			index = range->buf_index;
5602 			expected_offset = off;
5603 			*buf_off = off;
5604 			first = false;
5605 		} else {
5606 			if (range->buf_index != index)
5607 				return false;
5608 			if (off != expected_offset)
5609 				return false;
5610 		}
5611 		expected_offset += bv.bv_len;
5612 	}
5613 
5614 	if (first)
5615 		return false;
5616 
5617 	*buf_idx = index;
5618 	return true;
5619 }
5620 
5621 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5622 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5623 {
5624 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5625 	void __user *argp = (void __user *)(unsigned long)header->addr;
5626 	char *dev_path = NULL;
5627 	int ret = 0;
5628 	int mask;
5629 
5630 	if (!unprivileged) {
5631 		if (!capable(CAP_SYS_ADMIN))
5632 			return -EPERM;
5633 		/*
5634 		 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
5635 		 * char_dev_path in payload too, since userspace may not
5636 		 * know if the specified device is created as unprivileged
5637 		 * mode.
5638 		 */
5639 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5640 			return 0;
5641 	}
5642 
5643 	/*
5644 	 * User has to provide the char device path for unprivileged ublk
5645 	 *
5646 	 * header->addr always points to the dev path buffer, and
5647 	 * header->dev_path_len records length of dev path buffer.
5648 	 */
5649 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5650 		return -EINVAL;
5651 
5652 	if (header->len < header->dev_path_len)
5653 		return -EINVAL;
5654 
5655 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5656 	if (IS_ERR(dev_path))
5657 		return PTR_ERR(dev_path);
5658 
5659 	ret = -EINVAL;
5660 	switch (_IOC_NR(cmd_op)) {
5661 	case UBLK_CMD_GET_DEV_INFO:
5662 	case UBLK_CMD_GET_DEV_INFO2:
5663 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5664 	case UBLK_CMD_GET_PARAMS:
5665 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5666 		mask = MAY_READ;
5667 		break;
5668 	case UBLK_CMD_START_DEV:
5669 	case UBLK_CMD_STOP_DEV:
5670 	case UBLK_CMD_ADD_DEV:
5671 	case UBLK_CMD_DEL_DEV:
5672 	case UBLK_CMD_SET_PARAMS:
5673 	case UBLK_CMD_START_USER_RECOVERY:
5674 	case UBLK_CMD_END_USER_RECOVERY:
5675 	case UBLK_CMD_UPDATE_SIZE:
5676 	case UBLK_CMD_QUIESCE_DEV:
5677 	case UBLK_CMD_TRY_STOP_DEV:
5678 	case UBLK_CMD_REG_BUF:
5679 	case UBLK_CMD_UNREG_BUF:
5680 		mask = MAY_READ | MAY_WRITE;
5681 		break;
5682 	default:
5683 		goto exit;
5684 	}
5685 
5686 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5687 	if (!ret) {
5688 		header->len -= header->dev_path_len;
5689 		header->addr += header->dev_path_len;
5690 	}
5691 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5692 			__func__, ub->ub_number, cmd_op,
5693 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5694 			dev_path, ret);
5695 exit:
5696 	kfree(dev_path);
5697 	return ret;
5698 }
5699 
5700 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5701 {
5702 	switch (_IOC_NR(cmd_op)) {
5703 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5704 	case UBLK_CMD_GET_DEV_INFO:
5705 	case UBLK_CMD_GET_DEV_INFO2:
5706 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5707 		return false;
5708 	default:
5709 		return true;
5710 	}
5711 }
5712 
5713 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5714 		unsigned int issue_flags)
5715 {
5716 	/* May point to userspace-mapped memory */
5717 	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5718 								    struct ublksrv_ctrl_cmd);
5719 	struct ublksrv_ctrl_cmd header;
5720 	struct ublk_device *ub = NULL;
5721 	u32 cmd_op = cmd->cmd_op;
5722 	int ret = -EINVAL;
5723 
5724 	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5725 	    issue_flags & IO_URING_F_NONBLOCK)
5726 		return -EAGAIN;
5727 
5728 	if (!(issue_flags & IO_URING_F_SQE128))
5729 		return -EINVAL;
5730 
5731 	header.dev_id = READ_ONCE(ub_src->dev_id);
5732 	header.queue_id = READ_ONCE(ub_src->queue_id);
5733 	header.len = READ_ONCE(ub_src->len);
5734 	header.addr = READ_ONCE(ub_src->addr);
5735 	header.data[0] = READ_ONCE(ub_src->data[0]);
5736 	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5737 	ublk_ctrl_cmd_dump(cmd_op, &header);
5738 
5739 	ret = ublk_check_cmd_op(cmd_op);
5740 	if (ret)
5741 		goto out;
5742 
5743 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5744 		ret = ublk_ctrl_get_features(&header);
5745 		goto out;
5746 	}
5747 
5748 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5749 		ret = -ENODEV;
5750 		ub = ublk_get_device_from_id(header.dev_id);
5751 		if (!ub)
5752 			goto out;
5753 
5754 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5755 		if (ret)
5756 			goto put_dev;
5757 	}
5758 
5759 	switch (_IOC_NR(cmd_op)) {
5760 	case UBLK_CMD_START_DEV:
5761 		ret = ublk_ctrl_start_dev(ub, &header);
5762 		break;
5763 	case UBLK_CMD_STOP_DEV:
5764 		ublk_ctrl_stop_dev(ub);
5765 		ret = 0;
5766 		break;
5767 	case UBLK_CMD_GET_DEV_INFO:
5768 	case UBLK_CMD_GET_DEV_INFO2:
5769 		ret = ublk_ctrl_get_dev_info(ub, &header);
5770 		break;
5771 	case UBLK_CMD_ADD_DEV:
5772 		ret = ublk_ctrl_add_dev(&header);
5773 		break;
5774 	case UBLK_CMD_DEL_DEV:
5775 		ret = ublk_ctrl_del_dev(&ub, true);
5776 		break;
5777 	case UBLK_CMD_DEL_DEV_ASYNC:
5778 		ret = ublk_ctrl_del_dev(&ub, false);
5779 		break;
5780 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5781 		ret = ublk_ctrl_get_queue_affinity(ub, &header);
5782 		break;
5783 	case UBLK_CMD_GET_PARAMS:
5784 		ret = ublk_ctrl_get_params(ub, &header);
5785 		break;
5786 	case UBLK_CMD_SET_PARAMS:
5787 		ret = ublk_ctrl_set_params(ub, &header);
5788 		break;
5789 	case UBLK_CMD_START_USER_RECOVERY:
5790 		ret = ublk_ctrl_start_recovery(ub);
5791 		break;
5792 	case UBLK_CMD_END_USER_RECOVERY:
5793 		ret = ublk_ctrl_end_recovery(ub, &header);
5794 		break;
5795 	case UBLK_CMD_UPDATE_SIZE:
5796 		ret = ublk_ctrl_set_size(ub, &header);
5797 		break;
5798 	case UBLK_CMD_QUIESCE_DEV:
5799 		ret = ublk_ctrl_quiesce_dev(ub, &header);
5800 		break;
5801 	case UBLK_CMD_TRY_STOP_DEV:
5802 		ret = ublk_ctrl_try_stop_dev(ub);
5803 		break;
5804 	case UBLK_CMD_REG_BUF:
5805 		ret = ublk_ctrl_reg_buf(ub, &header);
5806 		break;
5807 	case UBLK_CMD_UNREG_BUF:
5808 		ret = ublk_ctrl_unreg_buf(ub, &header);
5809 		break;
5810 	default:
5811 		ret = -EOPNOTSUPP;
5812 		break;
5813 	}
5814 
5815  put_dev:
5816 	if (ub)
5817 		ublk_put_device(ub);
5818  out:
5819 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5820 			__func__, ret, cmd_op, header.dev_id, header.queue_id);
5821 	return ret;
5822 }
5823 
5824 static const struct file_operations ublk_ctl_fops = {
5825 	.open		= nonseekable_open,
5826 	.uring_cmd      = ublk_ctrl_uring_cmd,
5827 	.owner		= THIS_MODULE,
5828 	.llseek		= noop_llseek,
5829 };
5830 
5831 static struct miscdevice ublk_misc = {
5832 	.minor		= MISC_DYNAMIC_MINOR,
5833 	.name		= "ublk-control",
5834 	.fops		= &ublk_ctl_fops,
5835 };
5836 
5837 static int __init ublk_init(void)
5838 {
5839 	int ret;
5840 
5841 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5842 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5843 	/*
5844 	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5845 	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5846 	 */
5847 	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5848 		     UBLKSRV_IO_INTEGRITY_FLAG);
5849 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5850 
5851 	init_waitqueue_head(&ublk_idr_wq);
5852 
5853 	ret = misc_register(&ublk_misc);
5854 	if (ret)
5855 		return ret;
5856 
5857 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5858 	if (ret)
5859 		goto unregister_mis;
5860 
5861 	ret = class_register(&ublk_chr_class);
5862 	if (ret)
5863 		goto free_chrdev_region;
5864 
5865 	return 0;
5866 
5867 free_chrdev_region:
5868 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5869 unregister_mis:
5870 	misc_deregister(&ublk_misc);
5871 	return ret;
5872 }
5873 
5874 static void __exit ublk_exit(void)
5875 {
5876 	struct ublk_device *ub;
5877 	int id;
5878 
5879 	idr_for_each_entry(&ublk_index_idr, ub, id)
5880 		ublk_remove(ub);
5881 
5882 	class_unregister(&ublk_chr_class);
5883 	misc_deregister(&ublk_misc);
5884 
5885 	idr_destroy(&ublk_index_idr);
5886 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5887 }
5888 
/* Module entry and exit points */
module_init(ublk_init);
module_exit(ublk_exit);
5891 
5892 static int ublk_set_max_unprivileged_ublks(const char *buf,
5893 					   const struct kernel_param *kp)
5894 {
5895 	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5896 }
5897 
5898 static int ublk_get_max_unprivileged_ublks(char *buf,
5899 					   const struct kernel_param *kp)
5900 {
5901 	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5902 }
5903 
5904 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5905 	.set = ublk_set_max_unprivileged_ublks,
5906 	.get = ublk_get_max_unprivileged_ublks,
5907 };
5908 
5909 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5910 		&unprivileged_ublks_max, 0644);
5911 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");
5912 
5913 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5914 MODULE_DESCRIPTION("Userspace block device");
5915 MODULE_LICENSE("GPL");
5916