xref: /linux/drivers/block/ublk_drv.c (revision f7700a4415afb3ac1767a556094e4ef8bd440e41)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - a block device whose IO is handled from userspace
4  *
5  * Makes full use of the io_uring passthrough command for communicating with
6  * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <linux/maple_tree.h>
50 #include <linux/xarray.h>
51 #include <uapi/linux/fs.h>
52 #include <uapi/linux/ublk_cmd.h>
53 
54 #define UBLK_MINORS		(1U << MINORBITS)
55 
56 #define UBLK_INVALID_BUF_IDX 	((u16)-1)
57 
58 /* private ioctl command mirror */
59 #define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
60 #define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
61 #define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
62 #define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
63 #define UBLK_CMD_REG_BUF	_IOC_NR(UBLK_U_CMD_REG_BUF)
64 #define UBLK_CMD_UNREG_BUF	_IOC_NR(UBLK_U_CMD_UNREG_BUF)
65 
66 /* Default max shmem buffer size: 4GB (may be increased in future) */
67 #define UBLK_SHMEM_BUF_SIZE_MAX	(1ULL << 32)
68 
69 #define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
70 #define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
71 
72 /* All UBLK_F_* have to be included into UBLK_F_ALL */
73 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
74 		| UBLK_F_URING_CMD_COMP_IN_TASK \
75 		| UBLK_F_NEED_GET_DATA \
76 		| UBLK_F_USER_RECOVERY \
77 		| UBLK_F_USER_RECOVERY_REISSUE \
78 		| UBLK_F_UNPRIVILEGED_DEV \
79 		| UBLK_F_CMD_IOCTL_ENCODE \
80 		| UBLK_F_USER_COPY \
81 		| UBLK_F_ZONED \
82 		| UBLK_F_USER_RECOVERY_FAIL_IO \
83 		| UBLK_F_UPDATE_SIZE \
84 		| UBLK_F_AUTO_BUF_REG \
85 		| UBLK_F_QUIESCE \
86 		| UBLK_F_PER_IO_DAEMON \
87 		| UBLK_F_BUF_REG_OFF_DAEMON \
88 		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
89 		| UBLK_F_SAFE_STOP_DEV \
90 		| UBLK_F_BATCH_IO \
91 		| UBLK_F_NO_AUTO_PART_SCAN \
92 		| UBLK_F_SHMEM_ZC)
93 
94 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
95 		| UBLK_F_USER_RECOVERY_REISSUE \
96 		| UBLK_F_USER_RECOVERY_FAIL_IO)
97 
98 /* All UBLK_PARAM_TYPE_* should be included here */
99 #define UBLK_PARAM_TYPE_ALL                                \
100 	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
101 	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
102 	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
103 	 UBLK_PARAM_TYPE_INTEGRITY)
104 
105 #define UBLK_BATCH_F_ALL  \
106 	(UBLK_BATCH_F_HAS_ZONE_LBA | \
107 	 UBLK_BATCH_F_HAS_BUF_ADDR | \
108 	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
109 
110 /* ublk batch fetch uring_cmd */
111 struct ublk_batch_fetch_cmd {
112 	struct list_head node;
113 	struct io_uring_cmd *cmd;
114 	unsigned short buf_group;
115 };
116 
117 struct ublk_uring_cmd_pdu {
118 	/*
119 	 * Temporarily store requests belonging to the same batch for queuing
120 	 * them to the daemon context.
121 	 *
122 	 * They could have been stored in the request payload, but we want to
123 	 * avoid extra pre-allocation, and the uring_cmd payload is always
124 	 * free for us.
125 	 */
126 	union {
127 		struct request *req;
128 		struct request *req_list;
129 	};
130 
131 	/*
132 	 * The following two fields are valid for this cmd's whole lifetime,
133 	 * and are set up in the ublk uring_cmd handler
134 	 */
135 	struct ublk_queue *ubq;
136 
137 	union {
138 		u16 tag;
139 		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
140 	};
141 };
142 
143 struct ublk_batch_io_data {
144 	struct ublk_device *ub;
145 	struct io_uring_cmd *cmd;
146 	struct ublk_batch_io header;
147 	unsigned int issue_flags;
148 	struct io_comp_batch *iob;
149 };
150 
151 /*
152  * io command is active: the sqe cmd has been received and its cqe isn't done
153  *
154  * If the flag is set, the io command is owned by the ublk driver and is
155  * waiting for an incoming blk-mq request from the ublk block device.
156  *
157  * If the flag is cleared, the io command will be completed and owned by
158  * the ublk server.
159  */
160 #define UBLK_IO_FLAG_ACTIVE	0x01
161 
162 /*
163  * IO command has been completed via cqe, is being handled by ublksrv, and
164  * is not committed yet
165  *
166  * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
167  * for cross verification
168  */
169 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
170 
171 /*
172  * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command needs to get
173  * the data buffer address from ublksrv.
174  *
175  * Then, bio data can be copied into this data buffer for a WRITE request
176  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
177  */
178 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
179 
180 /*
181  * The request buffer is registered automatically, so we have to unregister
182  * it before completing this request.
183  *
184  * io_uring will unregister the buffer automatically for us on exit.
185  */
186 #define UBLK_IO_FLAG_AUTO_BUF_REG 	0x10
187 
188 /* atomic RW with ubq->cancel_lock */
189 #define UBLK_IO_FLAG_CANCELED	0x80000000
190 
191 /*
192  * Initialize refcount to a large number to include any registered buffers.
193  * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
194  * any buffers registered on the io daemon task.
195  */
196 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
197 
198 /* used for UBLK_F_BATCH_IO only */
199 #define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
200 
201 union ublk_io_buf {
202 	__u64	addr;
203 	struct ublk_auto_buf_reg auto_reg;
204 };
205 
206 struct ublk_io {
207 	union ublk_io_buf buf;
208 	unsigned int flags;
209 	int res;
210 
211 	union {
212 		/* valid if UBLK_IO_FLAG_ACTIVE is set */
213 		struct io_uring_cmd *cmd;
214 		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
215 		struct request *req;
216 	};
217 
218 	struct task_struct *task;
219 
220 	/*
221 	 * The number of uses of this I/O by the ublk server
222 	 * if user copy or zero copy are enabled:
223 	 * - UBLK_REFCOUNT_INIT from dispatch to the server
224 	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
225 	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
226 	 * - 1 for each io_uring registered buffer not registered on task
227 	 * The I/O can only be completed once all references are dropped.
228 	 * User copy and buffer registration operations are only permitted
229 	 * if the reference count is nonzero.
230 	 */
231 	refcount_t ref;
232 	/* Count of buffers registered on task and not yet unregistered */
233 	unsigned task_registered_buffers;
234 
235 	void *buf_ctx_handle;
236 	spinlock_t lock;
237 } ____cacheline_aligned_in_smp;
238 
239 struct ublk_queue {
240 	int q_id;
241 	int q_depth;
242 
243 	unsigned long flags;
244 	struct ublksrv_io_desc *io_cmd_buf;
245 
246 	bool force_abort;
247 	bool canceling;
248 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
249 	spinlock_t		cancel_lock;
250 	struct ublk_device *dev;
251 	u32 nr_io_ready;
252 
253 	/*
254 	 * For supporting UBLK_F_BATCH_IO only.
255 	 *
256 	 * Inflight ublk request tags are saved in this fifo.
257 	 *
258 	 * There are multiple writers, from ublk_queue_rq() or ublk_queue_rqs(),
259 	 * so the lock is required for storing request tags into the fifo.
260 	 *
261 	 * There is just one reader, which fetches requests from the task work
262 	 * function and forwards them to the ublk server, so there is no need
263 	 * to grab the lock on the reader side.
264 	 *
265 	 * Batch I/O State Management:
266 	 *
267 	 * The batch I/O system uses implicit state management based on the
268 	 * combination of three key variables below.
269 	 *
270 	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
271 	 *   No fetch commands available, events queue in evts_fifo
272 	 *
273 	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
274 	 *   Fetch commands available but none processing events
275 	 *
276 	 * - ACTIVE: active_fcmd
277 	 *   One fetch command actively processing events from evts_fifo
278 	 *
279 	 * Key Invariants:
280 	 * - At most one active_fcmd at any time (single reader)
281 	 * - active_fcmd is always from fcmd_head list when non-NULL
282 	 * - evts_fifo can be read locklessly by the single active reader
283 	 * - All state transitions require evts_lock protection
284 	 * - Multiple writers to evts_fifo require lock protection
285 	 */
286 	struct {
287 		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
288 		spinlock_t evts_lock;
289 
290 		/* List of fetch commands available to process events */
291 		struct list_head fcmd_head;
292 
293 		/* Currently active fetch command (NULL = none active) */
294 		struct ublk_batch_fetch_cmd  *active_fcmd;
295 	}____cacheline_aligned_in_smp;
296 
297 	struct ublk_io ios[] __counted_by(q_depth);
298 };
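
/*
 * Illustrative sketch (not part of the driver) of the producer/consumer
 * pattern described above.  Writers push inflight tags under evts_lock,
 * roughly:
 *
 *	kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, &tag, 1,
 *				      &ubq->evts_lock);
 *
 * while the single active fetch command drains tags locklessly from its
 * task work context, e.g.:
 *
 *	unsigned short tags[16];
 *	unsigned int nr = kfifo_out(&ubq->evts_fifo, tags, ARRAY_SIZE(tags));
 *
 * The exact helpers used by the driver appear later in this file; the
 * snippet above only restates the locking rule.
 */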
299 
300 /* Maple tree value: maps a PFN range to buffer location */
301 struct ublk_buf_range {
302 	unsigned short buf_index;
303 	unsigned short flags;
304 	unsigned int base_offset;	/* byte offset within buffer */
305 };
306 
307 struct ublk_device {
308 	struct gendisk		*ub_disk;
309 
310 	struct ublksrv_ctrl_dev_info	dev_info;
311 
312 	struct blk_mq_tag_set	tag_set;
313 
314 	struct cdev		cdev;
315 	struct device		cdev_dev;
316 
317 #define UB_STATE_OPEN		0
318 #define UB_STATE_USED		1
319 #define UB_STATE_DELETED	2
320 	unsigned long		state;
321 	int			ub_number;
322 
323 	struct mutex		mutex;
324 
325 	spinlock_t		lock;
326 	struct mm_struct	*mm;
327 
328 	struct ublk_params	params;
329 
330 	struct completion	completion;
331 	u32			nr_queue_ready;
332 	bool 			unprivileged_daemons;
333 	struct mutex cancel_mutex;
334 	bool canceling;
335 	pid_t 	ublksrv_tgid;
336 	struct delayed_work	exit_work;
337 	struct work_struct	partition_scan_work;
338 
339 	bool			block_open; /* protected by open_mutex */
340 
341 	/* shared memory zero copy */
342 	struct maple_tree	buf_tree;
343 	struct ida		buf_ida;
344 
345 	struct ublk_queue       *queues[];
346 };
347 
348 /* header of ublk_params */
349 struct ublk_params_header {
350 	__u32	len;
351 	__u32	types;
352 };
353 
354 static void ublk_io_release(void *priv);
355 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
356 static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
357 				  u32 *buf_idx, u32 *buf_off);
358 static void ublk_buf_cleanup(struct ublk_device *ub);
359 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
360 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
361 		u16 q_id, u16 tag, struct ublk_io *io);
362 static inline unsigned int ublk_req_build_flags(struct request *req);
363 static void ublk_batch_dispatch(struct ublk_queue *ubq,
364 				const struct ublk_batch_io_data *data,
365 				struct ublk_batch_fetch_cmd *fcmd);
366 
367 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
368 {
369 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
370 }
371 
372 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
373 {
374 	return ubq->flags & UBLK_F_BATCH_IO;
375 }
376 
377 static inline void ublk_io_lock(struct ublk_io *io)
378 {
379 	spin_lock(&io->lock);
380 }
381 
382 static inline void ublk_io_unlock(struct ublk_io *io)
383 {
384 	spin_unlock(&io->lock);
385 }
386 
387 /* Initialize the event queue */
388 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
389 				    int numa_node)
390 {
391 	spin_lock_init(&q->evts_lock);
392 	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
393 }
394 
395 /* Check if event queue is empty */
396 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
397 {
398 	return kfifo_is_empty(&q->evts_fifo);
399 }
400 
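/* Tear down the event queue; all queued events must have been consumed */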
401 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
402 {
403 	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
404 	kfifo_free(&q->evts_fifo);
405 }
406 
407 static inline struct ublksrv_io_desc *
408 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
409 {
410 	return &ubq->io_cmd_buf[tag];
411 }
412 
413 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
414 {
415 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
416 }
417 
418 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
419 {
420 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
421 }
422 
423 static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
424 {
425 	return ubq->flags & UBLK_F_SHMEM_ZC;
426 }
427 
428 static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
429 					unsigned int tag)
430 {
431 	return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
432 }
433 
434 static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
435 {
436 	return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
437 }
438 
439 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
440 {
441 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
442 }
443 
444 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
445 {
446 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
447 }
448 
449 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
450 {
451 	return ubq->flags & UBLK_F_USER_COPY;
452 }
453 
454 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
455 {
456 	return ub->dev_info.flags & UBLK_F_USER_COPY;
457 }
458 
459 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
460 {
461 	return ub->dev_info.flags & UBLK_F_ZONED;
462 }
463 
464 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
465 {
466 	return ubq->flags & UBLK_F_ZONED;
467 }
468 
469 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
470 {
471 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
472 }
473 
474 #ifdef CONFIG_BLK_DEV_ZONED
475 
476 struct ublk_zoned_report_desc {
477 	__u64 sector;
478 	__u32 operation;
479 	__u32 nr_zones;
480 };
481 
482 static DEFINE_XARRAY(ublk_zoned_report_descs);
483 
484 static int ublk_zoned_insert_report_desc(const struct request *req,
485 		struct ublk_zoned_report_desc *desc)
486 {
487 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
488 			    desc, GFP_KERNEL);
489 }
490 
491 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
492 		const struct request *req)
493 {
494 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
495 }
496 
497 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
498 		const struct request *req)
499 {
500 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
501 }
502 
503 static int ublk_get_nr_zones(const struct ublk_device *ub)
504 {
505 	const struct ublk_param_basic *p = &ub->params.basic;
506 
507 	/* Zone size is a power of 2 */
508 	return p->dev_sectors >> ilog2(p->chunk_sectors);
509 }
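
/*
 * Worked example for ublk_get_nr_zones(): with dev_sectors = 2097152 (1GiB)
 * and chunk_sectors = 524288 (256MiB zones), the device has
 * 2097152 >> 19 = 4 zones.
 */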
510 
511 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
512 {
513 	return blk_revalidate_disk_zones(ub->ub_disk);
514 }
515 
516 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
517 {
518 	const struct ublk_param_zoned *p = &ub->params.zoned;
519 	int nr_zones;
520 
521 	if (!ublk_dev_is_zoned(ub))
522 		return -EINVAL;
523 
524 	if (!p->max_zone_append_sectors)
525 		return -EINVAL;
526 
527 	nr_zones = ublk_get_nr_zones(ub);
528 
529 	if (p->max_active_zones > nr_zones)
530 		return -EINVAL;
531 
532 	if (p->max_open_zones > nr_zones)
533 		return -EINVAL;
534 
535 	return 0;
536 }
537 
538 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
539 {
540 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
541 }
542 
543 /* Based on virtblk_alloc_report_buffer */
544 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
545 				      unsigned int nr_zones, size_t *buflen)
546 {
547 	struct request_queue *q = ublk->ub_disk->queue;
548 	size_t bufsize;
549 	void *buf;
550 
551 	nr_zones = min_t(unsigned int, nr_zones,
552 			 ublk->ub_disk->nr_zones);
553 
554 	bufsize = nr_zones * sizeof(struct blk_zone);
555 	bufsize =
556 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
557 
558 	while (bufsize >= sizeof(struct blk_zone)) {
559 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
560 		if (buf) {
561 			*buflen = bufsize;
562 			return buf;
563 		}
564 		bufsize >>= 1;
565 	}
566 
567 	*buflen = 0;
568 	return NULL;
569 }
570 
571 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
572 		      unsigned int nr_zones, struct blk_report_zones_args *args)
573 {
574 	struct ublk_device *ub = disk->private_data;
575 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
576 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
577 	unsigned int done_zones = 0;
578 	unsigned int max_zones_per_request;
579 	int ret;
580 	struct blk_zone *buffer;
581 	size_t buffer_length;
582 
583 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
584 			 nr_zones);
585 
586 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
587 	if (!buffer)
588 		return -ENOMEM;
589 
590 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
591 
592 	while (done_zones < nr_zones) {
593 		unsigned int remaining_zones = nr_zones - done_zones;
594 		unsigned int zones_in_request =
595 			min_t(unsigned int, remaining_zones, max_zones_per_request);
596 		struct request *req;
597 		struct ublk_zoned_report_desc desc;
598 		blk_status_t status;
599 
600 		memset(buffer, 0, buffer_length);
601 
602 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
603 		if (IS_ERR(req)) {
604 			ret = PTR_ERR(req);
605 			goto out;
606 		}
607 
608 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
609 		desc.sector = sector;
610 		desc.nr_zones = zones_in_request;
611 		ret = ublk_zoned_insert_report_desc(req, &desc);
612 		if (ret)
613 			goto free_req;
614 
615 		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
616 		if (ret)
617 			goto erase_desc;
618 
619 		status = blk_execute_rq(req, 0);
620 		ret = blk_status_to_errno(status);
621 erase_desc:
622 		ublk_zoned_erase_report_desc(req);
623 free_req:
624 		blk_mq_free_request(req);
625 		if (ret)
626 			goto out;
627 
628 		for (unsigned int i = 0; i < zones_in_request; i++) {
629 			struct blk_zone *zone = buffer + i;
630 
631 			/* A zero length zone means no more zones in this response */
632 			if (!zone->len)
633 				break;
634 
635 			ret = disk_report_zone(disk, zone, i, args);
636 			if (ret)
637 				goto out;
638 
639 			done_zones++;
640 			sector += zone_size_sectors;
641 
642 		}
643 	}
644 
645 	ret = done_zones;
646 
647 out:
648 	kvfree(buffer);
649 	return ret;
650 }
651 
652 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
653 					 struct request *req)
654 {
655 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
656 	struct ublk_io *io = &ubq->ios[req->tag];
657 	struct ublk_zoned_report_desc *desc;
658 	u32 ublk_op;
659 
660 	switch (req_op(req)) {
661 	case REQ_OP_ZONE_OPEN:
662 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
663 		break;
664 	case REQ_OP_ZONE_CLOSE:
665 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
666 		break;
667 	case REQ_OP_ZONE_FINISH:
668 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
669 		break;
670 	case REQ_OP_ZONE_RESET:
671 		ublk_op = UBLK_IO_OP_ZONE_RESET;
672 		break;
673 	case REQ_OP_ZONE_APPEND:
674 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
675 		break;
676 	case REQ_OP_ZONE_RESET_ALL:
677 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
678 		break;
679 	case REQ_OP_DRV_IN:
680 		desc = ublk_zoned_get_report_desc(req);
681 		if (!desc)
682 			return BLK_STS_IOERR;
683 		ublk_op = desc->operation;
684 		switch (ublk_op) {
685 		case UBLK_IO_OP_REPORT_ZONES:
686 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
687 			iod->nr_zones = desc->nr_zones;
688 			iod->start_sector = desc->sector;
689 			return BLK_STS_OK;
690 		default:
691 			return BLK_STS_IOERR;
692 		}
693 	case REQ_OP_DRV_OUT:
694 		/* We do not support drv_out */
695 		return BLK_STS_NOTSUPP;
696 	default:
697 		return BLK_STS_IOERR;
698 	}
699 
700 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
701 	iod->nr_sectors = blk_rq_sectors(req);
702 	iod->start_sector = blk_rq_pos(req);
703 	iod->addr = io->buf.addr;
704 
705 	return BLK_STS_OK;
706 }
707 
708 #else
709 
710 #define ublk_report_zones (NULL)
711 
712 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
713 {
714 	return -EOPNOTSUPP;
715 }
716 
717 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
718 {
719 }
720 
721 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
722 {
723 	return 0;
724 }
725 
726 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
727 					 struct request *req)
728 {
729 	return BLK_STS_NOTSUPP;
730 }
731 
732 #endif
733 
734 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
735 				      bool need_map, struct io_comp_batch *iob);
736 
737 static dev_t ublk_chr_devt;
738 static const struct class ublk_chr_class = {
739 	.name = "ublk-char",
740 };
741 
742 static DEFINE_IDR(ublk_index_idr);
743 static DEFINE_SPINLOCK(ublk_idr_lock);
744 static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
745 
746 static DEFINE_MUTEX(ublk_ctl_mutex);
747 
748 static struct ublk_batch_fetch_cmd *
749 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
750 {
751 	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
752 
753 	if (fcmd) {
754 		fcmd->cmd = cmd;
755 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
756 	}
757 	return fcmd;
758 }
759 
760 static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
761 {
762 	kfree(fcmd);
763 }
764 
765 static void __ublk_release_fcmd(struct ublk_queue *ubq)
766 {
767 	WRITE_ONCE(ubq->active_fcmd, NULL);
768 }
769 
770 /*
771  * Nothing can move on, so clear ->active_fcmd, and the caller should stop
772  * dispatching
773  */
774 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
775 					const struct ublk_batch_io_data *data,
776 					struct ublk_batch_fetch_cmd *fcmd,
777 					int res)
778 {
779 	spin_lock(&ubq->evts_lock);
780 	list_del_init(&fcmd->node);
781 	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
782 	__ublk_release_fcmd(ubq);
783 	spin_unlock(&ubq->evts_lock);
784 
785 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
786 	ublk_batch_free_fcmd(fcmd);
787 }
788 
789 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
790 				     struct io_br_sel *sel,
791 				     unsigned int issue_flags)
792 {
793 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
794 		return -ENOBUFS;
795 	return 0;
796 }
797 
798 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
799 				       void __user *buf, const u16 *tag_buf,
800 				       unsigned int len)
801 {
802 	if (copy_to_user(buf, tag_buf, len))
803 		return -EFAULT;
804 	return len;
805 }
806 
807 #define UBLK_MAX_UBLKS UBLK_MINORS
808 
809 /*
810  * Max number of unprivileged ublk devices allowed to be added
811  *
812  * It can be extended to a per-user limit in the future, or even controlled
813  * by cgroup.
814  */
815 static unsigned int unprivileged_ublks_max = 64;
816 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
817 
818 static struct miscdevice ublk_misc;
819 
820 static inline unsigned ublk_pos_to_hwq(loff_t pos)
821 {
822 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
823 		UBLK_QID_BITS_MASK;
824 }
825 
826 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
827 {
828 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
829 }
830 
831 static inline unsigned ublk_pos_to_tag(loff_t pos)
832 {
833 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
834 		UBLK_TAG_BITS_MASK;
835 }
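
/*
 * The three helpers above decode the char-device file offset used by
 * UBLK_F_USER_COPY.  Illustrative sketch (userspace side, not part of the
 * driver): a ublk server would compute the offset for copying the data of
 * (q_id, tag) roughly as
 *
 *	__u64 off = UBLKSRV_IO_BUF_OFFSET +
 *		    ((__u64)q_id << UBLK_QID_OFF) +
 *		    ((__u64)tag << UBLK_TAG_OFF) + byte_offset_in_request;
 *
 * then pread() from that offset to fetch WRITE payload, or pwrite() to it
 * to return READ data ('byte_offset_in_request' is a name used only here).
 */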
836 
837 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
838 {
839 	const struct ublk_param_basic *p = &ub->params.basic;
840 
841 	if (p->attrs & UBLK_ATTR_READ_ONLY)
842 		set_disk_ro(ub->ub_disk, true);
843 
844 	set_capacity(ub->ub_disk, p->dev_sectors);
845 }
846 
847 static int ublk_integrity_flags(u32 flags)
848 {
849 	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
850 
851 	if (flags & LBMD_PI_CAP_INTEGRITY) {
852 		flags &= ~LBMD_PI_CAP_INTEGRITY;
853 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
854 	}
855 	if (flags & LBMD_PI_CAP_REFTAG) {
856 		flags &= ~LBMD_PI_CAP_REFTAG;
857 		ret_flags |= BLK_INTEGRITY_REF_TAG;
858 	}
859 	return flags ? -EINVAL : ret_flags;
860 }
861 
862 static int ublk_integrity_pi_tuple_size(u8 csum_type)
863 {
864 	switch (csum_type) {
865 	case LBMD_PI_CSUM_NONE:
866 		return 0;
867 	case LBMD_PI_CSUM_IP:
868 	case LBMD_PI_CSUM_CRC16_T10DIF:
869 		return 8;
870 	case LBMD_PI_CSUM_CRC64_NVME:
871 		return 16;
872 	default:
873 		return -EINVAL;
874 	}
875 }
876 
877 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
878 {
879 	switch (csum_type) {
880 	case LBMD_PI_CSUM_NONE:
881 		return BLK_INTEGRITY_CSUM_NONE;
882 	case LBMD_PI_CSUM_IP:
883 		return BLK_INTEGRITY_CSUM_IP;
884 	case LBMD_PI_CSUM_CRC16_T10DIF:
885 		return BLK_INTEGRITY_CSUM_CRC;
886 	case LBMD_PI_CSUM_CRC64_NVME:
887 		return BLK_INTEGRITY_CSUM_CRC64;
888 	default:
889 		WARN_ON_ONCE(1);
890 		return BLK_INTEGRITY_CSUM_NONE;
891 	}
892 }
893 
894 static int ublk_validate_params(const struct ublk_device *ub)
895 {
896 	/* basic param is the only one which must be set */
897 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
898 		const struct ublk_param_basic *p = &ub->params.basic;
899 
900 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
901 			return -EINVAL;
902 
903 		/*
904 		 * 256M is a reasonable upper bound for physical block size,
905 		 * io_min and io_opt; it aligns with the maximum physical
906 		 * block size possible in NVMe.
907 		 */
908 		if (p->physical_bs_shift > ilog2(SZ_256M))
909 			return -EINVAL;
910 
911 		if (p->io_min_shift > ilog2(SZ_256M))
912 			return -EINVAL;
913 
914 		if (p->io_opt_shift > ilog2(SZ_256M))
915 			return -EINVAL;
916 
917 		if (p->logical_bs_shift > p->physical_bs_shift)
918 			return -EINVAL;
919 
920 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
921 			return -EINVAL;
922 
923 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
924 			return -EINVAL;
925 	} else
926 		return -EINVAL;
927 
928 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
929 		const struct ublk_param_discard *p = &ub->params.discard;
930 
931 		/* So far, only support single segment discard */
932 		if (p->max_discard_sectors && p->max_discard_segments != 1)
933 			return -EINVAL;
934 
935 		if (!p->discard_granularity)
936 			return -EINVAL;
937 	}
938 
939 	/* dev_t is read-only */
940 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
941 		return -EINVAL;
942 
943 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
944 		return ublk_dev_param_zoned_validate(ub);
945 	else if (ublk_dev_is_zoned(ub))
946 		return -EINVAL;
947 
948 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
949 		const struct ublk_param_dma_align *p = &ub->params.dma;
950 
951 		if (p->alignment >= PAGE_SIZE)
952 			return -EINVAL;
953 
954 		if (!is_power_of_2(p->alignment + 1))
955 			return -EINVAL;
956 	}
957 
958 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
959 		const struct ublk_param_segment *p = &ub->params.seg;
960 
961 		if (!is_power_of_2(p->seg_boundary_mask + 1))
962 			return -EINVAL;
963 
964 		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
965 			return -EINVAL;
966 		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
967 			return -EINVAL;
968 	}
969 
970 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
971 		const struct ublk_param_integrity *p = &ub->params.integrity;
972 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
973 		int flags = ublk_integrity_flags(p->flags);
974 
975 		if (!ublk_dev_support_integrity(ub))
976 			return -EINVAL;
977 		if (flags < 0)
978 			return flags;
979 		if (pi_tuple_size < 0)
980 			return pi_tuple_size;
981 		if (!p->metadata_size)
982 			return -EINVAL;
983 		if (p->csum_type == LBMD_PI_CSUM_NONE &&
984 		    p->flags & LBMD_PI_CAP_REFTAG)
985 			return -EINVAL;
986 		if (p->pi_offset + pi_tuple_size > p->metadata_size)
987 			return -EINVAL;
988 		if (p->interval_exp < SECTOR_SHIFT ||
989 		    p->interval_exp > ub->params.basic.logical_bs_shift)
990 			return -EINVAL;
991 	}
992 
993 	return 0;
994 }
995 
996 static void ublk_apply_params(struct ublk_device *ub)
997 {
998 	ublk_dev_param_basic_apply(ub);
999 
1000 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
1001 		ublk_dev_param_zoned_apply(ub);
1002 }
1003 
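/*
 * "Map io" means the driver itself copies data between the request pages
 * and the server-provided buffer (io->buf.addr).  This copy path is only
 * taken when none of user copy, zero copy or auto buffer registration is
 * enabled.
 */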
1004 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
1005 {
1006 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
1007 		!ublk_support_auto_buf_reg(ubq);
1008 }
1009 
1010 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
1011 {
1012 	return !ublk_dev_support_user_copy(ub) &&
1013 	       !ublk_dev_support_zero_copy(ub) &&
1014 	       !ublk_dev_support_auto_buf_reg(ub);
1015 }
1016 
1017 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
1018 {
1019 	/*
1020 	 * read()/write() is involved in user copy, so a request reference
1021 	 * has to be grabbed
1022 	 *
1023 	 * For zero copy, the request buffer needs to be registered in the
1024 	 * io_uring buffer table, so a reference is needed
1025 	 *
1026 	 * For auto buffer registration, the ublk server may still issue
1027 	 * UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
1028 	 * so a reference is required too.
1029 	 */
1030 	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
1031 		ublk_support_auto_buf_reg(ubq);
1032 }
1033 
1034 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
1035 {
1036 	return ublk_dev_support_user_copy(ub) ||
1037 	       ublk_dev_support_zero_copy(ub) ||
1038 	       ublk_dev_support_auto_buf_reg(ub);
1039 }
1040 
1041 /*
1042  * ublk IO Reference Counting Design
1043  * ==================================
1044  *
1045  * For user-copy and zero-copy modes, ublk uses a split reference model with
1046  * two counters that together track IO lifetime:
1047  *
1048  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
1049  *   - io->task_registered_buffers: count of buffers registered on the IO task
1050  *
1051  * Key Invariant:
1052  * --------------
1053  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1054  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1055  * when no active references exist. After IO completion, both counters become
1056  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1057  * task_registered_buffers are 0.
1058  *
1059  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1060  * exit to determine if all references have been released.
1061  *
1062  * Why Split Counters:
1063  * -------------------
1064  * Buffers registered on the IO daemon task can use the lightweight
1065  * task_registered_buffers counter (simple increment/decrement) instead of
1066  * atomic refcount operations. The ublk_io_release() callback checks if
1067  * current == io->task to decide which counter to update.
1068  *
1069  * This optimization only applies before IO completion. At completion,
1070  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1071  * After that, all subsequent buffer unregistrations must use the atomic ref
1072  * since they may be releasing the last reference.
1073  *
1074  * Reference Lifecycle:
1075  * --------------------
1076  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1077  *
1078  * 2. During IO processing:
1079  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1080  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1081  *    - Buffer unregister callback (ublk_io_release):
1082  *      * If on-task: task_registered_buffers--
1083  *      * If off-task: ref-- via ublk_put_req_ref()
1084  *
1085  * 3. ublk_sub_req_ref() at IO completion:
1086  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1087  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1088  *    - This effectively collapses task_registered_buffers into the atomic ref,
1089  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1090  *      buffers that were already counted
1091  *
1092  * Example (zero-copy, register on-task, unregister off-task):
1093  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1094  *   - Register buffer on-task: task_registered_buffers = 1
1095  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1096  *   - Completion via ublk_sub_req_ref():
1097  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1098  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1099  *
1100  * Example (auto buffer registration):
1101  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1102  *
1103  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1104  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1105  *   - Completion via ublk_sub_req_ref():
1106  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1107  *
1108  * Example (zero-copy, ublk server killed):
1109  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1110  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1111  *
1112  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1113  *   - Register buffer on-task: task_registered_buffers = 1
1114  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1115  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1116  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1117  *   - Since the sum equals UBLK_REFCOUNT_INIT, both counters are zeroed by
1118  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1119  *     and abort pending requests
1120  *
1121  * Batch IO Special Case:
1122  * ----------------------
1123  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1124  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1125  * task_registered_buffers counter still tracks registered buffers for the
1126  * invariant check, even though the callback doesn't decrement it.
1127  *
1128  * Note: updating task_registered_buffers is protected by io->lock.
1129  */
1130 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1131 		struct ublk_io *io)
1132 {
1133 	if (ublk_need_req_ref(ubq))
1134 		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1135 }
1136 
1137 static inline bool ublk_get_req_ref(struct ublk_io *io)
1138 {
1139 	return refcount_inc_not_zero(&io->ref);
1140 }
1141 
1142 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1143 {
1144 	if (!refcount_dec_and_test(&io->ref))
1145 		return;
1146 
1147 	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1148 	__ublk_complete_rq(req, io, false, NULL);
1149 }
1150 
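/*
 * Collapse task_registered_buffers into the atomic refcount at IO
 * completion time (see "Reference Lifecycle" above); returns true if this
 * drops the last reference.
 */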
1151 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1152 {
1153 	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1154 
1155 	io->task_registered_buffers = 0;
1156 	return refcount_sub_and_test(sub_refs, &io->ref);
1157 }
1158 
1159 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1160 {
1161 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1162 }
1163 
1164 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1165 {
1166 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1167 }
1168 
1169 /* Called in slow path only, keep it noinline for trace purpose */
1170 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1171 {
1172 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1173 		return ub;
1174 	return NULL;
1175 }
1176 
1177 /* Called in slow path only, keep it noinline for trace purpose */
1178 static noinline void ublk_put_device(struct ublk_device *ub)
1179 {
1180 	put_device(&ub->cdev_dev);
1181 }
1182 
1183 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1184 		int qid)
1185 {
1186 	return dev->queues[qid];
1187 }
1188 
1189 static inline bool ublk_rq_has_data(const struct request *rq)
1190 {
1191 	return bio_has_data(rq->bio);
1192 }
1193 
1194 static inline struct ublksrv_io_desc *
1195 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1196 {
1197 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
1198 }
1199 
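/*
 * The per-queue io_cmd_buf is an array of ublksrv_io_desc, one entry per
 * tag, which the ublk server mmap()s via the char device; round the size
 * up to PAGE_SIZE since whole pages are mapped.
 */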
1200 static inline int __ublk_queue_cmd_buf_size(int depth)
1201 {
1202 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1203 }
1204 
1205 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1206 {
1207 	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1208 }
1209 
1210 static int ublk_max_cmd_buf_size(void)
1211 {
1212 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1213 }
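
/*
 * Summary of how the recovery feature flags behave when the ublk server
 * exits, as implemented by the helpers below:
 *
 *	no UBLK_F_USER_RECOVERY		device is stopped
 *	UBLK_F_USER_RECOVERY		outstanding I/O gets errors, new I/O
 *					is queued until a new server attaches
 *	+ UBLK_F_USER_RECOVERY_REISSUE	outstanding I/O is reissued instead
 *					of erroring
 *	+ UBLK_F_USER_RECOVERY_FAIL_IO	new I/O gets errors while no server
 *					is attached
 */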
1214 
1215 /*
1216  * Should I/O outstanding to the ublk server be reissued when the server
1217  * exits? If not, outstanding I/O will get errors.
1218  */
1219 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1220 {
1221 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1222 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1223 }
1224 
1225 /*
1226  * Should I/O issued while there is no ublk server be queued? If not, I/O
1227  * issued while there is no ublk server will get errors.
1228  */
1229 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1230 {
1231 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1232 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1233 }
1234 
1235 /*
1236  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1237  * of the device flags for smaller cache footprint - better for fast
1238  * paths.
1239  */
1240 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1241 {
1242 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1243 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1244 }
1245 
1246 /*
1247  * Should ublk devices be stopped (i.e. no recovery possible) when the
1248  * ublk server exits? If not, devices can be used again by a future
1249  * incarnation of a ublk server via the start_recovery/end_recovery
1250  * commands.
1251  */
1252 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1253 {
1254 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1255 }
1256 
1257 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1258 {
1259 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1260 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1261 }
1262 
1263 static void ublk_free_disk(struct gendisk *disk)
1264 {
1265 	struct ublk_device *ub = disk->private_data;
1266 
1267 	clear_bit(UB_STATE_USED, &ub->state);
1268 	ublk_put_device(ub);
1269 }
1270 
1271 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1272 		unsigned int *owner_gid)
1273 {
1274 	kuid_t uid;
1275 	kgid_t gid;
1276 
1277 	current_uid_gid(&uid, &gid);
1278 
1279 	*owner_uid = from_kuid(&init_user_ns, uid);
1280 	*owner_gid = from_kgid(&init_user_ns, gid);
1281 }
1282 
1283 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1284 {
1285 	struct ublk_device *ub = disk->private_data;
1286 
1287 	if (capable(CAP_SYS_ADMIN))
1288 		return 0;
1289 
1290 	/*
1291 	 * If it is an unprivileged device, only the owner can open
1292 	 * the disk. Otherwise it could be a trap set by a malicious
1293 	 * user who deliberately grants this disk's privileges to
1294 	 * other users.
1295 	 *
1296 	 * This is also reasonable given that anyone can create an
1297 	 * unprivileged device without needing anyone else's grant.
1298 	 */
1299 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1300 		unsigned int curr_uid, curr_gid;
1301 
1302 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1303 
1304 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1305 				ub->dev_info.owner_gid)
1306 			return -EPERM;
1307 	}
1308 
1309 	if (ub->block_open)
1310 		return -ENXIO;
1311 
1312 	return 0;
1313 }
1314 
1315 static const struct block_device_operations ub_fops = {
1316 	.owner =	THIS_MODULE,
1317 	.open =		ublk_open,
1318 	.free_disk =	ublk_free_disk,
1319 	.report_zones =	ublk_report_zones,
1320 };
1321 
1322 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1323 				struct iov_iter *uiter, int dir, size_t *done)
1324 {
1325 	unsigned len;
1326 	void *bv_buf;
1327 	size_t copied;
1328 
1329 	if (*offset >= bv->bv_len) {
1330 		*offset -= bv->bv_len;
1331 		return true;
1332 	}
1333 
1334 	len = bv->bv_len - *offset;
1335 	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1336 	/*
1337 	 * Bio pages may originate from slab caches without a usercopy region
1338 	 * (e.g. jbd2 frozen metadata buffers).  This is the same data that
1339 	 * the loop driver writes to its backing file — no exposure risk.
1340 	 * The bvec length is always trusted, so the size check in
1341 	 * check_copy_size() is not needed either.  Use the unchecked
1342 	 * helpers to avoid false positives on slab pages.
1343 	 */
1344 	if (dir == ITER_DEST)
1345 		copied = _copy_to_iter(bv_buf, len, uiter);
1346 	else
1347 		copied = _copy_from_iter(bv_buf, len, uiter);
1348 
1349 	kunmap_local(bv_buf);
1350 
1351 	*done += copied;
1352 	if (copied < len)
1353 		return false;
1354 
1355 	*offset = 0;
1356 	return true;
1357 }
1358 
1359 /*
1360  * Copy data between the request pages and the iov_iter; 'offset' is the
1361  * starting linear offset within the request.
1362  */
1363 static size_t ublk_copy_user_pages(const struct request *req,
1364 		unsigned offset, struct iov_iter *uiter, int dir)
1365 {
1366 	struct req_iterator iter;
1367 	struct bio_vec bv;
1368 	size_t done = 0;
1369 
1370 	rq_for_each_segment(bv, req, iter) {
1371 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1372 			break;
1373 	}
1374 	return done;
1375 }
1376 
1377 #ifdef CONFIG_BLK_DEV_INTEGRITY
1378 static size_t ublk_copy_user_integrity(const struct request *req,
1379 		unsigned offset, struct iov_iter *uiter, int dir)
1380 {
1381 	size_t done = 0;
1382 	struct bio *bio = req->bio;
1383 	struct bvec_iter iter;
1384 	struct bio_vec iv;
1385 
1386 	if (!blk_integrity_rq(req))
1387 		return 0;
1388 
1389 	bio_for_each_integrity_vec(iv, bio, iter) {
1390 		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1391 			break;
1392 	}
1393 
1394 	return done;
1395 }
1396 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1397 static size_t ublk_copy_user_integrity(const struct request *req,
1398 		unsigned offset, struct iov_iter *uiter, int dir)
1399 {
1400 	return 0;
1401 }
1402 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1403 
1404 static inline bool ublk_need_map_req(const struct request *req)
1405 {
1406 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1407 }
1408 
1409 static inline bool ublk_need_unmap_req(const struct request *req)
1410 {
1411 	return ublk_rq_has_data(req) &&
1412 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1413 }
1414 
1415 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1416 				const struct request *req,
1417 				const struct ublk_io *io)
1418 {
1419 	const unsigned int rq_bytes = blk_rq_bytes(req);
1420 
1421 	if (!ublk_need_map_io(ubq))
1422 		return rq_bytes;
1423 
1424 	/*
1425 	 * No zero copy: copying WRITE request data is delayed until the
1426 	 * ublksrv context, and the big benefit is that pinning pages in the
1427 	 * current context is pretty fast, see ublk_copy_user_pages()
1428 	 */
1429 	if (ublk_need_map_req(req)) {
1430 		struct iov_iter iter;
1431 		const int dir = ITER_DEST;
1432 
1433 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1434 		return ublk_copy_user_pages(req, 0, &iter, dir);
1435 	}
1436 	return rq_bytes;
1437 }
1438 
1439 static unsigned int ublk_unmap_io(bool need_map,
1440 		const struct request *req,
1441 		const struct ublk_io *io)
1442 {
1443 	const unsigned int rq_bytes = blk_rq_bytes(req);
1444 
1445 	if (!need_map)
1446 		return rq_bytes;
1447 
1448 	if (ublk_need_unmap_req(req)) {
1449 		struct iov_iter iter;
1450 		const int dir = ITER_SOURCE;
1451 
1452 		WARN_ON_ONCE(io->res > rq_bytes);
1453 
1454 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1455 		return ublk_copy_user_pages(req, 0, &iter, dir);
1456 	}
1457 	return rq_bytes;
1458 }
1459 
1460 static inline unsigned int ublk_req_build_flags(struct request *req)
1461 {
1462 	unsigned flags = 0;
1463 
1464 	if (req->cmd_flags & REQ_FAILFAST_DEV)
1465 		flags |= UBLK_IO_F_FAILFAST_DEV;
1466 
1467 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1468 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1469 
1470 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1471 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
1472 
1473 	if (req->cmd_flags & REQ_META)
1474 		flags |= UBLK_IO_F_META;
1475 
1476 	if (req->cmd_flags & REQ_FUA)
1477 		flags |= UBLK_IO_F_FUA;
1478 
1479 	if (req->cmd_flags & REQ_NOUNMAP)
1480 		flags |= UBLK_IO_F_NOUNMAP;
1481 
1482 	if (req->cmd_flags & REQ_SWAP)
1483 		flags |= UBLK_IO_F_SWAP;
1484 
1485 	if (blk_integrity_rq(req))
1486 		flags |= UBLK_IO_F_INTEGRITY;
1487 
1488 	return flags;
1489 }
1490 
1491 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1492 {
1493 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1494 	struct ublk_io *io = &ubq->ios[req->tag];
1495 	u32 ublk_op;
1496 
1497 	switch (req_op(req)) {
1498 	case REQ_OP_READ:
1499 		ublk_op = UBLK_IO_OP_READ;
1500 		break;
1501 	case REQ_OP_WRITE:
1502 		ublk_op = UBLK_IO_OP_WRITE;
1503 		break;
1504 	case REQ_OP_FLUSH:
1505 		ublk_op = UBLK_IO_OP_FLUSH;
1506 		break;
1507 	case REQ_OP_DISCARD:
1508 		ublk_op = UBLK_IO_OP_DISCARD;
1509 		break;
1510 	case REQ_OP_WRITE_ZEROES:
1511 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1512 		break;
1513 	default:
1514 		if (ublk_queue_is_zoned(ubq))
1515 			return ublk_setup_iod_zoned(ubq, req);
1516 		return BLK_STS_IOERR;
1517 	}
1518 
1519 	/* need to translate since kernel-internal op/flag values may change */
1520 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
1521 	iod->nr_sectors = blk_rq_sectors(req);
1522 	iod->start_sector = blk_rq_pos(req);
1523 
1524 	/* Try shmem zero-copy match before setting addr */
1525 	if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
1526 		u32 buf_idx, buf_off;
1527 
1528 		if (ublk_try_buf_match(ubq->dev, req,
1529 					  &buf_idx, &buf_off)) {
1530 			iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
1531 			iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
1532 			return BLK_STS_OK;
1533 		}
1534 	}
1535 
1536 	iod->addr = io->buf.addr;
1537 
1538 	return BLK_STS_OK;
1539 }
1540 
1541 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1542 		struct io_uring_cmd *ioucmd)
1543 {
1544 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1545 }
1546 
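/*
 * End a request with softirqs disabled; see the comment about running
 * bio->bi_end_io() with softirqs off in __ublk_complete_rq() for why this
 * matters for blkdev_release() forward progress.
 */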
1547 static void ublk_end_request(struct request *req, blk_status_t error)
1548 {
1549 	local_bh_disable();
1550 	blk_mq_end_request(req, error);
1551 	local_bh_enable();
1552 }
1553 
1554 /* todo: handle partial completion */
1555 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1556 				      bool need_map, struct io_comp_batch *iob)
1557 {
1558 	unsigned int unmapped_bytes;
1559 	blk_status_t res = BLK_STS_OK;
1560 	bool requeue;
1561 
1562 	/* fail a READ IO if nothing was read */
1563 	if (!io->res && req_op(req) == REQ_OP_READ)
1564 		io->res = -EIO;
1565 
1566 	if (io->res < 0) {
1567 		res = errno_to_blk_status(io->res);
1568 		goto exit;
1569 	}
1570 
1571 	/*
1572 	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return a byte count, so
1573 	 * end them directly.
1574 	 *
1575 	 * None of them needs unmapping either.
1576 	 */
1577 	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1578 	    req_op(req) != REQ_OP_DRV_IN)
1579 		goto exit;
1580 
1581 	/* shmem zero copy: no data to unmap, pages already shared */
1582 	if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
1583 		goto exit;
1584 
1585 	/* for READ request, writing data in iod->addr to rq buffers */
1586 	unmapped_bytes = ublk_unmap_io(need_map, req, io);
1587 
1588 	/*
1589 	 * Extremely unlikely since the data was filled in just before.
1590 	 *
1591 	 * Simply clamp io->res for this unlikely case.
1592 	 */
1593 	if (unlikely(unmapped_bytes < io->res))
1594 		io->res = unmapped_bytes;
1595 
1596 	/*
1597 	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1598 	 * happens off this path, then that will prevent ublk's blkdev_release()
1599 	 * from being called on current's task work, see fput() implementation.
1600 	 *
1601 	 * Otherwise, ublk server may not provide forward progress in case of
1602 	 * reading the partition table from bdev_open() with disk->open_mutex
1603 	 * held, and causes dead lock as we could already be holding
1604 	 * disk->open_mutex here.
1605 	 *
1606 	 * Preferably we would not be doing IO with a mutex held that is also
1607 	 * used for release, but this work-around will suffice for now.
1608 	 */
1609 	local_bh_disable();
1610 	requeue = blk_update_request(req, BLK_STS_OK, io->res);
1611 	local_bh_enable();
1612 	if (requeue)
1613 		blk_mq_requeue_request(req, true);
1614 	else if (likely(!blk_should_fake_timeout(req->q))) {
1615 		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1616 			return;
1617 		__blk_mq_end_request(req, BLK_STS_OK);
1618 	}
1619 
1620 	return;
1621 exit:
1622 	ublk_end_request(req, res);
1623 }
1624 
1625 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1626 						     struct request *req)
1627 {
1628 	/* read cmd first because req will overwrite it */
1629 	struct io_uring_cmd *cmd = io->cmd;
1630 
1631 	/* mark this cmd owned by ublksrv */
1632 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1633 
1634 	/*
1635 	 * clear ACTIVE since we are done with this sqe/cmd slot
1636 	 * We can only accept io cmd in case of being not active.
1637 	 */
1638 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1639 
1640 	io->req = req;
1641 	return cmd;
1642 }
1643 
1644 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1645 				 int res, unsigned issue_flags)
1646 {
1647 	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1648 
1649 	/* tell ublksrv one io request is coming */
1650 	io_uring_cmd_done(cmd, res, issue_flags);
1651 }
1652 
1653 #define UBLK_REQUEUE_DELAY_MS	3
1654 
1655 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1656 		struct request *rq)
1657 {
1658 	/* We cannot process this rq, so requeue or fail it. */
1659 	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1660 		blk_mq_requeue_request(rq, false);
1661 	else
1662 		ublk_end_request(rq, BLK_STS_IOERR);
1663 }
1664 
1665 static void
1666 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1667 {
1668 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1669 
1670 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1671 }
1672 
1673 enum auto_buf_reg_res {
1674 	AUTO_BUF_REG_FAIL,
1675 	AUTO_BUF_REG_FALLBACK,
1676 	AUTO_BUF_REG_OK,
1677 };
1678 
1679 /*
1680  * Setup io state after auto buffer registration.
1681  *
1682  * Must be called after ublk_auto_buf_register() is done.
1683  * Caller must hold io->lock in batch context.
1684  */
1685 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1686 				   struct request *req, struct ublk_io *io,
1687 				   struct io_uring_cmd *cmd,
1688 				   enum auto_buf_reg_res res)
1689 {
1690 	if (res == AUTO_BUF_REG_OK) {
1691 		io->task_registered_buffers = 1;
1692 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1693 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1694 	}
1695 	ublk_init_req_ref(ubq, io);
1696 	__ublk_prep_compl_io_cmd(io, req);
1697 }
1698 
1699 /* Register request bvec to io_uring for auto buffer registration. */
1700 static enum auto_buf_reg_res
1701 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1702 		       struct ublk_io *io, struct io_uring_cmd *cmd,
1703 		       unsigned int issue_flags)
1704 {
1705 	int ret;
1706 
1707 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1708 				      io->buf.auto_reg.index, issue_flags);
1709 	if (ret) {
1710 		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1711 			ublk_auto_buf_reg_fallback(ubq, req->tag);
1712 			return AUTO_BUF_REG_FALLBACK;
1713 		}
1714 		ublk_end_request(req, BLK_STS_IOERR);
1715 		return AUTO_BUF_REG_FAIL;
1716 	}
1717 
1718 	return AUTO_BUF_REG_OK;
1719 }
1720 
1721 /*
1722  * Dispatch IO to userspace with auto buffer registration.
1723  *
1724  * Only called in non-batch context from task work, io->lock not held.
1725  */
1726 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1727 				   struct request *req, struct ublk_io *io,
1728 				   struct io_uring_cmd *cmd,
1729 				   unsigned int issue_flags)
1730 {
1731 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1732 			issue_flags);
1733 
1734 	if (res != AUTO_BUF_REG_FAIL) {
1735 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1736 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1737 	}
1738 }
1739 
1740 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1741 			  struct ublk_io *io)
1742 {
1743 	unsigned mapped_bytes;
1744 
1745 	/* shmem zero copy: skip data copy, pages already shared */
1746 	if (ublk_iod_is_shmem_zc(ubq, req->tag))
1747 		return true;
1748 
1749 	mapped_bytes = ublk_map_io(ubq, req, io);
1750 
1751 	/* partially mapped, update io descriptor */
1752 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1753 		/*
1754 		 * Nothing mapped, retry until we succeed.
1755 		 *
1756 		 * We may never succeed in mapping any bytes here because
1757 		 * of OOM. TODO: reserve one buffer with single page pinned
1758 		 * for providing forward progress guarantee.
1759 		 */
1760 		if (unlikely(!mapped_bytes)) {
1761 			blk_mq_requeue_request(req, false);
1762 			blk_mq_delay_kick_requeue_list(req->q,
1763 					UBLK_REQUEUE_DELAY_MS);
1764 			return false;
1765 		}
1766 
1767 		ublk_get_iod(ubq, req->tag)->nr_sectors =
1768 			mapped_bytes >> 9;
1769 	}
1770 
1771 	return true;
1772 }
1773 
1774 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1775 {
1776 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1777 	int tag = req->tag;
1778 	struct ublk_io *io = &ubq->ios[tag];
1779 
1780 	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1781 			__func__, ubq->q_id, req->tag, io->flags,
1782 			ublk_get_iod(ubq, req->tag)->addr);
1783 
1784 	/*
1785 	 * Task is exiting if either:
1786 	 *
1787 	 * (1) current != io->task.
1788 	 * io_uring_cmd_complete_in_task() tries to run task_work
1789 	 * in a workqueue if cmd's task is PF_EXITING.
1790 	 *
1791 	 * (2) current->flags & PF_EXITING.
1792 	 */
1793 	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1794 		__ublk_abort_rq(ubq, req);
1795 		return;
1796 	}
1797 
1798 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1799 		/*
1800 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1801 		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1802 		 * and notify it.
1803 		 */
1804 		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1805 		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1806 				__func__, ubq->q_id, req->tag, io->flags);
1807 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1808 				     issue_flags);
1809 		return;
1810 	}
1811 
1812 	if (!ublk_start_io(ubq, req, io))
1813 		return;
1814 
1815 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1816 		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1817 	} else {
1818 		ublk_init_req_ref(ubq, io);
1819 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1820 	}
1821 }
1822 
1823 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1824 				       const struct ublk_batch_io_data *data,
1825 				       unsigned short tag)
1826 {
1827 	struct ublk_device *ub = data->ub;
1828 	struct ublk_io *io = &ubq->ios[tag];
1829 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1830 	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1831 	struct io_uring_cmd *cmd = data->cmd;
1832 
1833 	if (!ublk_start_io(ubq, req, io))
1834 		return false;
1835 
1836 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1837 		res = ublk_auto_buf_register(ubq, req, io, cmd,
1838 				data->issue_flags);
1839 
1840 		if (res == AUTO_BUF_REG_FAIL)
1841 			return false;
1842 	}
1843 
1844 	ublk_io_lock(io);
1845 	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1846 	ublk_io_unlock(io);
1847 
1848 	return true;
1849 }
1850 
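/*
 * Prepare every tag in @tag_buf for dispatch to the ublk server.  Tags whose
 * requests got requeued or failed are overwritten with
 * UBLK_BATCH_IO_UNUSED_TAG; returns true if any tag has to be filtered out.
 */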
1851 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1852 				     const struct ublk_batch_io_data *data,
1853 				     unsigned short *tag_buf,
1854 				     unsigned int len)
1855 {
1856 	bool has_unused = false;
1857 	unsigned int i;
1858 
1859 	for (i = 0; i < len; i++) {
1860 		unsigned short tag = tag_buf[i];
1861 
1862 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1863 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1864 			has_unused = true;
1865 		}
1866 	}
1867 
1868 	return has_unused;
1869 }
1870 
1871 /*
1872  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1873  * Returns the new length after filtering.
1874  */
1875 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1876 					    unsigned int len)
1877 {
1878 	unsigned int i, j;
1879 
1880 	for (i = 0, j = 0; i < len; i++) {
1881 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1882 			if (i != j)
1883 				tag_buf[j] = tag_buf[i];
1884 			j++;
1885 		}
1886 	}
1887 
1888 	return j;
1889 }
1890 
1891 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1892 		const struct ublk_batch_io_data *data,
1893 		unsigned short *tag_buf, size_t len, int ret)
1894 {
1895 	int i, res;
1896 
1897 	/*
1898 	 * Undo prep state for all IOs since userspace never received them.
1899 	 * This restores IOs to pre-prepared state so they can be cleanly
1900 	 * re-prepared when tags are pulled from FIFO again.
1901 	 */
1902 	for (i = 0; i < len; i++) {
1903 		struct ublk_io *io = &ubq->ios[tag_buf[i]];
1904 		int index = -1;
1905 
1906 		ublk_io_lock(io);
1907 		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1908 			index = io->buf.auto_reg.index;
1909 		io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1910 		io->flags |= UBLK_IO_FLAG_ACTIVE;
1911 		ublk_io_unlock(io);
1912 
1913 		if (index != -1)
1914 			io_buffer_unregister_bvec(data->cmd, index,
1915 					data->issue_flags);
1916 	}
1917 
1918 	res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1919 		tag_buf, len, &ubq->evts_lock);
1920 
1921 	pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1922 			"tags(%d %zu) ret %d\n", __func__, res, len,
1923 			ret);
1924 }
1925 
1926 #define MAX_NR_TAG 128
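/*
 * Pull up to MAX_NR_TAG tags from the event FIFO, prepare their requests,
 * and post the tag batch to userspace through the fetch command's selected
 * buffer.  The caller keeps draining the FIFO while the return value is
 * positive.
 */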
1927 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1928 				 const struct ublk_batch_io_data *data,
1929 				 struct ublk_batch_fetch_cmd *fcmd)
1930 {
1931 	const unsigned int tag_sz = sizeof(unsigned short);
1932 	unsigned short tag_buf[MAX_NR_TAG];
1933 	struct io_br_sel sel;
1934 	size_t len = 0;
1935 	bool needs_filter;
1936 	int ret;
1937 
1938 	WARN_ON_ONCE(data->cmd != fcmd->cmd);
1939 
1940 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1941 					 data->issue_flags);
1942 	if (sel.val < 0)
1943 		return sel.val;
1944 	if (!sel.addr)
1945 		return -ENOBUFS;
1946 
1947 	/* single reader, so no lock needed; each kfifo element is 2 bytes */
1948 	len = min(len, sizeof(tag_buf)) / tag_sz;
1949 	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1950 
1951 	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1952 	/* Filter out unused tags before posting to userspace */
1953 	if (unlikely(needs_filter)) {
1954 		int new_len = ublk_filter_unused_tags(tag_buf, len);
1955 
1956 		/* return the actual length if all tags failed or were requeued */
1957 		if (!new_len) {
1958 			/* release the selected buffer */
1959 			sel.val = 0;
1960 			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1961 						&sel, data->issue_flags));
1962 			return len;
1963 		}
1964 		len = new_len;
1965 	}
1966 
1967 	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1968 	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1969 	if (unlikely(ret < 0))
1970 		ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1971 	return ret;
1972 }
1973 
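/*
 * Try to make a queued fetch command the active one.  Returns NULL if a
 * fetch command is already active or none is available; must be called with
 * ubq->evts_lock held.
 */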
1974 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1975 		struct ublk_queue *ubq)
1976 {
1977 	struct ublk_batch_fetch_cmd *fcmd;
1978 
1979 	lockdep_assert_held(&ubq->evts_lock);
1980 
1981 	/*
1982 	 * Order updating ubq->evts_fifo against checking ubq->active_fcmd.
1983 	 *
1984 	 * This pairs with the smp_mb() in ublk_batch_dispatch().
1985 	 *
1986 	 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
1987 	 * are visible in ublk_batch_dispatch() thanks to the barrier pairing.
1988 	 */
1989 	smp_mb();
1990 	if (READ_ONCE(ubq->active_fcmd)) {
1991 		fcmd = NULL;
1992 	} else {
1993 		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1994 				struct ublk_batch_fetch_cmd, node);
1995 		WRITE_ONCE(ubq->active_fcmd, fcmd);
1996 	}
1997 	return fcmd;
1998 }
1999 
2000 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2001 {
2002 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
2003 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2004 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2005 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2006 	struct ublk_batch_io_data data = {
2007 		.ub = pdu->ubq->dev,
2008 		.cmd = fcmd->cmd,
2009 		.issue_flags = issue_flags,
2010 	};
2011 
2012 	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
2013 
2014 	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
2015 }
2016 
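/*
 * Drain the event FIFO using @fcmd, posting tag batches to userspace.  When
 * the FIFO looks empty the active fetch command is released, then the FIFO
 * is re-checked so tags added concurrently are not stranded.
 */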
2017 static void
2018 ublk_batch_dispatch(struct ublk_queue *ubq,
2019 		    const struct ublk_batch_io_data *data,
2020 		    struct ublk_batch_fetch_cmd *fcmd)
2021 {
2022 	struct ublk_batch_fetch_cmd *new_fcmd;
2023 	unsigned tried = 0;
2024 	int ret = 0;
2025 
2026 again:
2027 	while (!ublk_io_evts_empty(ubq)) {
2028 		ret = __ublk_batch_dispatch(ubq, data, fcmd);
2029 		if (ret <= 0)
2030 			break;
2031 	}
2032 
2033 	if (ret < 0) {
2034 		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
2035 		return;
2036 	}
2037 
2038 	__ublk_release_fcmd(ubq);
2039 	/*
2040 	 * Order clearing ubq->active_fcmd in __ublk_release_fcmd() against
2041 	 * checking ubq->evts_fifo.
2042 	 *
2043 	 * This pairs with the smp_mb() in __ublk_acquire_fcmd().
2044 	 */
2045 	smp_mb();
2046 	if (likely(ublk_io_evts_empty(ubq)))
2047 		return;
2048 
2049 	spin_lock(&ubq->evts_lock);
2050 	new_fcmd = __ublk_acquire_fcmd(ubq);
2051 	spin_unlock(&ubq->evts_lock);
2052 
2053 	if (!new_fcmd)
2054 		return;
2055 
2056 	/* Avoid lockup by handling at most 32 batches in one run */
2057 	if (new_fcmd == fcmd && tried++ < 32)
2058 		goto again;
2059 
2060 	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
2061 }
2062 
2063 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2064 {
2065 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2066 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2067 	struct ublk_queue *ubq = pdu->ubq;
2068 
2069 	ublk_dispatch_req(ubq, pdu->req);
2070 }
2071 
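/*
 * Add a single request tag to the per-queue event FIFO.  For the last
 * request of a plug batch, try to claim a fetch command and kick batch
 * dispatching from task work.
 */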
2072 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2073 {
2074 	unsigned short tag = rq->tag;
2075 	struct ublk_batch_fetch_cmd *fcmd = NULL;
2076 
2077 	spin_lock(&ubq->evts_lock);
2078 	kfifo_put(&ubq->evts_fifo, tag);
2079 	if (last)
2080 		fcmd = __ublk_acquire_fcmd(ubq);
2081 	spin_unlock(&ubq->evts_lock);
2082 
2083 	if (fcmd)
2084 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2085 }
2086 
2087 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2088 {
2089 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2090 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2091 
2092 	pdu->req = rq;
2093 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2094 }
2095 
2096 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2097 {
2098 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2099 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2100 	struct request *rq = pdu->req_list;
2101 	struct request *next;
2102 
2103 	do {
2104 		next = rq->rq_next;
2105 		rq->rq_next = NULL;
2106 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2107 		rq = next;
2108 	} while (rq);
2109 }
2110 
2111 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2112 {
2113 	struct io_uring_cmd *cmd = io->cmd;
2114 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2115 
2116 	pdu->req_list = rq_list_peek(l);
2117 	rq_list_init(l);
2118 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2119 }
2120 
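/*
 * Request timeout handler.  For privileged devices simply reset the timer;
 * for an unprivileged device send SIGKILL to the ublk server daemon instead
 * of waiting forever.
 */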
2121 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2122 {
2123 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2124 	pid_t tgid = ubq->dev->ublksrv_tgid;
2125 	struct task_struct *p;
2126 	struct pid *pid;
2127 
2128 	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2129 		return BLK_EH_RESET_TIMER;
2130 
2131 	if (unlikely(!tgid))
2132 		return BLK_EH_RESET_TIMER;
2133 
2134 	rcu_read_lock();
2135 	pid = find_vpid(tgid);
2136 	p = pid_task(pid, PIDTYPE_PID);
2137 	if (p)
2138 		send_sig(SIGKILL, p, 0);
2139 	rcu_read_unlock();
2140 	return BLK_EH_DONE;
2141 }
2142 
2143 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2144 				  bool check_cancel)
2145 {
2146 	blk_status_t res;
2147 
2148 	if (unlikely(READ_ONCE(ubq->fail_io)))
2149 		return BLK_STS_TARGET;
2150 
2151 	/* With the recovery feature enabled, force_abort is set in
2152 	 * ublk_stop_dev() before calling del_gendisk(). We have to
2153 	 * abort all requeued and new rqs here to let del_gendisk()
2154 	 * move on. Besides, we must not call io_uring_cmd_complete_in_task()
2155 	 * to avoid UAF on the io_uring ctx.
2156 	 *
2157 	 * Note: force_abort is guaranteed to be seen because it is set
2158 	 * before the request queue is unquiesced.
2159 	 */
2160 	if (ublk_nosrv_should_queue_io(ubq) &&
2161 	    unlikely(READ_ONCE(ubq->force_abort)))
2162 		return BLK_STS_IOERR;
2163 
2164 	if (check_cancel && unlikely(ubq->canceling))
2165 		return BLK_STS_IOERR;
2166 
2167 	/* fill iod to slot in io cmd buffer */
2168 	res = ublk_setup_iod(ubq, rq);
2169 	if (unlikely(res != BLK_STS_OK))
2170 		return BLK_STS_IOERR;
2171 
2172 	blk_mq_start_request(rq);
2173 	return BLK_STS_OK;
2174 }
2175 
2176 /*
2177  * Common helper for queue_rq that handles request preparation and
2178  * cancellation checks. Returns status and sets should_queue to indicate
2179  * whether the caller should proceed with queuing the request.
2180  */
2181 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2182 						   struct request *rq,
2183 						   bool *should_queue)
2184 {
2185 	blk_status_t res;
2186 
2187 	res = ublk_prep_req(ubq, rq, false);
2188 	if (res != BLK_STS_OK) {
2189 		*should_queue = false;
2190 		return res;
2191 	}
2192 
2193 	/*
2194 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2195 	 * are dealt with, otherwise this request may not be failed in case
2196 	 * of recovery, causing a hang when deleting the disk
2197 	 */
2198 	if (unlikely(ubq->canceling)) {
2199 		*should_queue = false;
2200 		__ublk_abort_rq(ubq, rq);
2201 		return BLK_STS_OK;
2202 	}
2203 
2204 	*should_queue = true;
2205 	return BLK_STS_OK;
2206 }
2207 
2208 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2209 		const struct blk_mq_queue_data *bd)
2210 {
2211 	struct ublk_queue *ubq = hctx->driver_data;
2212 	struct request *rq = bd->rq;
2213 	bool should_queue;
2214 	blk_status_t res;
2215 
2216 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2217 	if (!should_queue)
2218 		return res;
2219 
2220 	ublk_queue_cmd(ubq, rq);
2221 	return BLK_STS_OK;
2222 }
2223 
2224 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2225 		const struct blk_mq_queue_data *bd)
2226 {
2227 	struct ublk_queue *ubq = hctx->driver_data;
2228 	struct request *rq = bd->rq;
2229 	bool should_queue;
2230 	blk_status_t res;
2231 
2232 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2233 	if (!should_queue)
2234 		return res;
2235 
2236 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2237 	return BLK_STS_OK;
2238 }
2239 
2240 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2241 					     const struct ublk_io *io2)
2242 {
2243 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2244 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2245 		(io->task == io2->task);
2246 }
2247 
2248 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2249 {
2250 	struct ublk_queue *ubq = hctx->driver_data;
2251 	struct ublk_batch_fetch_cmd *fcmd;
2252 
2253 	spin_lock(&ubq->evts_lock);
2254 	fcmd = __ublk_acquire_fcmd(ubq);
2255 	spin_unlock(&ubq->evts_lock);
2256 
2257 	if (fcmd)
2258 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2259 }
2260 
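/*
 * ->queue_rqs() for the non-batch ops: group consecutive requests whose io
 * commands share the same io_uring context and daemon task, and dispatch
 * each group with a single task-work callback.
 */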
2261 static void ublk_queue_rqs(struct rq_list *rqlist)
2262 {
2263 	struct rq_list requeue_list = { };
2264 	struct rq_list submit_list = { };
2265 	struct ublk_io *io = NULL;
2266 	struct request *req;
2267 
2268 	while ((req = rq_list_pop(rqlist))) {
2269 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2270 		struct ublk_io *this_io = &this_q->ios[req->tag];
2271 
2272 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2273 			rq_list_add_tail(&requeue_list, req);
2274 			continue;
2275 		}
2276 
2277 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
2278 				!rq_list_empty(&submit_list))
2279 			ublk_queue_cmd_list(io, &submit_list);
2280 		io = this_io;
2281 		rq_list_add_tail(&submit_list, req);
2282 	}
2283 
2284 	if (!rq_list_empty(&submit_list))
2285 		ublk_queue_cmd_list(io, &submit_list);
2286 	*rqlist = requeue_list;
2287 }
2288 
2289 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2290 {
2291 	unsigned short tags[MAX_NR_TAG];
2292 	struct ublk_batch_fetch_cmd *fcmd;
2293 	struct request *rq;
2294 	unsigned cnt = 0;
2295 
2296 	spin_lock(&ubq->evts_lock);
2297 	rq_list_for_each(l, rq) {
2298 		tags[cnt++] = (unsigned short)rq->tag;
2299 		if (cnt >= MAX_NR_TAG) {
2300 			kfifo_in(&ubq->evts_fifo, tags, cnt);
2301 			cnt = 0;
2302 		}
2303 	}
2304 	if (cnt)
2305 		kfifo_in(&ubq->evts_fifo, tags, cnt);
2306 	fcmd = __ublk_acquire_fcmd(ubq);
2307 	spin_unlock(&ubq->evts_lock);
2308 
2309 	rq_list_init(l);
2310 	if (fcmd)
2311 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2312 }
2313 
2314 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2315 {
2316 	struct rq_list requeue_list = { };
2317 	struct rq_list submit_list = { };
2318 	struct ublk_queue *ubq = NULL;
2319 	struct request *req;
2320 
2321 	while ((req = rq_list_pop(rqlist))) {
2322 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2323 
2324 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2325 			rq_list_add_tail(&requeue_list, req);
2326 			continue;
2327 		}
2328 
2329 		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2330 			ublk_batch_queue_cmd_list(ubq, &submit_list);
2331 		ubq = this_q;
2332 		rq_list_add_tail(&submit_list, req);
2333 	}
2334 
2335 	if (!rq_list_empty(&submit_list))
2336 		ublk_batch_queue_cmd_list(ubq, &submit_list);
2337 	*rqlist = requeue_list;
2338 }
2339 
2340 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2341 		unsigned int hctx_idx)
2342 {
2343 	struct ublk_device *ub = driver_data;
2344 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2345 
2346 	hctx->driver_data = ubq;
2347 	return 0;
2348 }
2349 
2350 static const struct blk_mq_ops ublk_mq_ops = {
2351 	.queue_rq       = ublk_queue_rq,
2352 	.queue_rqs      = ublk_queue_rqs,
2353 	.init_hctx	= ublk_init_hctx,
2354 	.timeout	= ublk_timeout,
2355 };
2356 
2357 static const struct blk_mq_ops ublk_batch_mq_ops = {
2358 	.commit_rqs	= ublk_commit_rqs,
2359 	.queue_rq       = ublk_batch_queue_rq,
2360 	.queue_rqs      = ublk_batch_queue_rqs,
2361 	.init_hctx	= ublk_init_hctx,
2362 	.timeout	= ublk_timeout,
2363 };
2364 
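/*
 * Reset per-queue io state (io->cmd, io->task, buffer address and all flags
 * except UBLK_IO_FLAG_CANCELED) so that a new ublk server can re-fetch the
 * queue.  Called with ubq->cancel_lock held to sync with ublk_cancel_cmd().
 */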
2365 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2366 {
2367 	int i;
2368 
2369 	ubq->nr_io_ready = 0;
2370 
2371 	for (i = 0; i < ubq->q_depth; i++) {
2372 		struct ublk_io *io = &ubq->ios[i];
2373 
2374 		/*
2375 		 * UBLK_IO_FLAG_CANCELED is kept to avoid touching
2376 		 * io->cmd
2377 		 */
2378 		io->flags &= UBLK_IO_FLAG_CANCELED;
2379 		io->cmd = NULL;
2380 		io->buf.addr = 0;
2381 
2382 		/*
2383 		 * The old task is PF_EXITING, put it now.
2384 		 *
2385 		 * It could be NULL when closing a quiesced
2386 		 * device.
2387 		 */
2388 		if (io->task) {
2389 			put_task_struct(io->task);
2390 			io->task = NULL;
2391 		}
2392 
2393 		WARN_ON_ONCE(refcount_read(&io->ref));
2394 		WARN_ON_ONCE(io->task_registered_buffers);
2395 	}
2396 }
2397 
2398 static int ublk_ch_open(struct inode *inode, struct file *filp)
2399 {
2400 	struct ublk_device *ub = container_of(inode->i_cdev,
2401 			struct ublk_device, cdev);
2402 
2403 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2404 		return -EBUSY;
2405 	filp->private_data = ub;
2406 	ub->ublksrv_tgid = current->tgid;
2407 	return 0;
2408 }
2409 
2410 static void ublk_reset_ch_dev(struct ublk_device *ub)
2411 {
2412 	int i;
2413 
2414 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2415 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2416 
2417 		/* Sync with ublk_cancel_cmd() */
2418 		spin_lock(&ubq->cancel_lock);
2419 		ublk_queue_reinit(ub, ubq);
2420 		spin_unlock(&ubq->cancel_lock);
2421 	}
2422 
2423 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2424 	ub->mm = NULL;
2425 	ub->nr_queue_ready = 0;
2426 	ub->unprivileged_daemons = false;
2427 	ub->ublksrv_tgid = -1;
2428 }
2429 
2430 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2431 {
2432 	struct gendisk *disk;
2433 
2434 	spin_lock(&ub->lock);
2435 	disk = ub->ub_disk;
2436 	if (disk)
2437 		get_device(disk_to_dev(disk));
2438 	spin_unlock(&ub->lock);
2439 
2440 	return disk;
2441 }
2442 
2443 static void ublk_put_disk(struct gendisk *disk)
2444 {
2445 	if (disk)
2446 		put_device(disk_to_dev(disk));
2447 }
2448 
2449 static void ublk_partition_scan_work(struct work_struct *work)
2450 {
2451 	struct ublk_device *ub =
2452 		container_of(work, struct ublk_device, partition_scan_work);
2453 	/* Hold disk reference to prevent UAF during concurrent teardown */
2454 	struct gendisk *disk = ublk_get_disk(ub);
2455 
2456 	if (!disk)
2457 		return;
2458 
2459 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2460 					     &disk->state)))
2461 		goto out;
2462 
2463 	mutex_lock(&disk->open_mutex);
2464 	bdev_disk_changed(disk, false);
2465 	mutex_unlock(&disk->open_mutex);
2466 out:
2467 	ublk_put_disk(disk);
2468 }
2469 
2470 /*
2471  * Use this function to ensure that ->canceling is consistently set for
2472  * the device and all queues. Do not set these flags directly.
2473  *
2474  * Caller must ensure that:
2475  * - cancel_mutex is held. This ensures that there is no concurrent
2476  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2477  * - there are no concurrent reads of ubq->canceling from the queue_rq
2478  *   path. This can be done by quiescing the queue, or through other
2479  *   means.
2480  */
2481 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2482 	__must_hold(&ub->cancel_mutex)
2483 {
2484 	int i;
2485 
2486 	ub->canceling = canceling;
2487 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2488 		ublk_get_queue(ub, i)->canceling = canceling;
2489 }
2490 
2491 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2492 {
2493 	int i, j;
2494 
2495 	if (!ublk_dev_need_req_ref(ub))
2496 		return false;
2497 
2498 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2499 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2500 
2501 		for (j = 0; j < ubq->q_depth; j++) {
2502 			struct ublk_io *io = &ubq->ios[j];
2503 			unsigned int refs = refcount_read(&io->ref) +
2504 				io->task_registered_buffers;
2505 
2506 			/*
2507 			 * UBLK_REFCOUNT_INIT or zero means no active
2508 			 * reference
2509 			 */
2510 			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2511 				return true;
2512 
2513 			/* reset to zero if the io has no active references */
2514 			refcount_set(&io->ref, 0);
2515 			io->task_registered_buffers = 0;
2516 		}
2517 	}
2518 	return false;
2519 }
2520 
2521 static void ublk_ch_release_work_fn(struct work_struct *work)
2522 {
2523 	struct ublk_device *ub =
2524 		container_of(work, struct ublk_device, exit_work.work);
2525 	struct gendisk *disk;
2526 	int i;
2527 
2528 	/*
2529 	 * For zero-copy and auto buffer register modes, I/O references
2530 	 * might not be dropped naturally when the daemon is killed, but
2531 	 * io_uring guarantees that registered bvec kernel buffers are
2532 	 * eventually unregistered when the io_uring context is freed, and
2533 	 * then the active references are dropped.
2534 	 *
2535 	 * Wait until active references are dropped to avoid use-after-free.
2536 	 *
2537 	 * Registered buffers may be unregistered in io_uring's release
2538 	 * handler, so we have to wait by rescheduling this work function
2539 	 * to avoid a release dependency between the two files.
2540 	 */
2541 	if (ublk_check_and_reset_active_ref(ub)) {
2542 		schedule_delayed_work(&ub->exit_work, 1);
2543 		return;
2544 	}
2545 
2546 	/*
2547 	 * The disk isn't attached: either the device isn't live yet, or it
2548 	 * has been removed already, so we needn't do anything.
2549 	 */
2550 	disk = ublk_get_disk(ub);
2551 	if (!disk)
2552 		goto out;
2553 
2554 	/*
2555 	 * All uring_cmd are done now, so abort any request outstanding to
2556 	 * the ublk server.
2557 	 *
2558 	 * This can be done in a lockless way because the ublk server is
2559 	 * gone.
2560 	 *
2561 	 * More importantly, we have to provide a forward progress guarantee
2562 	 * without holding ub->mutex, otherwise a control task grabbing
2563 	 * ub->mutex triggers a deadlock.
2564 	 *
2565 	 * All requests may be inflight, so ->canceling may not be set; set
2566 	 * it now.
2567 	 */
2568 	mutex_lock(&ub->cancel_mutex);
2569 	ublk_set_canceling(ub, true);
2570 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2571 		ublk_abort_queue(ub, ublk_get_queue(ub, i));
2572 	mutex_unlock(&ub->cancel_mutex);
2573 	blk_mq_kick_requeue_list(disk->queue);
2574 
2575 	/*
2576 	 * All inflight requests have been completed or requeued and any new
2577 	 * request will be failed or requeued via `->canceling` now, so it is
2578 	 * fine to grab ub->mutex now.
2579 	 */
2580 	mutex_lock(&ub->mutex);
2581 
2582 	/* double check after grabbing lock */
2583 	if (!ub->ub_disk)
2584 		goto unlock;
2585 
2586 	/*
2587 	 * Transition the device to the nosrv state. What exactly this
2588 	 * means depends on the recovery flags
2589 	 */
2590 	if (ublk_nosrv_should_stop_dev(ub)) {
2591 		/*
2592 		 * Allow any pending/future I/O to pass through quickly
2593 		 * with an error. This is needed because del_gendisk
2594 		 * waits for all pending I/O to complete
2595 		 */
2596 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2597 			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2598 
2599 		ublk_stop_dev_unlocked(ub);
2600 	} else {
2601 		if (ublk_nosrv_dev_should_queue_io(ub)) {
2602 			/* ->canceling is set and all requests are aborted */
2603 			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2604 		} else {
2605 			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2606 			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2607 				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2608 		}
2609 	}
2610 unlock:
2611 	mutex_unlock(&ub->mutex);
2612 	ublk_put_disk(disk);
2613 
2614 	/* all uring_cmd have been done now, reset device & ubq */
2615 	ublk_reset_ch_dev(ub);
2616 out:
2617 	clear_bit(UB_STATE_OPEN, &ub->state);
2618 
2619 	/* put the reference grabbed in ublk_ch_release() */
2620 	ublk_put_device(ub);
2621 }
2622 
2623 static int ublk_ch_release(struct inode *inode, struct file *filp)
2624 {
2625 	struct ublk_device *ub = filp->private_data;
2626 
2627 	/*
2628 	 * Grab ublk device reference, so it won't be gone until we are
2629 	 * really released from work function.
2630 	 */
2631 	ublk_get_device(ub);
2632 
2633 	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2634 	schedule_delayed_work(&ub->exit_work, 0);
2635 	return 0;
2636 }
2637 
2638 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2639 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2640 {
2641 	struct ublk_device *ub = filp->private_data;
2642 	size_t sz = vma->vm_end - vma->vm_start;
2643 	unsigned max_sz = ublk_max_cmd_buf_size();
2644 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2645 	int q_id, ret = 0;
2646 
2647 	spin_lock(&ub->lock);
2648 	if (!ub->mm)
2649 		ub->mm = current->mm;
2650 	if (current->mm != ub->mm)
2651 		ret = -EINVAL;
2652 	spin_unlock(&ub->lock);
2653 
2654 	if (ret)
2655 		return ret;
2656 
2657 	if (vma->vm_flags & VM_WRITE)
2658 		return -EPERM;
2659 
2660 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2661 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2662 		return -EINVAL;
2663 
2664 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2665 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2666 			__func__, q_id, current->pid, vma->vm_start,
2667 			phys_off, (unsigned long)sz);
2668 
2669 	if (sz != ublk_queue_cmd_buf_size(ub))
2670 		return -EINVAL;
2671 
2672 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2673 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2674 }
2675 
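/*
 * Fail one request whose uring_cmd owner is gone: either requeue it for
 * reissue after recovery, or complete it with -EIO.
 */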
2676 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2677 		struct request *req)
2678 {
2679 	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2680 			io->flags & UBLK_IO_FLAG_ACTIVE);
2681 
2682 	if (ublk_nosrv_should_reissue_outstanding(ub))
2683 		blk_mq_requeue_request(req, false);
2684 	else {
2685 		io->res = -EIO;
2686 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2687 	}
2688 }
2689 
2690 /*
2691  * Request tags may have just been added to the event kfifo without getting
2692  * a chance to be dispatched; abort these requests too
2693  */
2694 static void ublk_abort_batch_queue(struct ublk_device *ub,
2695 				   struct ublk_queue *ubq)
2696 {
2697 	unsigned short tag;
2698 
2699 	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2700 		struct request *req = blk_mq_tag_to_rq(
2701 				ub->tag_set.tags[ubq->q_id], tag);
2702 
2703 		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2704 			__ublk_fail_req(ub, &ubq->ios[tag], req);
2705 	}
2706 }
2707 
2708 /*
2709  * Called from the ublk char device release handler, after all uring_cmd are
2710  * done; meanwhile the request queue is "quiesced" since all inflight requests
2711  * can't be completed because the ublk server is dead.
2712  *
2713  * So no one can hold our request IO reference any more, simply ignore the
2714  * reference, and complete the request immediately
2715  */
2716 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2717 {
2718 	int i;
2719 
2720 	for (i = 0; i < ubq->q_depth; i++) {
2721 		struct ublk_io *io = &ubq->ios[i];
2722 
2723 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2724 			__ublk_fail_req(ub, io, io->req);
2725 	}
2726 
2727 	if (ublk_support_batch_io(ubq))
2728 		ublk_abort_batch_queue(ub, ubq);
2729 }
2730 
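/*
 * First stage of canceling: with the queue quiesced, mark the device and
 * all queues as canceling so that ublk_queue_rq() stops touching uring_cmds
 * which are about to be completed.
 */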
2731 static void ublk_start_cancel(struct ublk_device *ub)
2732 {
2733 	struct gendisk *disk = ublk_get_disk(ub);
2734 
2735 	/* Our disk is already gone */
2736 	if (!disk)
2737 		return;
2738 
2739 	mutex_lock(&ub->cancel_mutex);
2740 	if (ub->canceling)
2741 		goto out;
2742 	/*
2743 	 * Now we are serialized with ublk_queue_rq()
2744 	 *
2745 	 * Make sure that ubq->canceling is set while the queue is quiesced,
2746 	 * because ublk_queue_rq() has to rely on this flag to avoid touching
2747 	 * a completed uring_cmd
2748 	 */
2749 	blk_mq_quiesce_queue(disk->queue);
2750 	ublk_set_canceling(ub, true);
2751 	blk_mq_unquiesce_queue(disk->queue);
2752 out:
2753 	mutex_unlock(&ub->cancel_mutex);
2754 	ublk_put_disk(disk);
2755 }
2756 
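/*
 * Complete the inflight uring_cmd of @tag with UBLK_IO_RES_ABORT, unless
 * the corresponding request has been started (it will be aborted or
 * dispatched later) or the command was already canceled.
 */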
2757 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2758 		unsigned int issue_flags)
2759 {
2760 	struct ublk_io *io = &ubq->ios[tag];
2761 	struct ublk_device *ub = ubq->dev;
2762 	struct io_uring_cmd *cmd = NULL;
2763 	struct request *req;
2764 	bool done;
2765 
2766 	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2767 		return;
2768 
2769 	/*
2770 	 * Don't try to cancel this command if the request has been started,
2771 	 * to avoid a race between io_uring_cmd_done() and
2772 	 * io_uring_cmd_complete_in_task().
2773 	 *
2774 	 * Either the started request will be aborted via __ublk_abort_rq(),
2775 	 * then this uring_cmd is canceled next time, or it will be done in
2776 	 * task work function ublk_dispatch_req() because io_uring guarantees
2777 	 * that ublk_dispatch_req() is always called
2778 	 */
2779 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2780 	if (req && blk_mq_request_started(req) && req->tag == tag)
2781 		return;
2782 
2783 	spin_lock(&ubq->cancel_lock);
2784 	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2785 	if (!done) {
2786 		io->flags |= UBLK_IO_FLAG_CANCELED;
2787 		cmd = io->cmd;
2788 		io->cmd = NULL;
2789 	}
2790 	spin_unlock(&ubq->cancel_lock);
2791 
2792 	if (!done && cmd)
2793 		io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, issue_flags);
2794 }
2795 
2796 /*
2797  * Cancel a batch fetch command if it hasn't been claimed by another path.
2798  *
2799  * An fcmd can only be cancelled if:
2800  * 1. It's not the active_fcmd (which is currently being processed)
2801  * 2. It's still on the list (!list_empty check) - once removed from the list,
2802  *    the fcmd is considered claimed and will be freed by whoever removed it
2803  *
2804  * Use list_del_init() so subsequent list_empty() checks work correctly.
2805  */
2806 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2807 				  struct ublk_batch_fetch_cmd *fcmd,
2808 				  unsigned int issue_flags)
2809 {
2810 	bool done;
2811 
2812 	spin_lock(&ubq->evts_lock);
2813 	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2814 	if (done)
2815 		list_del_init(&fcmd->node);
2816 	spin_unlock(&ubq->evts_lock);
2817 
2818 	if (done) {
2819 		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2820 		ublk_batch_free_fcmd(fcmd);
2821 	}
2822 }
2823 
2824 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2825 {
2826 	struct ublk_batch_fetch_cmd *fcmd;
2827 	LIST_HEAD(fcmd_list);
2828 
2829 	spin_lock(&ubq->evts_lock);
2830 	ubq->force_abort = true;
2831 	list_splice_init(&ubq->fcmd_head, &fcmd_list);
2832 	fcmd = READ_ONCE(ubq->active_fcmd);
2833 	if (fcmd)
2834 		list_move(&fcmd->node, &ubq->fcmd_head);
2835 	spin_unlock(&ubq->evts_lock);
2836 
2837 	while (!list_empty(&fcmd_list)) {
2838 		fcmd = list_first_entry(&fcmd_list,
2839 				struct ublk_batch_fetch_cmd, node);
2840 		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2841 	}
2842 }
2843 
2844 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2845 				 unsigned int issue_flags)
2846 {
2847 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2848 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2849 	struct ublk_queue *ubq = pdu->ubq;
2850 
2851 	ublk_start_cancel(ubq->dev);
2852 
2853 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2854 }
2855 
2856 /*
2857  * The ublk char device won't be closed when calling cancel fn, so both
2858  * ublk device and queue are guaranteed to be live
2859  *
2860  * Two-stage cancel:
2861  *
2862  * - make every active uring_cmd done in ->cancel_fn()
2863  *
2864  * - aborting inflight ublk IO requests in ublk char device release handler,
2865  *   which depends on the 1st stage because the device can only be closed
2866  *   after all uring_cmd are done
2867  *
2868  * Do _not_ try to acquire ub->mutex before all inflight requests are
2869  * aborted, otherwise deadlock may be caused.
2870  */
2871 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2872 		unsigned int issue_flags)
2873 {
2874 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2875 	struct ublk_queue *ubq = pdu->ubq;
2876 	struct task_struct *task;
2877 	struct ublk_io *io;
2878 
2879 	if (WARN_ON_ONCE(!ubq))
2880 		return;
2881 
2882 	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2883 		return;
2884 
2885 	task = io_uring_cmd_get_task(cmd);
2886 	io = &ubq->ios[pdu->tag];
2887 	if (WARN_ON_ONCE(task && task != io->task))
2888 		return;
2889 
2890 	ublk_start_cancel(ubq->dev);
2891 
2892 	WARN_ON_ONCE(io->cmd != cmd);
2893 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2894 }
2895 
2896 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2897 {
2898 	return ubq->nr_io_ready == ubq->q_depth;
2899 }
2900 
2901 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2902 {
2903 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2904 }
2905 
2906 static void ublk_cancel_queue(struct ublk_queue *ubq)
2907 {
2908 	int i;
2909 
2910 	if (ublk_support_batch_io(ubq)) {
2911 		ublk_batch_cancel_queue(ubq);
2912 		return;
2913 	}
2914 
2915 	for (i = 0; i < ubq->q_depth; i++)
2916 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2917 }
2918 
2919 /* Cancel all pending commands, must be called after del_gendisk() returns */
2920 static void ublk_cancel_dev(struct ublk_device *ub)
2921 {
2922 	int i;
2923 
2924 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2925 		ublk_cancel_queue(ublk_get_queue(ub, i));
2926 }
2927 
2928 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2929 {
2930 	bool *idle = data;
2931 
2932 	if (blk_mq_request_started(rq)) {
2933 		*idle = false;
2934 		return false;
2935 	}
2936 	return true;
2937 }
2938 
2939 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2940 {
2941 	bool idle;
2942 
2943 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2944 	while (true) {
2945 		idle = true;
2946 		blk_mq_tagset_busy_iter(&ub->tag_set,
2947 				ublk_check_inflight_rq, &idle);
2948 		if (idle)
2949 			break;
2950 		msleep(UBLK_REQUEUE_DELAY_MS);
2951 	}
2952 }
2953 
2954 static void ublk_force_abort_dev(struct ublk_device *ub)
2955 {
2956 	int i;
2957 
2958 	pr_devel("%s: force abort ub: dev_id %d state %s\n",
2959 			__func__, ub->dev_info.dev_id,
2960 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
2961 			"LIVE" : "QUIESCED");
2962 	blk_mq_quiesce_queue(ub->ub_disk->queue);
2963 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2964 		ublk_wait_tagset_rqs_idle(ub);
2965 
2966 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2967 		ublk_get_queue(ub, i)->force_abort = true;
2968 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2969 	/* We may have requeued some rqs in ublk_quiesce_queue() */
2970 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2971 }
2972 
2973 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2974 {
2975 	struct gendisk *disk;
2976 
2977 	/* Sync with ublk_abort_queue() by holding the lock */
2978 	spin_lock(&ub->lock);
2979 	disk = ub->ub_disk;
2980 	ub->dev_info.state = UBLK_S_DEV_DEAD;
2981 	ub->dev_info.ublksrv_pid = -1;
2982 	ub->ub_disk = NULL;
2983 	spin_unlock(&ub->lock);
2984 
2985 	return disk;
2986 }
2987 
2988 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2989 	__must_hold(&ub->mutex)
2990 {
2991 	struct gendisk *disk;
2992 
2993 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2994 		return;
2995 
2996 	if (ublk_nosrv_dev_should_queue_io(ub))
2997 		ublk_force_abort_dev(ub);
2998 	del_gendisk(ub->ub_disk);
2999 	disk = ublk_detach_disk(ub);
3000 	put_disk(disk);
3001 }
3002 
3003 static void ublk_stop_dev(struct ublk_device *ub)
3004 {
3005 	mutex_lock(&ub->mutex);
3006 	ublk_stop_dev_unlocked(ub);
3007 	mutex_unlock(&ub->mutex);
3008 	cancel_work_sync(&ub->partition_scan_work);
3009 	ublk_cancel_dev(ub);
3010 }
3011 
3012 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
3013 {
3014 	/* UBLK_IO_FLAG_CANCELED can be cleared now */
3015 	spin_lock(&ubq->cancel_lock);
3016 	io->flags &= ~UBLK_IO_FLAG_CANCELED;
3017 	spin_unlock(&ubq->cancel_lock);
3018 }
3019 
3020 /* reset per-queue io flags */
3021 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
3022 {
3023 	spin_lock(&ubq->cancel_lock);
3024 	ubq->canceling = false;
3025 	spin_unlock(&ubq->cancel_lock);
3026 	ubq->fail_io = false;
3027 }
3028 
3029 /* device can only be started after all IOs are ready */
3030 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
3031 	struct ublk_io *io)
3032 	__must_hold(&ub->mutex)
3033 {
3034 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
3035 
3036 	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
3037 		ub->unprivileged_daemons = true;
3038 
3039 	ubq->nr_io_ready++;
3040 	ublk_reset_io_flags(ubq, io);
3041 
3042 	/* Check if this specific queue is now fully ready */
3043 	if (ublk_queue_ready(ubq)) {
3044 		ub->nr_queue_ready++;
3045 
3046 		/*
3047 		 * Reset queue flags as soon as this queue is ready.
3048 		 * This clears the canceling flag, allowing batch FETCH commands
3049 		 * to succeed during recovery without waiting for all queues.
3050 		 */
3051 		ublk_queue_reset_io_flags(ubq);
3052 	}
3053 
3054 	/* Check if all queues are ready */
3055 	if (ublk_dev_ready(ub)) {
3056 		/*
3057 		 * All queues ready - clear device-level canceling flag
3058 		 * and complete the recovery/initialization.
3059 		 */
3060 		mutex_lock(&ub->cancel_mutex);
3061 		ub->canceling = false;
3062 		mutex_unlock(&ub->cancel_mutex);
3063 		complete_all(&ub->completion);
3064 	}
3065 }
3066 
3067 static inline int ublk_check_cmd_op(u32 cmd_op)
3068 {
3069 	u32 ioc_type = _IOC_TYPE(cmd_op);
3070 
3071 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3072 		return -EOPNOTSUPP;
3073 
3074 	if (ioc_type != 'u' && ioc_type != 0)
3075 		return -EOPNOTSUPP;
3076 
3077 	return 0;
3078 }
3079 
3080 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3081 {
3082 	struct ublk_auto_buf_reg buf;
3083 
3084 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3085 
3086 	if (buf.reserved0 || buf.reserved1)
3087 		return -EINVAL;
3088 
3089 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3090 		return -EINVAL;
3091 	io->buf.auto_reg = buf;
3092 	return 0;
3093 }
3094 
3095 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3096 				    struct io_uring_cmd *cmd,
3097 				    u16 *buf_idx)
3098 {
3099 	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3100 		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3101 
3102 		/*
3103 		 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3104 		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
3105 		 * `io_ring_ctx`.
3106 		 *
3107 		 * If this uring_cmd's io_ring_ctx isn't the same as the one
3108 		 * used to register the buffer, it is the ublk server's
3109 		 * responsibility to unregister the buffer, otherwise
3110 		 * this ublk request gets stuck.
3111 		 */
3112 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3113 			*buf_idx = io->buf.auto_reg.index;
3114 	}
3115 }
3116 
3117 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3118 				    struct io_uring_cmd *cmd,
3119 				    u16 *buf_idx)
3120 {
3121 	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3122 	return ublk_set_auto_buf_reg(io, cmd);
3123 }
3124 
3125 /* Once we return, `io->req` can't be used any more */
3126 static inline struct request *
3127 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3128 {
3129 	struct request *req = io->req;
3130 
3131 	io->cmd = cmd;
3132 	io->flags |= UBLK_IO_FLAG_ACTIVE;
3133 	/* now this cmd slot is owned by ublk driver */
3134 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3135 
3136 	return req;
3137 }
3138 
3139 static inline int
3140 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3141 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3142 		   u16 *buf_idx)
3143 {
3144 	if (ublk_dev_support_auto_buf_reg(ub))
3145 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3146 
3147 	io->buf.addr = buf_addr;
3148 	return 0;
3149 }
3150 
3151 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3152 				    unsigned int issue_flags,
3153 				    struct ublk_queue *ubq, unsigned int tag)
3154 {
3155 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3156 
3157 	/*
3158 	 * Safe to refer to @ubq since the ublk_queue won't die until its
3159 	 * commands are completed
3160 	 */
3161 	pdu->ubq = ubq;
3162 	pdu->tag = tag;
3163 	io_uring_cmd_mark_cancelable(cmd, issue_flags);
3164 }
3165 
3166 static void ublk_io_release(void *priv)
3167 {
3168 	struct request *rq = priv;
3169 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3170 	struct ublk_io *io = &ubq->ios[rq->tag];
3171 
3172 	/*
3173 	 * task_registered_buffers may be 0 if buffers were registered off task
3174 	 * but unregistered on task, or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3175 	 */
3176 	if (current == io->task && io->task_registered_buffers)
3177 		io->task_registered_buffers--;
3178 	else
3179 		ublk_put_req_ref(io, rq);
3180 }
3181 
3182 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
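/*
 * Register the request's data pages as an io_uring fixed buffer at @index.
 * A request reference is taken here and dropped via ublk_io_release() when
 * the buffer is unregistered; safe to call from any task.
 */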
3183 				struct ublk_device *ub,
3184 				u16 q_id, u16 tag,
3185 				struct ublk_io *io,
3186 				unsigned int index, unsigned int issue_flags)
3187 {
3188 	struct request *req;
3189 	int ret;
3190 
3191 	if (!ublk_dev_support_zero_copy(ub))
3192 		return -EINVAL;
3193 
3194 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3195 	if (!req)
3196 		return -EINVAL;
3197 
3198 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3199 				      issue_flags);
3200 	if (ret) {
3201 		ublk_put_req_ref(io, req);
3202 		return ret;
3203 	}
3204 
3205 	return 0;
3206 }
3207 
3208 static int
3209 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3210 			    struct ublk_device *ub,
3211 			    u16 q_id, u16 tag, struct ublk_io *io,
3212 			    unsigned index, unsigned issue_flags)
3213 {
3214 	unsigned new_registered_buffers;
3215 	struct request *req = io->req;
3216 	int ret;
3217 
3218 	/*
3219 	 * Ensure there are still references for ublk_sub_req_ref() to release.
3220 	 * If not, fall back on the thread-safe buffer registration.
3221 	 */
3222 	new_registered_buffers = io->task_registered_buffers + 1;
3223 	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3224 		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3225 					    issue_flags);
3226 
3227 	if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3228 		return -EINVAL;
3229 
3230 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3231 				      issue_flags);
3232 	if (ret)
3233 		return ret;
3234 
3235 	io->task_registered_buffers = new_registered_buffers;
3236 	return 0;
3237 }
3238 
3239 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3240 				  const struct ublk_device *ub,
3241 				  unsigned int index, unsigned int issue_flags)
3242 {
3243 	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3244 		return -EINVAL;
3245 
3246 	return io_buffer_unregister_bvec(cmd, index, issue_flags);
3247 }
3248 
3249 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3250 {
3251 	if (ublk_dev_need_map_io(ub)) {
3252 		/*
3253 		 * FETCH_RQ has to provide IO buffer if NEED GET
3254 		 * DATA is not enabled
3255 		 */
3256 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3257 			return -EINVAL;
3258 	} else if (buf_addr) {
3259 		/* User copy requires addr to be unset */
3260 		return -EINVAL;
3261 	}
3262 	return 0;
3263 }
3264 
3265 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3266 			struct ublk_io *io, u16 q_id)
3267 {
3268 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3269 	if (ublk_dev_ready(ub))
3270 		return -EBUSY;
3271 
3272 	/* allow each command to be FETCHed at most once */
3273 	if (io->flags & UBLK_IO_FLAG_ACTIVE)
3274 		return -EINVAL;
3275 
3276 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3277 
3278 	ublk_fill_io_cmd(io, cmd);
3279 
3280 	if (ublk_dev_support_batch_io(ub))
3281 		WRITE_ONCE(io->task, NULL);
3282 	else
3283 		WRITE_ONCE(io->task, get_task_struct(current));
3284 
3285 	return 0;
3286 }
3287 
3288 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3289 		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
3290 {
3291 	int ret;
3292 
3293 	/*
3294 	 * When handling FETCH command for setting up ublk uring queue,
3295 	 * ub->mutex is the innermost lock, and we won't block for handling
3296 	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3297 	 */
3298 	mutex_lock(&ub->mutex);
3299 	ret = __ublk_fetch(cmd, ub, io, q_id);
3300 	if (!ret)
3301 		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3302 	if (!ret)
3303 		ublk_mark_io_ready(ub, q_id, io);
3304 	mutex_unlock(&ub->mutex);
3305 	return ret;
3306 }
3307 
3308 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3309 				       struct ublk_io *io, __u64 buf_addr)
3310 {
3311 	struct request *req = io->req;
3312 
3313 	if (ublk_dev_need_map_io(ub)) {
3314 		/*
3315 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3316 		 * NEED GET DATA is not enabled or it is Read IO.
3317 		 */
3318 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3319 					req_op(req) == REQ_OP_READ))
3320 			return -EINVAL;
3321 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3322 		/*
3323 		 * User copy requires addr to be unset when command is
3324 		 * not zone append
3325 		 */
3326 		return -EINVAL;
3327 	}
3328 
3329 	return 0;
3330 }
3331 
3332 static bool ublk_need_complete_req(const struct ublk_device *ub,
3333 				   struct ublk_io *io)
3334 {
3335 	if (ublk_dev_need_req_ref(ub))
3336 		return ublk_sub_req_ref(io);
3337 	return true;
3338 }
3339 
3340 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3341 			  struct request *req)
3342 {
3343 	/*
3344 	 * We have handled UBLK_IO_NEED_GET_DATA command,
3345 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3346 	 * do the copy work.
3347 	 */
3348 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3349 	/* update iod->addr because ublksrv may have passed a new io buffer */
3350 	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3351 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3352 			__func__, ubq->q_id, req->tag, io->flags,
3353 			ublk_get_iod(ubq, req->tag)->addr);
3354 
3355 	return ublk_start_io(ubq, req, io);
3356 }
3357 
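/*
 * Core handler for per-io uring_cmds (FETCH_REQ, COMMIT_AND_FETCH_REQ,
 * NEED_GET_DATA and buffer (un)registration).  Returns -EIOCBQUEUED when
 * the command stays pending until the next I/O is dispatched to it.
 */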
3358 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3359 		unsigned int issue_flags)
3360 {
3361 	/* May point to userspace-mapped memory */
3362 	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3363 							       struct ublksrv_io_cmd);
3364 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3365 	struct ublk_device *ub = cmd->file->private_data;
3366 	struct ublk_queue *ubq;
3367 	struct ublk_io *io = NULL;
3368 	u32 cmd_op = cmd->cmd_op;
3369 	u16 q_id = READ_ONCE(ub_src->q_id);
3370 	u16 tag = READ_ONCE(ub_src->tag);
3371 	s32 result = READ_ONCE(ub_src->result);
3372 	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3373 	struct request *req;
3374 	int ret;
3375 	bool compl;
3376 
3377 	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3378 
3379 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3380 			__func__, cmd->cmd_op, q_id, tag, result);
3381 
3382 	ret = ublk_check_cmd_op(cmd_op);
3383 	if (ret)
3384 		goto out;
3385 
3386 	/*
3387 	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3388 	 * so no need to validate the q_id, tag, or task
3389 	 */
3390 	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3391 		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3392 
3393 	ret = -EINVAL;
3394 	if (q_id >= ub->dev_info.nr_hw_queues)
3395 		goto out;
3396 
3397 	ubq = ublk_get_queue(ub, q_id);
3398 
3399 	if (tag >= ub->dev_info.queue_depth)
3400 		goto out;
3401 
3402 	io = &ubq->ios[tag];
3403 	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3404 	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3405 		ret = ublk_check_fetch_buf(ub, addr);
3406 		if (ret)
3407 			goto out;
3408 		ret = ublk_fetch(cmd, ub, io, addr, q_id);
3409 		if (ret)
3410 			goto out;
3411 
3412 		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3413 		return -EIOCBQUEUED;
3414 	}
3415 
3416 	if (READ_ONCE(io->task) != current) {
3417 		/*
3418 		 * ublk_register_io_buf() accesses only the io's refcount,
3419 		 * so can be handled on any task
3420 		 */
3421 		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3422 			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3423 						    addr, issue_flags);
3424 
3425 		goto out;
3426 	}
3427 
3428 	/* there is a pending io cmd, something must be wrong */
3429 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3430 		ret = -EBUSY;
3431 		goto out;
3432 	}
3433 
3434 	/*
3435 	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3436 	 * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
3437 	 */
3438 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3439 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3440 		goto out;
3441 
3442 	switch (_IOC_NR(cmd_op)) {
3443 	case UBLK_IO_REGISTER_IO_BUF:
3444 		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3445 						   issue_flags);
3446 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
3447 		ret = ublk_check_commit_and_fetch(ub, io, addr);
3448 		if (ret)
3449 			goto out;
3450 		io->res = result;
3451 		req = ublk_fill_io_cmd(io, cmd);
3452 		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3453 		if (buf_idx != UBLK_INVALID_BUF_IDX)
3454 			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3455 		compl = ublk_need_complete_req(ub, io);
3456 
3457 		if (req_op(req) == REQ_OP_ZONE_APPEND)
3458 			req->__sector = addr;
3459 		if (compl)
3460 			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3461 
3462 		if (ret)
3463 			goto out;
3464 		break;
3465 	case UBLK_IO_NEED_GET_DATA:
3466 		/*
3467 		 * ublk_get_data() may fail and fall back to requeue, so keep
3468 		 * the uring_cmd active first and prepare for handling the new
3469 		 * requeued request
3470 		 */
3471 		req = ublk_fill_io_cmd(io, cmd);
3472 		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3473 		WARN_ON_ONCE(ret);
3474 		if (likely(ublk_get_data(ubq, io, req))) {
3475 			__ublk_prep_compl_io_cmd(io, req);
3476 			return UBLK_IO_RES_OK;
3477 		}
3478 		break;
3479 	default:
3480 		goto out;
3481 	}
3482 	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3483 	return -EIOCBQUEUED;
3484 
3485  out:
3486 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3487 			__func__, cmd_op, tag, ret, io ? io->flags : 0);
3488 	return ret;
3489 }
3490 
3491 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3492 		u16 q_id, u16 tag, struct ublk_io *io)
3493 {
3494 	struct request *req;
3495 
3496 	/*
3497 	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3498 	 * which would overwrite it with io->cmd
3499 	 */
3500 	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3501 	if (!req)
3502 		return NULL;
3503 
3504 	if (!ublk_get_req_ref(io))
3505 		return NULL;
3506 
3507 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3508 		goto fail_put;
3509 
3510 	if (!ublk_rq_has_data(req))
3511 		goto fail_put;
3512 
3513 	return req;
3514 fail_put:
3515 	ublk_put_req_ref(io, req);
3516 	return NULL;
3517 }
3518 
3519 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3520 {
3521 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3522 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3523 	int ret = -ECANCELED;
3524 
3525 	if (!tw.cancel)
3526 		ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3527 	if (ret != -EIOCBQUEUED)
3528 		io_uring_cmd_done(cmd, ret, issue_flags);
3529 }
3530 
3531 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3532 {
3533 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3534 		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3535 		return 0;
3536 	}
3537 
3538 	/* a well-implemented server won't run into the unlocked path */
3539 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3540 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3541 		return -EIOCBQUEUED;
3542 	}
3543 
3544 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
3545 }
3546 
3547 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3548 					const struct ublk_elem_header *elem)
3549 {
3550 	const void *buf = elem;
3551 
3552 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3553 		return *(const __u64 *)(buf + sizeof(*elem));
3554 	return 0;
3555 }
3556 
3557 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3558 					const struct ublk_elem_header *elem)
3559 {
3560 	const void *buf = elem;
3561 
3562 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3563 		return *(const __u64 *)(buf + sizeof(*elem) +
3564 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3565 	return -1;
3566 }
3567 
3568 static struct ublk_auto_buf_reg
3569 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3570 			const struct ublk_elem_header *elem)
3571 {
3572 	struct ublk_auto_buf_reg reg = {
3573 		.index = elem->buf_index,
3574 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3575 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3576 	};
3577 
3578 	return reg;
3579 }
3580 
3581 /*
3582  * 48 can hold any type of buffer element (8, 16 and 24 bytes) because
3583  * it is the least common multiple (LCM) of 8, 16 and 24
3584  */
3585 #define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
3586 struct ublk_batch_io_iter {
3587 	void __user *uaddr;
3588 	unsigned done, total;
3589 	unsigned char elem_bytes;
3590 	/* copy to this buffer from user space */
3591 	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3592 };
3593 
3594 static inline int
3595 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3596 		    struct ublk_batch_io_iter *iter,
3597 		    const struct ublk_batch_io_data *data,
3598 		    unsigned bytes,
3599 		    int (*cb)(struct ublk_queue *q,
3600 			    const struct ublk_batch_io_data *data,
3601 			    const struct ublk_elem_header *elem))
3602 {
3603 	unsigned int i;
3604 	int ret = 0;
3605 
3606 	for (i = 0; i < bytes; i += iter->elem_bytes) {
3607 		const struct ublk_elem_header *elem =
3608 			(const struct ublk_elem_header *)&iter->buf[i];
3609 
3610 		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3611 			ret = -EINVAL;
3612 			break;
3613 		}
3614 
3615 		ret = cb(ubq, data, elem);
3616 		if (unlikely(ret))
3617 			break;
3618 	}
3619 
3620 	iter->done += i;
3621 	return ret;
3622 }
3623 
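/*
 * Copy the batch command buffer from userspace in chunks and invoke @cb on
 * each element.  iter->done records how far we got, allowing the caller to
 * revert partially processed elements on failure.
 */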
3624 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3625 			     const struct ublk_batch_io_data *data,
3626 			     int (*cb)(struct ublk_queue *q,
3627 				     const struct ublk_batch_io_data *data,
3628 				     const struct ublk_elem_header *elem))
3629 {
3630 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3631 	int ret = 0;
3632 
3633 	while (iter->done < iter->total) {
3634 		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3635 
3636 		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3637 			pr_warn("ublk%d: read batch cmd buffer failed\n",
3638 					data->ub->dev_info.dev_id);
3639 			return -EFAULT;
3640 		}
3641 
3642 		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3643 		if (ret)
3644 			return ret;
3645 	}
3646 	return 0;
3647 }
3648 
3649 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3650 				const struct ublk_batch_io_data *data,
3651 				const struct ublk_elem_header *elem)
3652 {
3653 	struct ublk_io *io = &ubq->ios[elem->tag];
3654 
3655 	/*
3656 	 * If the queue was ready before this decrement, it won't be anymore,
3657 	 * so decrement the device's ready-queue count and restore the
3658 	 * canceling flag to prevent new requests from being queued.
3659 	 */
3660 	if (ublk_queue_ready(ubq)) {
3661 		data->ub->nr_queue_ready--;
3662 		spin_lock(&ubq->cancel_lock);
3663 		ubq->canceling = true;
3664 		spin_unlock(&ubq->cancel_lock);
3665 	}
3666 	ubq->nr_io_ready--;
3667 
3668 	ublk_io_lock(io);
3669 	io->flags = 0;
3670 	ublk_io_unlock(io);
3671 	return 0;
3672 }
3673 
3674 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3675 				       const struct ublk_batch_io_data *data)
3676 {
3677 	int ret;
3678 
3679 	/* Re-process only what we've already processed, starting from the beginning */
3680 	iter->total = iter->done;
3681 	iter->done = 0;
3682 
3683 	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3684 	WARN_ON_ONCE(ret);
3685 }
3686 
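/*
 * Prepare one io element: pick up its buffer description (auto buffer
 * registration or user address), run the common fetch path and mark the io
 * ready on success.
 */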
3687 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3688 			      const struct ublk_batch_io_data *data,
3689 			      const struct ublk_elem_header *elem)
3690 {
3691 	struct ublk_io *io = &ubq->ios[elem->tag];
3692 	const struct ublk_batch_io *uc = &data->header;
3693 	union ublk_io_buf buf = { 0 };
3694 	int ret;
3695 
3696 	if (ublk_dev_support_auto_buf_reg(data->ub))
3697 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3698 	else if (ublk_dev_need_map_io(data->ub)) {
3699 		buf.addr = ublk_batch_buf_addr(uc, elem);
3700 
3701 		ret = ublk_check_fetch_buf(data->ub, buf.addr);
3702 		if (ret)
3703 			return ret;
3704 	}
3705 
3706 	ublk_io_lock(io);
3707 	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3708 	if (!ret)
3709 		io->buf = buf;
3710 	ublk_io_unlock(io);
3711 
3712 	if (!ret)
3713 		ublk_mark_io_ready(data->ub, ubq->q_id, io);
3714 
3715 	return ret;
3716 }
3717 
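/*
 * UBLK_U_IO_PREP_IO_CMDS: walk the element buffer under ub->mutex and prepare
 * every io; if a later element fails, the already-prepared ones are reverted.
 */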
3718 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3719 {
3720 	const struct ublk_batch_io *uc = &data->header;
3721 	struct io_uring_cmd *cmd = data->cmd;
3722 	struct ublk_batch_io_iter iter = {
3723 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3724 		.total = uc->nr_elem * uc->elem_bytes,
3725 		.elem_bytes = uc->elem_bytes,
3726 	};
3727 	int ret;
3728 
3729 	mutex_lock(&data->ub->mutex);
3730 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3731 
3732 	if (ret && iter.done)
3733 		ublk_batch_revert_prep_cmd(&iter, data);
3734 	mutex_unlock(&data->ub->mutex);
3735 	return ret;
3736 }
3737 
3738 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3739 				      struct ublk_io *io,
3740 				      union ublk_io_buf *buf)
3741 {
3742 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3743 		return -EBUSY;
3744 
3745 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3746 	if (ublk_need_map_io(ubq) && !buf->addr)
3747 		return -EINVAL;
3748 	return 0;
3749 }
3750 
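/*
 * Commit one io element: record the completion result and buffer, drop any
 * auto-registered buffer, fix up the zone-append LBA if needed and complete
 * the backing request when it is due.
 */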
3751 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3752 				const struct ublk_batch_io_data *data,
3753 				const struct ublk_elem_header *elem)
3754 {
3755 	struct ublk_io *io = &ubq->ios[elem->tag];
3756 	const struct ublk_batch_io *uc = &data->header;
3757 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3758 	union ublk_io_buf buf = { 0 };
3759 	struct request *req = NULL;
3760 	bool auto_reg = false;
3761 	bool compl = false;
3762 	int ret;
3763 
3764 	if (ublk_dev_support_auto_buf_reg(data->ub)) {
3765 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3766 		auto_reg = true;
3767 	} else if (ublk_dev_need_map_io(data->ub))
3768 		buf.addr = ublk_batch_buf_addr(uc, elem);
3769 
3770 	ublk_io_lock(io);
3771 	ret = ublk_batch_commit_io_check(ubq, io, &buf);
3772 	if (!ret) {
3773 		io->res = elem->result;
3774 		io->buf = buf;
3775 		req = ublk_fill_io_cmd(io, data->cmd);
3776 
3777 		if (auto_reg)
3778 			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3779 		compl = ublk_need_complete_req(data->ub, io);
3780 	}
3781 	ublk_io_unlock(io);
3782 
3783 	if (unlikely(ret)) {
3784 		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3785 			__func__, data->ub->dev_info.dev_id, ubq->q_id,
3786 			elem->tag, ret);
3787 		return ret;
3788 	}
3789 
3790 	if (buf_idx != UBLK_INVALID_BUF_IDX)
3791 		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3792 	if (req_op(req) == REQ_OP_ZONE_APPEND)
3793 		req->__sector = ublk_batch_zone_lba(uc, elem);
3794 	if (compl)
3795 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3796 	return 0;
3797 }
3798 
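/*
 * UBLK_U_IO_COMMIT_IO_CMDS: commit each element and flush the collected io
 * completion batch; returns the number of bytes consumed, or an error when
 * nothing was processed.
 */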
3799 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3800 {
3801 	const struct ublk_batch_io *uc = &data->header;
3802 	struct io_uring_cmd *cmd = data->cmd;
3803 	struct ublk_batch_io_iter iter = {
3804 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3805 		.total = uc->nr_elem * uc->elem_bytes,
3806 		.elem_bytes = uc->elem_bytes,
3807 	};
3808 	DEFINE_IO_COMP_BATCH(iob);
3809 	int ret;
3810 
3811 	data->iob = &iob;
3812 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3813 
3814 	if (iob.complete)
3815 		iob.complete(&iob);
3816 
3817 	return iter.done == 0 ? ret : iter.done;
3818 }
3819 
3820 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3821 {
3822 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
3823 
3824 	if (uc->flags & ~UBLK_BATCH_F_ALL)
3825 		return -EINVAL;
3826 
3827 	/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK needs a buffer index, not a buffer address */
3828 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3829 			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3830 		return -EINVAL;
3831 
3832 	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3833 		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3834 	if (uc->elem_bytes != elem_bytes)
3835 		return -EINVAL;
3836 	return 0;
3837 }
3838 
3839 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3840 {
3841 	const struct ublk_batch_io *uc = &data->header;
3842 
3843 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3844 		return -EINVAL;
3845 
3846 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
3847 		return -E2BIG;
3848 
3849 	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3850 			!ublk_dev_is_zoned(data->ub))
3851 		return -EINVAL;
3852 
3853 	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3854 			!ublk_dev_need_map_io(data->ub))
3855 		return -EINVAL;
3856 
3857 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3858 			!ublk_dev_support_auto_buf_reg(data->ub))
3859 		return -EINVAL;
3860 
3861 	return ublk_check_batch_cmd_flags(uc);
3862 }
3863 
3864 static int ublk_batch_attach(struct ublk_queue *ubq,
3865 			     struct ublk_batch_io_data *data,
3866 			     struct ublk_batch_fetch_cmd *fcmd)
3867 {
3868 	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3869 	bool free = false;
3870 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3871 
3872 	spin_lock(&ubq->evts_lock);
3873 	if (unlikely(ubq->force_abort || ubq->canceling)) {
3874 		free = true;
3875 	} else {
3876 		list_add_tail(&fcmd->node, &ubq->fcmd_head);
3877 		new_fcmd = __ublk_acquire_fcmd(ubq);
3878 	}
3879 	spin_unlock(&ubq->evts_lock);
3880 
3881 	if (unlikely(free)) {
3882 		ublk_batch_free_fcmd(fcmd);
3883 		return -ENODEV;
3884 	}
3885 
3886 	pdu->ubq = ubq;
3887 	pdu->fcmd = fcmd;
3888 	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3889 
3890 	if (!new_fcmd)
3891 		goto out;
3892 
3893 	/*
3894 	 * If the two fetch commands originate from the same io_ring_ctx,
3895 	 * run batch dispatch directly. Otherwise, schedule task work to
3896 	 * do it.
3897 	 */
3898 	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3899 			io_uring_cmd_ctx_handle(fcmd->cmd)) {
3900 		data->cmd = new_fcmd->cmd;
3901 		ublk_batch_dispatch(ubq, data, new_fcmd);
3902 	} else {
3903 		io_uring_cmd_complete_in_task(new_fcmd->cmd,
3904 				ublk_batch_tw_cb);
3905 	}
3906 out:
3907 	return -EIOCBQUEUED;
3908 }
3909 
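/*
 * UBLK_U_IO_FETCH_IO_CMDS: allocate a fetch command and attach it to the
 * queue so incoming io events can be delivered through this multishot
 * uring_cmd.
 */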
3910 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3911 {
3912 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3913 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3914 
3915 	if (!fcmd)
3916 		return -ENOMEM;
3917 
3918 	return ublk_batch_attach(ubq, data, fcmd);
3919 }
3920 
3921 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3922 {
3923 	const struct ublk_batch_io *uc = &data->header;
3924 
3925 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3926 		return -EINVAL;
3927 
3928 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3929 		return -EINVAL;
3930 
3931 	if (uc->elem_bytes != sizeof(__u16))
3932 		return -EINVAL;
3933 
3934 	if (uc->flags != 0)
3935 		return -EINVAL;
3936 
3937 	return 0;
3938 }
3939 
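/*
 * Handle non-batch per-io commands on a batch device: only the zero-copy
 * buffer register/unregister commands are supported, anything else is
 * rejected with -EOPNOTSUPP.
 */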
3940 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3941 				     unsigned int issue_flags)
3942 {
3943 	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3944 							       struct ublksrv_io_cmd);
3945 	struct ublk_device *ub = cmd->file->private_data;
3946 	unsigned tag = READ_ONCE(ub_cmd->tag);
3947 	unsigned q_id = READ_ONCE(ub_cmd->q_id);
3948 	unsigned index = READ_ONCE(ub_cmd->addr);
3949 	struct ublk_queue *ubq;
3950 	struct ublk_io *io;
3951 
3952 	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3953 		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3954 
3955 	if (q_id >= ub->dev_info.nr_hw_queues)
3956 		return -EINVAL;
3957 
3958 	if (tag >= ub->dev_info.queue_depth)
3959 		return -EINVAL;
3960 
3961 	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3962 		return -EOPNOTSUPP;
3963 
3964 	ubq = ublk_get_queue(ub, q_id);
3965 	io = &ubq->ios[tag];
3966 	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3967 			issue_flags);
3968 }
3969 
3970 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3971 				       unsigned int issue_flags)
3972 {
3973 	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3974 							  struct ublk_batch_io);
3975 	struct ublk_device *ub = cmd->file->private_data;
3976 	struct ublk_batch_io_data data = {
3977 		.ub  = ub,
3978 		.cmd = cmd,
3979 		.header = (struct ublk_batch_io) {
3980 			.q_id = READ_ONCE(uc->q_id),
3981 			.flags = READ_ONCE(uc->flags),
3982 			.nr_elem = READ_ONCE(uc->nr_elem),
3983 			.elem_bytes = READ_ONCE(uc->elem_bytes),
3984 		},
3985 		.issue_flags = issue_flags,
3986 	};
3987 	u32 cmd_op = cmd->cmd_op;
3988 	int ret = -EINVAL;
3989 
3990 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3991 		ublk_batch_cancel_fn(cmd, issue_flags);
3992 		return 0;
3993 	}
3994 
3995 	switch (cmd_op) {
3996 	case UBLK_U_IO_PREP_IO_CMDS:
3997 		ret = ublk_check_batch_cmd(&data);
3998 		if (ret)
3999 			goto out;
4000 		ret = ublk_handle_batch_prep_cmd(&data);
4001 		break;
4002 	case UBLK_U_IO_COMMIT_IO_CMDS:
4003 		ret = ublk_check_batch_cmd(&data);
4004 		if (ret)
4005 			goto out;
4006 		ret = ublk_handle_batch_commit_cmd(&data);
4007 		break;
4008 	case UBLK_U_IO_FETCH_IO_CMDS:
4009 		ret = ublk_validate_batch_fetch_cmd(&data);
4010 		if (ret)
4011 			goto out;
4012 		ret = ublk_handle_batch_fetch_cmd(&data);
4013 		break;
4014 	default:
4015 		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
4016 		break;
4017 	}
4018 out:
4019 	return ret;
4020 }
4021 
4022 static inline bool ublk_check_ubuf_dir(const struct request *req,
4023 		int ubuf_dir)
4024 {
4025 	/* copy ubuf to request pages */
4026 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
4027 	    ubuf_dir == ITER_SOURCE)
4028 		return true;
4029 
4030 	/* copy request pages to ubuf */
4031 	if ((req_op(req) == REQ_OP_WRITE ||
4032 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
4033 	    ubuf_dir == ITER_DEST)
4034 		return true;
4035 
4036 	return false;
4037 }
4038 
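/*
 * Common handler for read_iter/write_iter on the char device: decode the
 * queue id, tag, buffer offset and integrity flag from the pread()/pwrite()
 * offset, then copy between the user buffer and the request's data (or
 * integrity) pages.
 */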
4039 static ssize_t
4040 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
4041 {
4042 	struct ublk_device *ub = iocb->ki_filp->private_data;
4043 	struct ublk_queue *ubq;
4044 	struct request *req;
4045 	struct ublk_io *io;
4046 	unsigned data_len;
4047 	bool is_integrity;
4048 	bool on_daemon;
4049 	size_t buf_off;
4050 	u16 tag, q_id;
4051 	ssize_t ret;
4052 
4053 	if (!user_backed_iter(iter))
4054 		return -EACCES;
4055 
4056 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
4057 		return -EACCES;
4058 
4059 	tag = ublk_pos_to_tag(iocb->ki_pos);
4060 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
4061 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
4062 	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
4063 
4064 	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
4065 		return -EINVAL;
4066 
4067 	if (q_id >= ub->dev_info.nr_hw_queues)
4068 		return -EINVAL;
4069 
4070 	ubq = ublk_get_queue(ub, q_id);
4071 	if (!ublk_dev_support_user_copy(ub))
4072 		return -EACCES;
4073 
4074 	if (tag >= ub->dev_info.queue_depth)
4075 		return -EINVAL;
4076 
4077 	io = &ubq->ios[tag];
4078 	on_daemon = current == READ_ONCE(io->task);
4079 	if (on_daemon) {
4080 		/* On the daemon task, the io can't be completed concurrently, so skip the ref */
4081 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
4082 			return -EINVAL;
4083 
4084 		req = io->req;
4085 		if (!ublk_rq_has_data(req))
4086 			return -EINVAL;
4087 	} else {
4088 		req = __ublk_check_and_get_req(ub, q_id, tag, io);
4089 		if (!req)
4090 			return -EINVAL;
4091 	}
4092 
4093 	if (is_integrity) {
4094 		struct blk_integrity *bi = &req->q->limits.integrity;
4095 
4096 		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
4097 	} else {
4098 		data_len = blk_rq_bytes(req);
4099 	}
4100 	if (buf_off > data_len) {
4101 		ret = -EINVAL;
4102 		goto out;
4103 	}
4104 
4105 	if (!ublk_check_ubuf_dir(req, dir)) {
4106 		ret = -EACCES;
4107 		goto out;
4108 	}
4109 
4110 	if (is_integrity)
4111 		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4112 	else
4113 		ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4114 
4115 out:
4116 	if (!on_daemon)
4117 		ublk_put_req_ref(io, req);
4118 	return ret;
4119 }
4120 
4121 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4122 {
4123 	return ublk_user_copy(iocb, to, ITER_DEST);
4124 }
4125 
4126 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4127 {
4128 	return ublk_user_copy(iocb, from, ITER_SOURCE);
4129 }
4130 
4131 static const struct file_operations ublk_ch_fops = {
4132 	.owner = THIS_MODULE,
4133 	.open = ublk_ch_open,
4134 	.release = ublk_ch_release,
4135 	.read_iter = ublk_ch_read_iter,
4136 	.write_iter = ublk_ch_write_iter,
4137 	.uring_cmd = ublk_ch_uring_cmd,
4138 	.mmap = ublk_ch_mmap,
4139 };
4140 
4141 static const struct file_operations ublk_ch_batch_io_fops = {
4142 	.owner = THIS_MODULE,
4143 	.open = ublk_ch_open,
4144 	.release = ublk_ch_release,
4145 	.read_iter = ublk_ch_read_iter,
4146 	.write_iter = ublk_ch_write_iter,
4147 	.uring_cmd = ublk_ch_batch_io_uring_cmd,
4148 	.mmap = ublk_ch_mmap,
4149 };
4150 
4151 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4152 {
4153 	int size, i;
4154 
4155 	size = ublk_queue_cmd_buf_size(ub);
4156 
4157 	for (i = 0; i < ubq->q_depth; i++) {
4158 		struct ublk_io *io = &ubq->ios[i];
4159 		if (io->task)
4160 			put_task_struct(io->task);
4161 		WARN_ON_ONCE(refcount_read(&io->ref));
4162 		WARN_ON_ONCE(io->task_registered_buffers);
4163 	}
4164 
4165 	if (ubq->io_cmd_buf)
4166 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4167 
4168 	if (ublk_dev_support_batch_io(ub))
4169 		ublk_io_evts_deinit(ubq);
4170 
4171 	kvfree(ubq);
4172 }
4173 
4174 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4175 {
4176 	struct ublk_queue *ubq = ub->queues[q_id];
4177 
4178 	if (!ubq)
4179 		return;
4180 
4181 	__ublk_deinit_queue(ub, ubq);
4182 	ub->queues[q_id] = NULL;
4183 }
4184 
4185 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4186 {
4187 	unsigned int cpu;
4188 
4189 	/* Find first CPU mapped to this queue */
4190 	for_each_possible_cpu(cpu) {
4191 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4192 			return cpu_to_node(cpu);
4193 	}
4194 
4195 	return NUMA_NO_NODE;
4196 }
4197 
4198 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4199 {
4200 	int depth = ub->dev_info.queue_depth;
4201 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4202 	struct ublk_queue *ubq;
4203 	struct page *page;
4204 	int numa_node;
4205 	int size, i, ret;
4206 
4207 	/* Determine NUMA node based on queue's CPU affinity */
4208 	numa_node = ublk_get_queue_numa_node(ub, q_id);
4209 
4210 	/* Allocate queue structure on local NUMA node */
4211 	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4212 			    numa_node);
4213 	if (!ubq)
4214 		return -ENOMEM;
4215 
4216 	spin_lock_init(&ubq->cancel_lock);
4217 	ubq->flags = ub->dev_info.flags;
4218 	ubq->q_id = q_id;
4219 	ubq->q_depth = depth;
4220 	size = ublk_queue_cmd_buf_size(ub);
4221 
4222 	/* Allocate I/O command buffer on local NUMA node */
4223 	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4224 	if (!page) {
4225 		kvfree(ubq);
4226 		return -ENOMEM;
4227 	}
4228 	ubq->io_cmd_buf = page_address(page);
4229 
4230 	for (i = 0; i < ubq->q_depth; i++)
4231 		spin_lock_init(&ubq->ios[i].lock);
4232 
4233 	if (ublk_dev_support_batch_io(ub)) {
4234 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4235 		if (ret)
4236 			goto fail;
4237 		INIT_LIST_HEAD(&ubq->fcmd_head);
4238 	}
4239 	ub->queues[q_id] = ubq;
4240 	ubq->dev = ub;
4241 
4242 	return 0;
4243 fail:
4244 	__ublk_deinit_queue(ub, ubq);
4245 	return ret;
4246 }
4247 
4248 static void ublk_deinit_queues(struct ublk_device *ub)
4249 {
4250 	int i;
4251 
4252 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4253 		ublk_deinit_queue(ub, i);
4254 }
4255 
4256 static int ublk_init_queues(struct ublk_device *ub)
4257 {
4258 	int i, ret;
4259 
4260 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4261 		ret = ublk_init_queue(ub, i);
4262 		if (ret)
4263 			goto fail;
4264 	}
4265 
4266 	init_completion(&ub->completion);
4267 	return 0;
4268 
4269  fail:
4270 	ublk_deinit_queues(ub);
4271 	return ret;
4272 }
4273 
4274 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4275 {
4276 	int i = idx;
4277 	int err;
4278 
4279 	spin_lock(&ublk_idr_lock);
4280 	/* allocate id; if @idx >= 0, we're requesting that specific id */
4281 	if (i >= 0) {
4282 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4283 		if (err == -ENOSPC)
4284 			err = -EEXIST;
4285 	} else {
4286 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4287 				GFP_NOWAIT);
4288 	}
4289 	spin_unlock(&ublk_idr_lock);
4290 
4291 	if (err >= 0)
4292 		ub->ub_number = err;
4293 
4294 	return err;
4295 }
4296 
4297 static void ublk_free_dev_number(struct ublk_device *ub)
4298 {
4299 	spin_lock(&ublk_idr_lock);
4300 	idr_remove(&ublk_index_idr, ub->ub_number);
4301 	wake_up_all(&ublk_idr_wq);
4302 	spin_unlock(&ublk_idr_lock);
4303 }
4304 
4305 static void ublk_cdev_rel(struct device *dev)
4306 {
4307 	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4308 
4309 	ublk_buf_cleanup(ub);
4310 	blk_mq_free_tag_set(&ub->tag_set);
4311 	ublk_deinit_queues(ub);
4312 	ublk_free_dev_number(ub);
4313 	mutex_destroy(&ub->mutex);
4314 	mutex_destroy(&ub->cancel_mutex);
4315 	kfree(ub);
4316 }
4317 
4318 static int ublk_add_chdev(struct ublk_device *ub)
4319 {
4320 	struct device *dev = &ub->cdev_dev;
4321 	int minor = ub->ub_number;
4322 	int ret;
4323 
4324 	dev->parent = ublk_misc.this_device;
4325 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4326 	dev->class = &ublk_chr_class;
4327 	dev->release = ublk_cdev_rel;
4328 	device_initialize(dev);
4329 
4330 	ret = dev_set_name(dev, "ublkc%d", minor);
4331 	if (ret)
4332 		goto fail;
4333 
4334 	if (ublk_dev_support_batch_io(ub))
4335 		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4336 	else
4337 		cdev_init(&ub->cdev, &ublk_ch_fops);
4338 	ret = cdev_device_add(&ub->cdev, dev);
4339 	if (ret)
4340 		goto fail;
4341 
4342 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4343 		unprivileged_ublks_added++;
4344 	return 0;
4345  fail:
4346 	put_device(dev);
4347 	return ret;
4348 }
4349 
4350 /* align max io buffer size with PAGE_SIZE */
4351 static void ublk_align_max_io_size(struct ublk_device *ub)
4352 {
4353 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4354 
4355 	ub->dev_info.max_io_buf_bytes =
4356 		round_down(max_io_bytes, PAGE_SIZE);
4357 }
4358 
4359 static int ublk_add_tag_set(struct ublk_device *ub)
4360 {
4361 	if (ublk_dev_support_batch_io(ub))
4362 		ub->tag_set.ops = &ublk_batch_mq_ops;
4363 	else
4364 		ub->tag_set.ops = &ublk_mq_ops;
4365 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4366 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4367 	ub->tag_set.numa_node = NUMA_NO_NODE;
4368 	ub->tag_set.driver_data = ub;
4369 	return blk_mq_alloc_tag_set(&ub->tag_set);
4370 }
4371 
4372 static void ublk_remove(struct ublk_device *ub)
4373 {
4374 	bool unprivileged;
4375 
4376 	ublk_stop_dev(ub);
4377 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
4378 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4379 	ublk_put_device(ub);
4380 
4381 	if (unprivileged)
4382 		unprivileged_ublks_added--;
4383 }
4384 
4385 static struct ublk_device *ublk_get_device_from_id(int idx)
4386 {
4387 	struct ublk_device *ub = NULL;
4388 
4389 	if (idx < 0)
4390 		return NULL;
4391 
4392 	spin_lock(&ublk_idr_lock);
4393 	ub = idr_find(&ublk_index_idr, idx);
4394 	if (ub)
4395 		ub = ublk_get_device(ub);
4396 	spin_unlock(&ublk_idr_lock);
4397 
4398 	return ub;
4399 }
4400 
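/*
 * Translate the caller-provided pid from the current pid namespace into a
 * global pid and check that it matches the recorded ublk server tgid.
 */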
4401 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4402 {
4403 	rcu_read_lock();
4404 	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4405 	rcu_read_unlock();
4406 
4407 	return ub->ublksrv_tgid == ublksrv_pid;
4408 }
4409 
4410 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4411 		const struct ublksrv_ctrl_cmd *header)
4412 {
4413 	const struct ublk_param_basic *p = &ub->params.basic;
4414 	int ublksrv_pid = (int)header->data[0];
4415 	struct queue_limits lim = {
4416 		.logical_block_size	= 1 << p->logical_bs_shift,
4417 		.physical_block_size	= 1 << p->physical_bs_shift,
4418 		.io_min			= 1 << p->io_min_shift,
4419 		.io_opt			= 1 << p->io_opt_shift,
4420 		.max_hw_sectors		= p->max_sectors,
4421 		.chunk_sectors		= p->chunk_sectors,
4422 		.virt_boundary_mask	= p->virt_boundary_mask,
4423 		.max_segments		= USHRT_MAX,
4424 		.max_segment_size	= UINT_MAX,
4425 		.dma_alignment		= 3,
4426 	};
4427 	struct gendisk *disk;
4428 	int ret = -EINVAL;
4429 
4430 	if (ublksrv_pid <= 0)
4431 		return -EINVAL;
4432 	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4433 		return -EINVAL;
4434 
4435 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4436 		const struct ublk_param_discard *pd = &ub->params.discard;
4437 
4438 		lim.discard_alignment = pd->discard_alignment;
4439 		lim.discard_granularity = pd->discard_granularity;
4440 		lim.max_hw_discard_sectors = pd->max_discard_sectors;
4441 		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4442 		lim.max_discard_segments = pd->max_discard_segments;
4443 	}
4444 
4445 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4446 		const struct ublk_param_zoned *p = &ub->params.zoned;
4447 
4448 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4449 			return -EOPNOTSUPP;
4450 
4451 		lim.features |= BLK_FEAT_ZONED;
4452 		lim.max_active_zones = p->max_active_zones;
4453 		lim.max_open_zones =  p->max_open_zones;
4454 		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4455 	}
4456 
4457 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4458 		lim.features |= BLK_FEAT_WRITE_CACHE;
4459 		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4460 			lim.features |= BLK_FEAT_FUA;
4461 	}
4462 
4463 	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4464 		lim.features |= BLK_FEAT_ROTATIONAL;
4465 
4466 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4467 		lim.dma_alignment = ub->params.dma.alignment;
4468 
4469 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4470 		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4471 		lim.max_segment_size = ub->params.seg.max_segment_size;
4472 		lim.max_segments = ub->params.seg.max_segments;
4473 	}
4474 
4475 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4476 		const struct ublk_param_integrity *p = &ub->params.integrity;
4477 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4478 
4479 		lim.max_integrity_segments =
4480 			p->max_integrity_segments ?: USHRT_MAX;
4481 		lim.integrity = (struct blk_integrity) {
4482 			.flags = ublk_integrity_flags(p->flags),
4483 			.csum_type = ublk_integrity_csum_type(p->csum_type),
4484 			.metadata_size = p->metadata_size,
4485 			.pi_offset = p->pi_offset,
4486 			.interval_exp = p->interval_exp,
4487 			.tag_size = p->tag_size,
4488 			.pi_tuple_size = pi_tuple_size,
4489 		};
4490 	}
4491 
4492 	if (wait_for_completion_interruptible(&ub->completion) != 0)
4493 		return -EINTR;
4494 
4495 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4496 		return -EINVAL;
4497 
4498 	mutex_lock(&ub->mutex);
4499 	/* the device may no longer be ready in case of UBLK_F_BATCH_IO */
4500 	if (!ublk_dev_ready(ub)) {
4501 		ret = -EINVAL;
4502 		goto out_unlock;
4503 	}
4504 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4505 	    test_bit(UB_STATE_USED, &ub->state)) {
4506 		ret = -EEXIST;
4507 		goto out_unlock;
4508 	}
4509 
4510 	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4511 	if (IS_ERR(disk)) {
4512 		ret = PTR_ERR(disk);
4513 		goto out_unlock;
4514 	}
4515 	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4516 	disk->fops = &ub_fops;
4517 	disk->private_data = ub;
4518 
4519 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4520 	ub->ub_disk = disk;
4521 
4522 	ublk_apply_params(ub);
4523 
4524 	/*
4525 	 * Suppress partition scan to avoid potential IO hang.
4526 	 *
4527 	 * If a ublk server error occurs during the partition scan, the IO
4528 	 * may wait while holding ub->mutex, which can deadlock with other
4529 	 * operations that need the mutex, so defer the partition scan to
4530 	 * async work.
4531 	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4532 	 * permanently.
4533 	 */
4534 	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4535 
4536 	ublk_get_device(ub);
4537 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4538 
4539 	if (ublk_dev_is_zoned(ub)) {
4540 		ret = ublk_revalidate_disk_zones(ub);
4541 		if (ret)
4542 			goto out_put_cdev;
4543 	}
4544 
4545 	ret = add_disk(disk);
4546 	if (ret)
4547 		goto out_put_cdev;
4548 
4549 	set_bit(UB_STATE_USED, &ub->state);
4550 
4551 	/* Skip partition scan if disabled by user */
4552 	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4553 		/* Don't clear it for unprivileged daemons, see comment above */
4554 		if (!ub->unprivileged_daemons)
4555 			clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4556 	} else {
4557 		/* Schedule async partition scan for trusted daemons */
4558 		if (!ub->unprivileged_daemons)
4559 			schedule_work(&ub->partition_scan_work);
4560 	}
4561 
4562 out_put_cdev:
4563 	if (ret) {
4564 		ublk_detach_disk(ub);
4565 		ublk_put_device(ub);
4566 	}
4567 	if (ret)
4568 		put_disk(disk);
4569 out_unlock:
4570 	mutex_unlock(&ub->mutex);
4571 	return ret;
4572 }
4573 
4574 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4575 		const struct ublksrv_ctrl_cmd *header)
4576 {
4577 	void __user *argp = (void __user *)(unsigned long)header->addr;
4578 	cpumask_var_t cpumask;
4579 	unsigned long queue;
4580 	unsigned int retlen;
4581 	unsigned int i;
4582 	int ret;
4583 
4584 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4585 		return -EINVAL;
4586 	if (header->len & (sizeof(unsigned long)-1))
4587 		return -EINVAL;
4588 	if (!header->addr)
4589 		return -EINVAL;
4590 
4591 	queue = header->data[0];
4592 	if (queue >= ub->dev_info.nr_hw_queues)
4593 		return -EINVAL;
4594 
4595 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4596 		return -ENOMEM;
4597 
4598 	for_each_possible_cpu(i) {
4599 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4600 			cpumask_set_cpu(i, cpumask);
4601 	}
4602 
4603 	ret = -EFAULT;
4604 	retlen = min_t(unsigned short, header->len, cpumask_size());
4605 	if (copy_to_user(argp, cpumask, retlen))
4606 		goto out_free_cpumask;
4607 	if (retlen != header->len &&
4608 	    clear_user(argp + retlen, header->len - retlen))
4609 		goto out_free_cpumask;
4610 
4611 	ret = 0;
4612 out_free_cpumask:
4613 	free_cpumask_var(cpumask);
4614 	return ret;
4615 }
4616 
4617 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4618 {
4619 	pr_devel("%s: dev id %d flags %llx\n", __func__,
4620 			info->dev_id, info->flags);
4621 	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4622 			info->nr_hw_queues, info->queue_depth);
4623 }
4624 
4625 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
4626 {
4627 	void __user *argp = (void __user *)(unsigned long)header->addr;
4628 	struct ublksrv_ctrl_dev_info info;
4629 	struct ublk_device *ub;
4630 	int ret = -EINVAL;
4631 
4632 	if (header->len < sizeof(info) || !header->addr)
4633 		return -EINVAL;
4634 	if (header->queue_id != (u16)-1) {
4635 		pr_warn("%s: queue_id is wrong %x\n",
4636 			__func__, header->queue_id);
4637 		return -EINVAL;
4638 	}
4639 
4640 	if (copy_from_user(&info, argp, sizeof(info)))
4641 		return -EFAULT;
4642 
4643 	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
4644 	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
4645 		return -EINVAL;
4646 
4647 	if (capable(CAP_SYS_ADMIN))
4648 		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
4649 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
4650 		return -EPERM;
4651 
4652 	/* forbid nonsense combinations of recovery flags */
4653 	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
4654 	case 0:
4655 	case UBLK_F_USER_RECOVERY:
4656 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
4657 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
4658 		break;
4659 	default:
4660 		pr_warn("%s: invalid recovery flags %llx\n", __func__,
4661 			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
4662 		return -EINVAL;
4663 	}
4664 
4665 	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
4666 		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
4667 		return -EINVAL;
4668 	}
4669 
4670 	/*
4671 	 * An unprivileged device can't be trusted, and RECOVERY and
4672 	 * RECOVERY_REISSUE may still hang error handling, so recovery
4673 	 * features can't be supported for unprivileged ublk for now
4674 	 *
4675 	 * TODO: provide forward progress for the RECOVERY handler, so that
4676 	 * unprivileged devices can benefit from it
4677 	 */
4678 	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
4679 		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
4680 				UBLK_F_USER_RECOVERY);
4681 
4682 		/*
4683 		 * For USER_COPY, we depend on userspace to fill the request
4684 		 * buffer via pwrite() to the ublk char device, which can't
4685 		 * be used for unprivileged devices
4686 		 *
4687 		 * The same applies to zero copy and auto buffer registration.
4688 		 */
4689 		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4690 					UBLK_F_AUTO_BUF_REG))
4691 			return -EINVAL;
4692 	}
4693 
4694 	/* User copy is required to access integrity buffer */
4695 	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
4696 		return -EINVAL;
4697 
4698 	/* the created device is always owned by the current user */
4699 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
4700 
4701 	if (header->dev_id != info.dev_id) {
4702 		pr_warn("%s: dev id not match %u %u\n",
4703 			__func__, header->dev_id, info.dev_id);
4704 		return -EINVAL;
4705 	}
4706 
4707 	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
4708 		pr_warn("%s: dev id is too large. Max supported is %d\n",
4709 			__func__, UBLK_MAX_UBLKS - 1);
4710 		return -EINVAL;
4711 	}
4712 
4713 	ublk_dump_dev_info(&info);
4714 
4715 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4716 	if (ret)
4717 		return ret;
4718 
4719 	ret = -EACCES;
4720 	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
4721 	    unprivileged_ublks_added >= unprivileged_ublks_max)
4722 		goto out_unlock;
4723 
4724 	ret = -ENOMEM;
4725 	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
4726 	if (!ub)
4727 		goto out_unlock;
4728 	mutex_init(&ub->mutex);
4729 	spin_lock_init(&ub->lock);
4730 	mutex_init(&ub->cancel_mutex);
4731 	mt_init(&ub->buf_tree);
4732 	ida_init(&ub->buf_ida);
4733 	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
4734 
4735 	ret = ublk_alloc_dev_number(ub, header->dev_id);
4736 	if (ret < 0)
4737 		goto out_free_ub;
4738 
4739 	memcpy(&ub->dev_info, &info, sizeof(info));
4740 
4741 	/* update device id */
4742 	ub->dev_info.dev_id = ub->ub_number;
4743 
4744 	/*
4745 	 * The 64-bit flags will be copied back to userspace as the feature
4746 	 * negotiation result, so we have to clear flags which the driver
4747 	 * doesn't support yet; userspace then gets the correct flags
4748 	 * (features) to handle.
4749 	 */
4750 	ub->dev_info.flags &= UBLK_F_ALL;
4751 
4752 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
4753 		UBLK_F_URING_CMD_COMP_IN_TASK |
4754 		UBLK_F_PER_IO_DAEMON |
4755 		UBLK_F_BUF_REG_OFF_DAEMON |
4756 		UBLK_F_SAFE_STOP_DEV;
4757 
4758 	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
4759 	if (ublk_dev_support_batch_io(ub))
4760 		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
4761 
4762 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
4763 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4764 				UBLK_F_AUTO_BUF_REG))
4765 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4766 
4767 	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
4768 	if (ublk_dev_support_batch_io(ub))
4769 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4770 
4771 	/*
4772 	 * Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
4773 	 * returning write_append_lba, which is only allowed in case of
4774 	 * user copy or zero copy
4775 	 */
4776 	if (ublk_dev_is_zoned(ub) &&
4777 	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
4778 	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
4779 		ret = -EINVAL;
4780 		goto out_free_dev_number;
4781 	}
4782 
4783 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
4784 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
4785 	ublk_align_max_io_size(ub);
4786 
4787 	ret = ublk_add_tag_set(ub);
4788 	if (ret)
4789 		goto out_free_dev_number;
4790 
4791 	ret = ublk_init_queues(ub);
4792 	if (ret)
4793 		goto out_free_tag_set;
4794 
4795 	ret = -EFAULT;
4796 	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
4797 		goto out_deinit_queues;
4798 
4799 	/*
4800 	 * Add the char dev so that ublksrv daemon can be setup.
4801 	 * ublk_add_chdev() will cleanup everything if it fails.
4802 	 */
4803 	ret = ublk_add_chdev(ub);
4804 	goto out_unlock;
4805 
4806 out_deinit_queues:
4807 	ublk_deinit_queues(ub);
4808 out_free_tag_set:
4809 	blk_mq_free_tag_set(&ub->tag_set);
4810 out_free_dev_number:
4811 	ublk_free_dev_number(ub);
4812 out_free_ub:
4813 	mutex_destroy(&ub->mutex);
4814 	mutex_destroy(&ub->cancel_mutex);
4815 	kfree(ub);
4816 out_unlock:
4817 	mutex_unlock(&ublk_ctl_mutex);
4818 	return ret;
4819 }
4820 
4821 static inline bool ublk_idr_freed(int id)
4822 {
4823 	void *ptr;
4824 
4825 	spin_lock(&ublk_idr_lock);
4826 	ptr = idr_find(&ublk_index_idr, id);
4827 	spin_unlock(&ublk_idr_lock);
4828 
4829 	return ptr == NULL;
4830 }
4831 
4832 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4833 {
4834 	struct ublk_device *ub = *p_ub;
4835 	int idx = ub->ub_number;
4836 	int ret;
4837 
4838 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4839 	if (ret)
4840 		return ret;
4841 
4842 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4843 		ublk_remove(ub);
4844 		set_bit(UB_STATE_DELETED, &ub->state);
4845 	}
4846 
4847 	/* Mark the reference as consumed */
4848 	*p_ub = NULL;
4849 	ublk_put_device(ub);
4850 	mutex_unlock(&ublk_ctl_mutex);
4851 
4852 	/*
4853 	 * Wait until the idr entry is removed, so the index can be reused
4854 	 * after the DEL_DEV command returns.
4855 	 *
4856 	 * If we return early because of a user interrupt, a future delete
4857 	 * command may come:
4858 	 *
4859 	 * - the device number isn't freed, this device won't or needn't
4860 	 *   be deleted again, since UB_STATE_DELETED is set, and device
4861 	 *   will be released after the last reference is dropped
4862 	 *
4863 	 * - the device number is freed already, we will not find this
4864 	 *   device via ublk_get_device_from_id()
4865 	 */
4866 	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4867 		return -EINTR;
4868 	return 0;
4869 }
4870 
4871 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4872 				      const struct ublksrv_ctrl_cmd *header)
4873 {
4874 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4875 			__func__, cmd_op, header->dev_id, header->queue_id,
4876 			header->data[0], header->addr, header->len);
4877 }
4878 
4879 static void ublk_ctrl_stop_dev(struct ublk_device *ub)
4880 {
4881 	ublk_stop_dev(ub);
4882 }
4883 
4884 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4885 {
4886 	struct gendisk *disk;
4887 	int ret = 0;
4888 
4889 	disk = ublk_get_disk(ub);
4890 	if (!disk)
4891 		return -ENODEV;
4892 
4893 	mutex_lock(&disk->open_mutex);
4894 	if (disk_openers(disk) > 0) {
4895 		ret = -EBUSY;
4896 		goto unlock;
4897 	}
4898 	ub->block_open = true;
4899 	/* release open_mutex as del_gendisk() will reacquire it */
4900 	mutex_unlock(&disk->open_mutex);
4901 
4902 	ublk_ctrl_stop_dev(ub);
4903 	goto out;
4904 
4905 unlock:
4906 	mutex_unlock(&disk->open_mutex);
4907 out:
4908 	ublk_put_disk(disk);
4909 	return ret;
4910 }
4911 
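/*
 * Copy dev_info to userspace, reporting the ublk server pid as seen from the
 * caller's pid namespace (or -1 if the server task is gone or not visible).
 */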
4912 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4913 		const struct ublksrv_ctrl_cmd *header)
4914 {
4915 	struct task_struct *p;
4916 	struct pid *pid;
4917 	struct ublksrv_ctrl_dev_info dev_info;
4918 	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4919 	void __user *argp = (void __user *)(unsigned long)header->addr;
4920 
4921 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4922 		return -EINVAL;
4923 
4924 	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4925 	dev_info.ublksrv_pid = -1;
4926 
4927 	if (init_ublksrv_tgid > 0) {
4928 		rcu_read_lock();
4929 		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4930 		p = pid_task(pid, PIDTYPE_TGID);
4931 		if (p) {
4932 			int vnr = task_tgid_vnr(p);
4933 
4934 			if (vnr)
4935 				dev_info.ublksrv_pid = vnr;
4936 		}
4937 		rcu_read_unlock();
4938 	}
4939 
4940 	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4941 		return -EFAULT;
4942 
4943 	return 0;
4944 }
4945 
4946 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4947 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4948 {
4949 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4950 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4951 
4952 	if (ub->ub_disk) {
4953 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4954 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4955 	} else {
4956 		ub->params.devt.disk_major = 0;
4957 		ub->params.devt.disk_minor = 0;
4958 	}
4959 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4960 }
4961 
4962 static int ublk_ctrl_get_params(struct ublk_device *ub,
4963 		const struct ublksrv_ctrl_cmd *header)
4964 {
4965 	void __user *argp = (void __user *)(unsigned long)header->addr;
4966 	struct ublk_params_header ph;
4967 	int ret;
4968 
4969 	if (header->len <= sizeof(ph) || !header->addr)
4970 		return -EINVAL;
4971 
4972 	if (copy_from_user(&ph, argp, sizeof(ph)))
4973 		return -EFAULT;
4974 
4975 	if (ph.len > header->len || !ph.len)
4976 		return -EINVAL;
4977 
4978 	if (ph.len > sizeof(struct ublk_params))
4979 		ph.len = sizeof(struct ublk_params);
4980 
4981 	mutex_lock(&ub->mutex);
4982 	ublk_ctrl_fill_params_devt(ub);
4983 	if (copy_to_user(argp, &ub->params, ph.len))
4984 		ret = -EFAULT;
4985 	else
4986 		ret = 0;
4987 	mutex_unlock(&ub->mutex);
4988 
4989 	return ret;
4990 }
4991 
4992 static int ublk_ctrl_set_params(struct ublk_device *ub,
4993 		const struct ublksrv_ctrl_cmd *header)
4994 {
4995 	void __user *argp = (void __user *)(unsigned long)header->addr;
4996 	struct ublk_params_header ph;
4997 	int ret = -EFAULT;
4998 
4999 	if (header->len <= sizeof(ph) || !header->addr)
5000 		return -EINVAL;
5001 
5002 	if (copy_from_user(&ph, argp, sizeof(ph)))
5003 		return -EFAULT;
5004 
5005 	if (ph.len > header->len || !ph.len || !ph.types)
5006 		return -EINVAL;
5007 
5008 	if (ph.len > sizeof(struct ublk_params))
5009 		ph.len = sizeof(struct ublk_params);
5010 
5011 	mutex_lock(&ub->mutex);
5012 	if (test_bit(UB_STATE_USED, &ub->state)) {
5013 		/*
5014 		 * Parameters can only be changed when device hasn't
5015 		 * been started yet
5016 		 */
5017 		ret = -EACCES;
5018 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
5019 		/* zero out partial copy so no stale params survive */
5020 		memset(&ub->params, 0, sizeof(ub->params));
5021 		ret = -EFAULT;
5022 	} else {
5023 		/* clear all we don't support yet */
5024 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
5025 		ret = ublk_validate_params(ub);
5026 		if (ret)
5027 			memset(&ub->params, 0, sizeof(ub->params));
5028 	}
5029 	mutex_unlock(&ub->mutex);
5030 
5031 	return ret;
5032 }
5033 
5034 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
5035 {
5036 	int ret = -EINVAL;
5037 
5038 	mutex_lock(&ub->mutex);
5039 	if (ublk_nosrv_should_stop_dev(ub))
5040 		goto out_unlock;
5041 	/*
5042 	 * START_RECOVERY is only allowed after:
5043 	 *
5044 	 * (1) UB_STATE_OPEN is not set, which means the dying process has
5045 	 *     exited and the related io_uring ctx is freed, so the file
5046 	 *     struct of /dev/ublkcX is released.
5047 	 *
5048 	 * and one of the following holds:
5049 	 *
5050 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
5051 	 *     (a) has quiesced the request queue
5052 	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
5053 	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
5054 	 *     (d) has completed/canceled all ioucmds owned by the dying process
5055 	 *
5056 	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
5057 	 *     quiesced, but all I/O is being immediately errored
5058 	 */
5059 	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
5060 		ret = -EBUSY;
5061 		goto out_unlock;
5062 	}
5063 	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
5064 	init_completion(&ub->completion);
5065 	ret = 0;
5066  out_unlock:
5067 	mutex_unlock(&ub->mutex);
5068 	return ret;
5069 }
5070 
5071 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
5072 		const struct ublksrv_ctrl_cmd *header)
5073 {
5074 	int ublksrv_pid = (int)header->data[0];
5075 	int ret = -EINVAL;
5076 
5077 	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
5078 		 header->dev_id);
5079 
5080 	if (wait_for_completion_interruptible(&ub->completion))
5081 		return -EINTR;
5082 
5083 	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
5084 		 header->dev_id);
5085 
5086 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
5087 		return -EINVAL;
5088 
5089 	mutex_lock(&ub->mutex);
5090 	if (ublk_nosrv_should_stop_dev(ub))
5091 		goto out_unlock;
5092 
5093 	if (!ublk_dev_in_recoverable_state(ub)) {
5094 		ret = -EBUSY;
5095 		goto out_unlock;
5096 	}
5097 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
5098 	ub->dev_info.state = UBLK_S_DEV_LIVE;
5099 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
5100 			__func__, ublksrv_pid, header->dev_id);
5101 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
5102 	ret = 0;
5103  out_unlock:
5104 	mutex_unlock(&ub->mutex);
5105 	return ret;
5106 }
5107 
5108 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5109 {
5110 	void __user *argp = (void __user *)(unsigned long)header->addr;
5111 	u64 features = UBLK_F_ALL;
5112 
5113 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
5114 		return -EINVAL;
5115 
5116 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5117 		return -EFAULT;
5118 
5119 	return 0;
5120 }
5121 
5122 static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5123 {
5124 	struct ublk_param_basic *p = &ub->params.basic;
5125 	u64 new_size = header->data[0];
5126 	int ret = 0;
5127 
5128 	mutex_lock(&ub->mutex);
5129 	if (!ub->ub_disk) {
5130 		ret = -ENODEV;
5131 		goto out;
5132 	}
5133 	p->dev_sectors = new_size;
5134 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5135 out:
5136 	mutex_unlock(&ub->mutex);
5137 	return ret;
5138 }
5139 
5140 struct count_busy {
5141 	const struct ublk_queue *ubq;
5142 	unsigned int nr_busy;
5143 };
5144 
5145 static bool ublk_count_busy_req(struct request *rq, void *data)
5146 {
5147 	struct count_busy *idle = data;
5148 
5149 	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5150 		idle->nr_busy += 1;
5151 	return true;
5152 }
5153 
5154 /* uring_cmd is guaranteed to be active if the associated request is idle */
5155 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5156 {
5157 	struct count_busy data = {
5158 		.ubq = ubq,
5159 	};
5160 
5161 	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5162 	return data.nr_busy < ubq->q_depth;
5163 }
5164 
5165 /* Wait until each hw queue has at least one idle IO */
5166 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5167 				 unsigned int timeout_ms)
5168 {
5169 	unsigned int elapsed = 0;
5170 	int ret;
5171 
5172 	/*
5173 	 * For UBLK_F_BATCH_IO the ublk server can be notified via an existing
5174 	 * or a new fetch command, so there is no need to wait here
5175 	 */
5176 	if (ublk_dev_support_batch_io(ub))
5177 		return 0;
5178 
5179 	while (elapsed < timeout_ms && !signal_pending(current)) {
5180 		unsigned int queues_cancelable = 0;
5181 		int i;
5182 
5183 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5184 			struct ublk_queue *ubq = ublk_get_queue(ub, i);
5185 
5186 			queues_cancelable += !!ubq_has_idle_io(ubq);
5187 		}
5188 
5189 		/*
5190 		 * Each queue needs at least one active command for
5191 		 * notifying ublk server
5192 		 */
5193 		if (queues_cancelable == ub->dev_info.nr_hw_queues)
5194 			break;
5195 
5196 		msleep(UBLK_REQUEUE_DELAY_MS);
5197 		elapsed += UBLK_REQUEUE_DELAY_MS;
5198 	}
5199 
5200 	if (signal_pending(current))
5201 		ret = -EINTR;
5202 	else if (elapsed >= timeout_ms)
5203 		ret = -EBUSY;
5204 	else
5205 		ret = 0;
5206 
5207 	return ret;
5208 }
5209 
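/*
 * UBLK_U_CMD_QUIESCE_DEV: mark the device as canceling while the queue is
 * quiesced, wait until every hw queue has at least one idle io (bounded by
 * the caller-supplied timeout, 0 meaning forever), then cancel the pending
 * uring_cmds.
 */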
5210 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5211 				 const struct ublksrv_ctrl_cmd *header)
5212 {
5213 	/* zero means wait forever */
5214 	u64 timeout_ms = header->data[0];
5215 	struct gendisk *disk;
5216 	int ret = -ENODEV;
5217 
5218 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5219 		return -EOPNOTSUPP;
5220 
5221 	mutex_lock(&ub->mutex);
5222 	disk = ublk_get_disk(ub);
5223 	if (!disk)
5224 		goto unlock;
5225 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5226 		goto put_disk;
5227 
5228 	ret = 0;
5229 	/* already in expected state */
5230 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5231 		goto put_disk;
5232 
5233 	/* Mark the device as canceling */
5234 	mutex_lock(&ub->cancel_mutex);
5235 	blk_mq_quiesce_queue(disk->queue);
5236 	ublk_set_canceling(ub, true);
5237 	blk_mq_unquiesce_queue(disk->queue);
5238 	mutex_unlock(&ub->cancel_mutex);
5239 
5240 	if (!timeout_ms)
5241 		timeout_ms = UINT_MAX;
5242 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5243 
5244 put_disk:
5245 	ublk_put_disk(disk);
5246 unlock:
5247 	mutex_unlock(&ub->mutex);
5248 
5249 	/* Cancel pending uring_cmd */
5250 	if (!ret)
5251 		ublk_cancel_dev(ub);
5252 	return ret;
5253 }
5254 
5255 /*
5256  * All control commands are sent via /dev/ublk-control, so we have to check
5257  * the destination device's permission
5258  */
5259 static int ublk_char_dev_permission(struct ublk_device *ub,
5260 		const char *dev_path, int mask)
5261 {
5262 	int err;
5263 	struct path path;
5264 	struct kstat stat;
5265 
5266 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5267 	if (err)
5268 		return err;
5269 
5270 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5271 	if (err)
5272 		goto exit;
5273 
5274 	err = -EPERM;
5275 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5276 		goto exit;
5277 
5278 	err = inode_permission(&nop_mnt_idmap,
5279 			d_backing_inode(path.dentry), mask);
5280 exit:
5281 	path_put(&path);
5282 	return err;
5283 }
5284 
5285 /*
5286  * Lock for maple tree modification: acquire ub->mutex, then freeze the
5287  * queue if the device is started. If the device is not yet started, only
5288  * the mutex is needed since no I/O path can access the tree.
5289  *
5290  * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5291  * already holds ub->mutex when calling del_gendisk(), which freezes the queue.
5292  */
5293 static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
5294 {
5295 	unsigned int memflags = 0;
5296 
5297 	mutex_lock(&ub->mutex);
5298 	if (ub->ub_disk)
5299 		memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
5300 
5301 	return memflags;
5302 }
5303 
5304 static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
5305 {
5306 	if (ub->ub_disk)
5307 		blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
5308 	mutex_unlock(&ub->mutex);
5309 }
5310 
5311 /* Erase coalesced PFN ranges from the maple tree matching buf_index */
5312 static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
5313 {
5314 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5315 	struct ublk_buf_range *range;
5316 
5317 	mas_lock(&mas);
5318 	mas_for_each(&mas, range, ULONG_MAX) {
5319 		if (range->buf_index == buf_index) {
5320 			mas_erase(&mas);
5321 			kfree(range);
5322 		}
5323 	}
5324 	mas_unlock(&mas);
5325 }
5326 
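/*
 * Insert the pinned pages into the PFN maple tree, coalescing runs of
 * consecutive PFNs into single ranges tagged with @index; on failure every
 * range already added for this buffer index is erased again.
 */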
5327 static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
5328 			       struct page **pages, unsigned long nr_pages,
5329 			       int index, unsigned short flags)
5330 {
5331 	unsigned long i;
5332 	int ret;
5333 
5334 	for (i = 0; i < nr_pages; i++) {
5335 		unsigned long pfn = page_to_pfn(pages[i]);
5336 		unsigned long start = i;
5337 		struct ublk_buf_range *range;
5338 
5339 		/* Find run of consecutive PFNs */
5340 		while (i + 1 < nr_pages &&
5341 		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
5342 			i++;
5343 
5344 		range = kzalloc(sizeof(*range), GFP_KERNEL);
5345 		if (!range) {
5346 			ret = -ENOMEM;
5347 			goto unwind;
5348 		}
5349 		range->buf_index = index;
5350 		range->flags = flags;
5351 		range->base_offset = start << PAGE_SHIFT;
5352 
5353 		ret = mtree_insert_range(&ub->buf_tree, pfn,
5354 					 pfn + (i - start),
5355 					 range, GFP_KERNEL);
5356 		if (ret) {
5357 			kfree(range);
5358 			goto unwind;
5359 		}
5360 	}
5361 	return 0;
5362 
5363 unwind:
5364 	ublk_buf_erase_ranges(ub, index);
5365 	return ret;
5366 }
5367 
5368 /*
5369  * Register a shared memory buffer for zero-copy I/O.
5370  * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
5371  * internally. Returns buffer index (>= 0) on success.
5372  */
5373 static int ublk_ctrl_reg_buf(struct ublk_device *ub,
5374 			     struct ublksrv_ctrl_cmd *header)
5375 {
5376 	void __user *argp = (void __user *)(unsigned long)header->addr;
5377 	struct ublk_shmem_buf_reg buf_reg;
5378 	unsigned long nr_pages;
5379 	struct page **pages = NULL;
5380 	unsigned int gup_flags;
5381 	unsigned int memflags;
5382 	long pinned;
5383 	int index;
5384 	int ret;
5385 
5386 	if (!ublk_dev_support_shmem_zc(ub))
5387 		return -EOPNOTSUPP;
5388 
5389 	memset(&buf_reg, 0, sizeof(buf_reg));
5390 	if (copy_from_user(&buf_reg, argp,
5391 			   min_t(size_t, header->len, sizeof(buf_reg))))
5392 		return -EFAULT;
5393 
5394 	if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
5395 		return -EINVAL;
5396 
5397 	if (buf_reg.reserved)
5398 		return -EINVAL;
5399 
5400 	if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
5401 	    !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
5402 		return -EINVAL;
5403 
5404 	nr_pages = buf_reg.len >> PAGE_SHIFT;
5405 
5406 	/* Pin pages before any locks (may sleep) */
5407 	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
5408 	if (!pages)
5409 		return -ENOMEM;
5410 
5411 	gup_flags = FOLL_LONGTERM;
5412 	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
5413 		gup_flags |= FOLL_WRITE;
5414 
5415 	pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
5416 	if (pinned < 0) {
5417 		ret = pinned;
5418 		goto err_free_pages;
5419 	}
5420 	if (pinned != nr_pages) {
5421 		ret = -EFAULT;
5422 		goto err_unpin;
5423 	}
5424 
5425 	memflags = ublk_lock_buf_tree(ub);
5426 
5427 	index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
5428 	if (index < 0) {
5429 		ret = index;
5430 		goto err_unlock;
5431 	}
5432 
5433 	ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
5434 	if (ret) {
5435 		ida_free(&ub->buf_ida, index);
5436 		goto err_unlock;
5437 	}
5438 
5439 	ublk_unlock_buf_tree(ub, memflags);
5440 	kvfree(pages);
5441 	return index;
5442 
5443 err_unlock:
5444 	ublk_unlock_buf_tree(ub, memflags);
5445 err_unpin:
5446 	unpin_user_pages(pages, pinned);
5447 err_free_pages:
5448 	kvfree(pages);
5449 	return ret;
5450 }
5451 
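/* Unpin a physically contiguous PFN range in small batches */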
5452 static void ublk_unpin_range_pages(unsigned long base_pfn,
5453 				   unsigned long nr_pages)
5454 {
5455 #define UBLK_UNPIN_BATCH	32
5456 	struct page *pages[UBLK_UNPIN_BATCH];
5457 	unsigned long off;
5458 
5459 	for (off = 0; off < nr_pages; ) {
5460 		unsigned int batch = min_t(unsigned long,
5461 					   nr_pages - off, UBLK_UNPIN_BATCH);
5462 		unsigned int j;
5463 
5464 		for (j = 0; j < batch; j++)
5465 			pages[j] = pfn_to_page(base_pfn + off + j);
5466 		unpin_user_pages(pages, batch);
5467 		off += batch;
5468 	}
5469 }
5470 
5471 /*
5472  * Inner loop: erase and free up to UBLK_REMOVE_BATCH matching ranges under
5473  * mas_lock, collecting their PFN extents into an xarray. Then drop the lock
5474  * and unpin the pages outside spinlock context.
5475  *
5476  * Returns true if the tree walk completed, false if more ranges remain.
5477  * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value().
5478  */
5479 #define UBLK_REMOVE_BATCH	64
5480 
5481 static bool __ublk_shmem_remove_ranges(struct ublk_device *ub,
5482 					int buf_index, int *ret)
5483 {
5484 	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
5485 	struct ublk_buf_range *range;
5486 	struct xarray to_unpin;
5487 	unsigned long idx;
5488 	unsigned int count = 0;
5489 	bool done = false;
5490 	void *entry;
5491 
5492 	xa_init(&to_unpin);
5493 
5494 	mas_lock(&mas);
5495 	mas_for_each(&mas, range, ULONG_MAX) {
5496 		unsigned long nr;
5497 
5498 		if (buf_index >= 0 && range->buf_index != buf_index)
5499 			continue;
5500 
5501 		*ret = 0;
5502 		nr = mas.last - mas.index + 1;
5503 		if (xa_err(xa_store(&to_unpin, mas.index,
5504 				    xa_mk_value(nr), GFP_ATOMIC)))
5505 			goto unlock;
5506 		mas_erase(&mas);
5507 		kfree(range);
5508 		if (++count >= UBLK_REMOVE_BATCH)
5509 			goto unlock;
5510 	}
5511 	done = true;
5512 unlock:
5513 	mas_unlock(&mas);
5514 
5515 	xa_for_each(&to_unpin, idx, entry)
5516 		ublk_unpin_range_pages(idx, xa_to_value(entry));
5517 	xa_destroy(&to_unpin);
5518 
5519 	return done;
5520 }
5521 
5522 /*
5523  * Remove ranges from the maple tree matching buf_index, unpin pages
5524  * and free range structs. If buf_index < 0, remove all ranges.
5525  * Processes ranges in batches to avoid holding the maple tree spinlock
5526  * across potentially expensive page unpinning.
5527  */
5528 static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index)
5529 {
5530 	int ret = -ENOENT;
5531 
5532 	while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret))
5533 		cond_resched();
5534 	return ret;
5535 }
5536 
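/*
 * UBLK_U_CMD_UNREG_BUF: remove all ranges registered under the given buffer
 * index, unpin their pages and return the index to the ida.
 */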
5537 static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
5538 			       struct ublksrv_ctrl_cmd *header)
5539 {
5540 	int index = (int)header->data[0];
5541 	unsigned int memflags;
5542 	int ret;
5543 
5544 	if (!ublk_dev_support_shmem_zc(ub))
5545 		return -EOPNOTSUPP;
5546 
5547 	if (index < 0 || index > USHRT_MAX)
5548 		return -EINVAL;
5549 
5550 	memflags = ublk_lock_buf_tree(ub);
5551 
5552 	ret = ublk_shmem_remove_ranges(ub, index);
5553 	if (!ret)
5554 		ida_free(&ub->buf_ida, index);
5555 
5556 	ublk_unlock_buf_tree(ub, memflags);
5557 	return ret;
5558 }
5559 
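/* Drop every registered buffer range and tear down the lookup structures */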
5560 static void ublk_buf_cleanup(struct ublk_device *ub)
5561 {
5562 	ublk_shmem_remove_ranges(ub, -1);
5563 	mtree_destroy(&ub->buf_tree);
5564 	ida_destroy(&ub->buf_ida);
5565 }
5566 
/*
 * Check whether all of the request's pages map into ranges registered for a
 * single shared memory buffer: every bvec must fall inside a registered
 * range, all ranges must carry the same buffer index, and the resulting
 * buffer offsets must be contiguous across the whole request.  On success
 * *buf_idx and *buf_off identify the buffer and the offset of the request's
 * first byte within it.
 */
5568 static bool ublk_try_buf_match(struct ublk_device *ub,
5569 				   struct request *rq,
5570 				   u32 *buf_idx, u32 *buf_off)
5571 {
5572 	struct req_iterator iter;
5573 	struct bio_vec bv;
5574 	int index = -1;
5575 	unsigned long expected_offset = 0;
5576 	bool first = true;
5577 
5578 	rq_for_each_bvec(bv, rq, iter) {
5579 		unsigned long pfn = page_to_pfn(bv.bv_page);
5580 		unsigned long end_pfn = pfn +
5581 			((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
5582 		struct ublk_buf_range *range;
5583 		unsigned long off;
5584 		MA_STATE(mas, &ub->buf_tree, pfn, pfn);
5585 
5586 		range = mas_walk(&mas);
5587 		if (!range)
5588 			return false;
5589 
5590 		/* verify all pages in this bvec fall within the range */
5591 		if (end_pfn > mas.last)
5592 			return false;
5593 
5594 		off = range->base_offset +
5595 			(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
5596 
5597 		if (first) {
			/*
			 * A read-only buffer can't serve a READ: the
			 * kernel would have to write into it.
			 */
5599 			if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
5600 			    req_op(rq) != REQ_OP_WRITE)
5601 				return false;
5602 			index = range->buf_index;
5603 			expected_offset = off;
5604 			*buf_off = off;
5605 			first = false;
5606 		} else {
5607 			if (range->buf_index != index)
5608 				return false;
5609 			if (off != expected_offset)
5610 				return false;
5611 		}
5612 		expected_offset += bv.bv_len;
5613 	}
5614 
5615 	if (first)
5616 		return false;
5617 
5618 	*buf_idx = index;
5619 	return true;
5620 }
5621 
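/*
 * Permission check for control commands.  Privileged devices require
 * CAP_SYS_ADMIN.  Unprivileged devices (and GET_DEV_INFO2, which always
 * carries the char device path) are checked against the caller's access
 * rights on the ublk char device named in the command payload, using a
 * command-specific MAY_READ/MAY_WRITE mask.
 */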
5622 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5623 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5624 {
5625 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5626 	void __user *argp = (void __user *)(unsigned long)header->addr;
5627 	char *dev_path = NULL;
5628 	int ret = 0;
5629 	int mask;
5630 
5631 	if (!unprivileged) {
5632 		if (!capable(CAP_SYS_ADMIN))
5633 			return -EPERM;
5634 		/*
		 * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
		 * char_dev_path in its payload too, since userspace may not
		 * know whether the specified device was created in
		 * unprivileged mode.
5639 		 */
5640 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5641 			return 0;
5642 	}
5643 
5644 	/*
	 * The user has to provide the char device path for an unprivileged
	 * ublk device.
	 *
	 * header->addr always points to the dev path buffer, and
	 * header->dev_path_len records the length of that buffer.
5649 	 */
5650 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5651 		return -EINVAL;
5652 
5653 	if (header->len < header->dev_path_len)
5654 		return -EINVAL;
5655 
5656 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5657 	if (IS_ERR(dev_path))
5658 		return PTR_ERR(dev_path);
5659 
5660 	ret = -EINVAL;
5661 	switch (_IOC_NR(cmd_op)) {
5662 	case UBLK_CMD_GET_DEV_INFO:
5663 	case UBLK_CMD_GET_DEV_INFO2:
5664 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5665 	case UBLK_CMD_GET_PARAMS:
5666 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5667 		mask = MAY_READ;
5668 		break;
5669 	case UBLK_CMD_START_DEV:
5670 	case UBLK_CMD_STOP_DEV:
5671 	case UBLK_CMD_ADD_DEV:
5672 	case UBLK_CMD_DEL_DEV:
5673 	case UBLK_CMD_SET_PARAMS:
5674 	case UBLK_CMD_START_USER_RECOVERY:
5675 	case UBLK_CMD_END_USER_RECOVERY:
5676 	case UBLK_CMD_UPDATE_SIZE:
5677 	case UBLK_CMD_QUIESCE_DEV:
5678 	case UBLK_CMD_TRY_STOP_DEV:
5679 	case UBLK_CMD_REG_BUF:
5680 	case UBLK_CMD_UNREG_BUF:
5681 		mask = MAY_READ | MAY_WRITE;
5682 		break;
5683 	default:
5684 		goto exit;
5685 	}
5686 
5687 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5688 	if (!ret) {
5689 		header->len -= header->dev_path_len;
5690 		header->addr += header->dev_path_len;
5691 	}
5692 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5693 			__func__, ub->ub_number, cmd_op,
5694 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5695 			dev_path, ret);
5696 exit:
5697 	kfree(dev_path);
5698 	return ret;
5699 }
5700 
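/*
 * Query-only commands never need to sleep; everything else may block, so
 * the caller returns -EAGAIN for those when issued with IO_URING_F_NONBLOCK.
 */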
5701 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5702 {
5703 	switch (_IOC_NR(cmd_op)) {
5704 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5705 	case UBLK_CMD_GET_DEV_INFO:
5706 	case UBLK_CMD_GET_DEV_INFO2:
5707 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5708 		return false;
5709 	default:
5710 		return true;
5711 	}
5712 }
5713 
5714 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5715 		unsigned int issue_flags)
5716 {
	/* may point to userspace-mapped memory, hence the READ_ONCE() copies below */
5718 	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5719 								    struct ublksrv_ctrl_cmd);
5720 	struct ublksrv_ctrl_cmd header;
5721 	struct ublk_device *ub = NULL;
5722 	u32 cmd_op = cmd->cmd_op;
5723 	int ret = -EINVAL;
5724 
5725 	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5726 	    issue_flags & IO_URING_F_NONBLOCK)
5727 		return -EAGAIN;
5728 
5729 	if (!(issue_flags & IO_URING_F_SQE128))
5730 		return -EINVAL;
5731 
5732 	header.dev_id = READ_ONCE(ub_src->dev_id);
5733 	header.queue_id = READ_ONCE(ub_src->queue_id);
5734 	header.len = READ_ONCE(ub_src->len);
5735 	header.addr = READ_ONCE(ub_src->addr);
5736 	header.data[0] = READ_ONCE(ub_src->data[0]);
5737 	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5738 	ublk_ctrl_cmd_dump(cmd_op, &header);
5739 
5740 	ret = ublk_check_cmd_op(cmd_op);
5741 	if (ret)
5742 		goto out;
5743 
5744 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5745 		ret = ublk_ctrl_get_features(&header);
5746 		goto out;
5747 	}
5748 
5749 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5750 		ret = -ENODEV;
5751 		ub = ublk_get_device_from_id(header.dev_id);
5752 		if (!ub)
5753 			goto out;
5754 
5755 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5756 		if (ret)
5757 			goto put_dev;
5758 	}
5759 
5760 	switch (_IOC_NR(cmd_op)) {
5761 	case UBLK_CMD_START_DEV:
5762 		ret = ublk_ctrl_start_dev(ub, &header);
5763 		break;
5764 	case UBLK_CMD_STOP_DEV:
5765 		ublk_ctrl_stop_dev(ub);
5766 		ret = 0;
5767 		break;
5768 	case UBLK_CMD_GET_DEV_INFO:
5769 	case UBLK_CMD_GET_DEV_INFO2:
5770 		ret = ublk_ctrl_get_dev_info(ub, &header);
5771 		break;
5772 	case UBLK_CMD_ADD_DEV:
5773 		ret = ublk_ctrl_add_dev(&header);
5774 		break;
5775 	case UBLK_CMD_DEL_DEV:
5776 		ret = ublk_ctrl_del_dev(&ub, true);
5777 		break;
5778 	case UBLK_CMD_DEL_DEV_ASYNC:
5779 		ret = ublk_ctrl_del_dev(&ub, false);
5780 		break;
5781 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5782 		ret = ublk_ctrl_get_queue_affinity(ub, &header);
5783 		break;
5784 	case UBLK_CMD_GET_PARAMS:
5785 		ret = ublk_ctrl_get_params(ub, &header);
5786 		break;
5787 	case UBLK_CMD_SET_PARAMS:
5788 		ret = ublk_ctrl_set_params(ub, &header);
5789 		break;
5790 	case UBLK_CMD_START_USER_RECOVERY:
5791 		ret = ublk_ctrl_start_recovery(ub);
5792 		break;
5793 	case UBLK_CMD_END_USER_RECOVERY:
5794 		ret = ublk_ctrl_end_recovery(ub, &header);
5795 		break;
5796 	case UBLK_CMD_UPDATE_SIZE:
5797 		ret = ublk_ctrl_set_size(ub, &header);
5798 		break;
5799 	case UBLK_CMD_QUIESCE_DEV:
5800 		ret = ublk_ctrl_quiesce_dev(ub, &header);
5801 		break;
5802 	case UBLK_CMD_TRY_STOP_DEV:
5803 		ret = ublk_ctrl_try_stop_dev(ub);
5804 		break;
5805 	case UBLK_CMD_REG_BUF:
5806 		ret = ublk_ctrl_reg_buf(ub, &header);
5807 		break;
5808 	case UBLK_CMD_UNREG_BUF:
5809 		ret = ublk_ctrl_unreg_buf(ub, &header);
5810 		break;
5811 	default:
5812 		ret = -EOPNOTSUPP;
5813 		break;
5814 	}
5815 
5816  put_dev:
5817 	if (ub)
5818 		ublk_put_device(ub);
5819  out:
5820 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5821 			__func__, ret, cmd_op, header.dev_id, header.queue_id);
5822 	return ret;
5823 }
5824 
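/*
 * Illustrative only (not part of this driver): userspace issues a control
 * command as an IORING_OP_URING_CMD on /dev/ublk-control, using a ring
 * created with IORING_SETUP_SQE128, with the ublksrv_ctrl_cmd placed in the
 * SQE's 80-byte command area, roughly:
 *
 *	struct ublksrv_ctrl_dev_info info;
 *	struct ublksrv_ctrl_cmd data = {
 *		.dev_id = dev_id,
 *		.addr   = (__u64)(uintptr_t)&info,
 *		.len    = sizeof(info),
 *	};
 *
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd     = ctrl_fd;           // open("/dev/ublk-control", O_RDWR)
 *	sqe->cmd_op = UBLK_U_CMD_GET_DEV_INFO;
 *	memcpy(sqe->cmd, &data, sizeof(data));
 */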
5825 static const struct file_operations ublk_ctl_fops = {
5826 	.open		= nonseekable_open,
5827 	.uring_cmd      = ublk_ctrl_uring_cmd,
5828 	.owner		= THIS_MODULE,
5829 	.llseek		= noop_llseek,
5830 };
5831 
5832 static struct miscdevice ublk_misc = {
5833 	.minor		= MISC_DYNAMIC_MINOR,
5834 	.name		= "ublk-control",
5835 	.fops		= &ublk_ctl_fops,
5836 };
5837 
5838 static int __init ublk_init(void)
5839 {
5840 	int ret;
5841 
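	/* the IO buffer offset window must not overflow 64 bits */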
5842 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5843 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5844 	/*
5845 	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5846 	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5847 	 */
5848 	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5849 		     UBLKSRV_IO_INTEGRITY_FLAG);
5850 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5851 
5852 	init_waitqueue_head(&ublk_idr_wq);
5853 
5854 	ret = misc_register(&ublk_misc);
5855 	if (ret)
5856 		return ret;
5857 
5858 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5859 	if (ret)
5860 		goto unregister_mis;
5861 
5862 	ret = class_register(&ublk_chr_class);
5863 	if (ret)
5864 		goto free_chrdev_region;
5865 
5866 	return 0;
5867 
5868 free_chrdev_region:
5869 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5870 unregister_mis:
5871 	misc_deregister(&ublk_misc);
5872 	return ret;
5873 }
5874 
5875 static void __exit ublk_exit(void)
5876 {
5877 	struct ublk_device *ub;
5878 	int id;
5879 
5880 	idr_for_each_entry(&ublk_index_idr, ub, id)
5881 		ublk_remove(ub);
5882 
5883 	class_unregister(&ublk_chr_class);
5884 	misc_deregister(&ublk_misc);
5885 
5886 	idr_destroy(&ublk_index_idr);
5887 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5888 }
5889 
5890 module_init(ublk_init);
5891 module_exit(ublk_exit);
5892 
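/*
 * Handlers for the "ublks_max" module parameter; values outside
 * [0, UBLK_MAX_UBLKS] are rejected.
 */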
5893 static int ublk_set_max_unprivileged_ublks(const char *buf,
5894 					   const struct kernel_param *kp)
5895 {
5896 	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5897 }
5898 
5899 static int ublk_get_max_unprivileged_ublks(char *buf,
5900 					   const struct kernel_param *kp)
5901 {
5902 	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5903 }
5904 
5905 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5906 	.set = ublk_set_max_unprivileged_ublks,
5907 	.get = ublk_get_max_unprivileged_ublks,
5908 };
5909 
5910 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5911 		&unprivileged_ublks_max, 0644);
MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to be added (default: 64)");
5913 
5914 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5915 MODULE_DESCRIPTION("Userspace block device");
5916 MODULE_LICENSE("GPL");
5917