xref: /linux/drivers/block/ublk_drv.c (revision 7fe6ac157b7e15c8976bd62ad7cb98e248884e83)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - block device which IO is handled from userspace
4  *
5  * Take full use of io_uring passthrough command for communicating with
6  * ublk userspace daemon(ublksrvd) for handling basic IO request.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <linux/maple_tree.h>
50 #include <linux/xarray.h>
51 #include <uapi/linux/fs.h>
52 #include <uapi/linux/ublk_cmd.h>
53 
#define UBLK_MINORS		(1U << MINORBITS)

/* sentinel meaning "no io_uring buffer index assigned" */
#define UBLK_INVALID_BUF_IDX	((u16)-1)

/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
#define UBLK_CMD_REG_BUF	_IOC_NR(UBLK_U_CMD_REG_BUF)
#define UBLK_CMD_UNREG_BUF	_IOC_NR(UBLK_U_CMD_UNREG_BUF)

/* Default max shmem buffer size: 4GB (may be increased in future) */
#define UBLK_SHMEM_BUF_SIZE_MAX	(1ULL << 32)

#define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)

/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
		| UBLK_F_URING_CMD_COMP_IN_TASK \
		| UBLK_F_NEED_GET_DATA \
		| UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED \
		| UBLK_F_USER_RECOVERY_FAIL_IO \
		| UBLK_F_UPDATE_SIZE \
		| UBLK_F_AUTO_BUF_REG \
		| UBLK_F_QUIESCE \
		| UBLK_F_PER_IO_DAEMON \
		| UBLK_F_BUF_REG_OFF_DAEMON \
		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
		| UBLK_F_SAFE_STOP_DEV \
		| UBLK_F_BATCH_IO \
		| UBLK_F_NO_AUTO_PART_SCAN \
		| UBLK_F_SHMEM_ZC)

/* flags which select one of the mutually-exclusive recovery behaviors */
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_USER_RECOVERY_FAIL_IO)

/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL                                \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
	 UBLK_PARAM_TYPE_INTEGRITY)

/* all valid flags of struct ublk_batch_io */
#define UBLK_BATCH_F_ALL  \
	(UBLK_BATCH_F_HAS_ZONE_LBA | \
	 UBLK_BATCH_F_HAS_BUF_ADDR | \
	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
109 
/* ublk batch fetch uring_cmd */
struct ublk_batch_fetch_cmd {
	/* links this fetch command into ublk_queue->fcmd_head */
	struct list_head node;
	/* the multishot uring_cmd that CQEs are posted against */
	struct io_uring_cmd *cmd;
	/* io_uring provided-buffer group sampled from sqe->buf_index */
	unsigned short buf_group;
};
116 
/* Per-uring_cmd private data, stored inside the uring_cmd payload area */
struct ublk_uring_cmd_pdu {
	/*
	 * Store requests in same batch temporarily for queuing them to
	 * daemon context.
	 *
	 * It should have been stored to request payload, but we do want
	 * to avoid extra pre-allocation, and uring_cmd payload is always
	 * free for us
	 */
	union {
		struct request *req;
		struct request *req_list;
	};

	/*
	 * The following two are valid in this cmd whole lifetime, and
	 * setup in ublk uring_cmd handler
	 */
	struct ublk_queue *ubq;

	union {
		u16 tag;
		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
	};
};
142 
/* Bundles everything a batch uring_cmd handler needs while it runs */
struct ublk_batch_io_data {
	struct ublk_device *ub;
	struct io_uring_cmd *cmd;
	/* copy of the batch command header taken from the SQE */
	struct ublk_batch_io header;
	unsigned int issue_flags;
	/* completion batch for request completion, may be NULL */
	struct io_comp_batch *iob;
};
150 
/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by ublk driver, and waited
 * for incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command will be completed, and owned by
 * ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
 * cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
 * get data buffer address from ublksrv.
 *
 * Then, bio data could be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 *
 * NOTE(review): bit 0x04 is intentionally unused here — confirm it is
 * reserved before reusing it.
 */
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08

/*
 * request buffer is registered automatically, so we have to unregister it
 * before completing this request.
 *
 * io_uring will unregister buffer automatically for us during exiting.
 */
#define UBLK_IO_FLAG_AUTO_BUF_REG	0x10

/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED	0x80000000

/*
 * Initialize refcount to a large number to include any registered buffers.
 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
 * any buffers registered on the io daemon task.
 */
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)

/* used for UBLK_F_BATCH_IO only */
#define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
200 
/*
 * Per-io buffer description: either a plain userspace buffer address or
 * the auto buffer registration descriptor (UBLK_F_AUTO_BUF_REG).
 */
union ublk_io_buf {
	__u64	addr;
	struct ublk_auto_buf_reg auto_reg;
};
205 
/* Per-tag io state; one instance per queue depth slot */
struct ublk_io {
	union ublk_io_buf buf;
	/* UBLK_IO_FLAG_* bits */
	unsigned int flags;
	/* io result to be committed back to the block layer */
	int res;

	union {
		/* valid if UBLK_IO_FLAG_ACTIVE is set */
		struct io_uring_cmd *cmd;
		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
		struct request *req;
	};

	/* daemon task handling this io */
	struct task_struct *task;

	/*
	 * The number of uses of this I/O by the ublk server
	 * if user copy or zero copy are enabled:
	 * - UBLK_REFCOUNT_INIT from dispatch to the server
	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
	 * - 1 for each io_uring registered buffer not registered on task
	 * The I/O can only be completed once all references are dropped.
	 * User copy and buffer registration operations are only permitted
	 * if the reference count is nonzero.
	 */
	refcount_t ref;
	/* Count of buffers registered on task and not yet unregistered */
	unsigned task_registered_buffers;

	/* opaque io_uring buffer-table context — see registration paths */
	void *buf_ctx_handle;
	/* protects this ublk_io's mutable state */
	spinlock_t lock;
} ____cacheline_aligned_in_smp;
238 
239 struct ublk_queue {
240 	int q_id;
241 	int q_depth;
242 
243 	unsigned long flags;
244 	struct ublksrv_io_desc *io_cmd_buf;
245 
246 	bool force_abort;
247 	bool canceling;
248 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
249 	spinlock_t		cancel_lock;
250 	struct ublk_device *dev;
251 	u32 nr_io_ready;
252 
253 	/*
254 	 * For supporting UBLK_F_BATCH_IO only.
255 	 *
256 	 * Inflight ublk request tag is saved in this fifo
257 	 *
258 	 * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(),
259 	 * so lock is required for storing request tag to fifo
260 	 *
261 	 * Make sure just one reader for fetching request from task work
262 	 * function to ublk server, so no need to grab the lock in reader
263 	 * side.
264 	 *
265 	 * Batch I/O State Management:
266 	 *
267 	 * The batch I/O system uses implicit state management based on the
268 	 * combination of three key variables below.
269 	 *
270 	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
271 	 *   No fetch commands available, events queue in evts_fifo
272 	 *
273 	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
274 	 *   Fetch commands available but none processing events
275 	 *
276 	 * - ACTIVE: active_fcmd
277 	 *   One fetch command actively processing events from evts_fifo
278 	 *
279 	 * Key Invariants:
280 	 * - At most one active_fcmd at any time (single reader)
281 	 * - active_fcmd is always from fcmd_head list when non-NULL
282 	 * - evts_fifo can be read locklessly by the single active reader
283 	 * - All state transitions require evts_lock protection
284 	 * - Multiple writers to evts_fifo require lock protection
285 	 */
286 	struct {
287 		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
288 		spinlock_t evts_lock;
289 
290 		/* List of fetch commands available to process events */
291 		struct list_head fcmd_head;
292 
293 		/* Currently active fetch command (NULL = none active) */
294 		struct ublk_batch_fetch_cmd  *active_fcmd;
295 	}____cacheline_aligned_in_smp;
296 
297 	struct ublk_io ios[] __counted_by(q_depth);
298 };
299 
/* Maple tree value: maps a PFN range to buffer location */
struct ublk_buf_range {
	/* io_uring buffer table index backing this range */
	unsigned short buf_index;
	unsigned short flags;
	unsigned int base_offset;	/* byte offset within buffer */
};
306 
307 struct ublk_device {
308 	struct gendisk		*ub_disk;
309 
310 	struct ublksrv_ctrl_dev_info	dev_info;
311 
312 	struct blk_mq_tag_set	tag_set;
313 
314 	struct cdev		cdev;
315 	struct device		cdev_dev;
316 
317 #define UB_STATE_OPEN		0
318 #define UB_STATE_USED		1
319 #define UB_STATE_DELETED	2
320 	unsigned long		state;
321 	int			ub_number;
322 
323 	struct mutex		mutex;
324 
325 	spinlock_t		lock;
326 	struct mm_struct	*mm;
327 
328 	struct ublk_params	params;
329 
330 	struct completion	completion;
331 	u32			nr_queue_ready;
332 	bool 			unprivileged_daemons;
333 	struct mutex cancel_mutex;
334 	bool canceling;
335 	pid_t 	ublksrv_tgid;
336 	struct delayed_work	exit_work;
337 	struct work_struct	partition_scan_work;
338 
339 	bool			block_open; /* protected by open_mutex */
340 
341 	/* shared memory zero copy */
342 	struct maple_tree	buf_tree;
343 	struct ida		buf_ida;
344 
345 	struct ublk_queue       *queues[];
346 };
347 
/* header of ublk_params */
struct ublk_params_header {
	/* total length of the params blob supplied by userspace */
	__u32	len;
	/* bitmask of UBLK_PARAM_TYPE_* entries present */
	__u32	types;
};
353 
/* forward declarations for helpers defined later in this file */
static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
				  u32 *buf_idx, u32 *buf_off);
static void ublk_buf_cleanup(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		u16 q_id, u16 tag, struct ublk_io *io);
static inline unsigned int ublk_req_build_flags(struct request *req);
static void ublk_batch_dispatch(struct ublk_queue *ubq,
				const struct ublk_batch_io_data *data,
				struct ublk_batch_fetch_cmd *fcmd);
366 
367 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
368 {
369 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
370 }
371 
372 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
373 {
374 	return ubq->flags & UBLK_F_BATCH_IO;
375 }
376 
/* Acquire the per-io spinlock */
static inline void ublk_io_lock(struct ublk_io *io)
{
	spin_lock(&io->lock);
}
381 
/* Release the per-io spinlock */
static inline void ublk_io_unlock(struct ublk_io *io)
{
	spin_unlock(&io->lock);
}
386 
/*
 * Initialize the event queue: writer lock plus a kfifo of @size tags
 * allocated on @numa_node. Returns 0 or -ENOMEM from kfifo_alloc_node().
 */
static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
				    int numa_node)
{
	spin_lock_init(&q->evts_lock);
	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
}
394 
/* Check if event queue is empty */
static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
{
	return kfifo_is_empty(&q->evts_fifo);
}
400 
/* Free the event fifo; all queued events must have been consumed */
static inline void ublk_io_evts_deinit(struct ublk_queue *q)
{
	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
	kfifo_free(&q->evts_fifo);
}
406 
407 static inline struct ublksrv_io_desc *
408 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
409 {
410 	return &ubq->io_cmd_buf[tag];
411 }
412 
413 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
414 {
415 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
416 }
417 
418 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
419 {
420 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
421 }
422 
423 static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
424 {
425 	return ubq->flags & UBLK_F_SHMEM_ZC;
426 }
427 
428 static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
429 					unsigned int tag)
430 {
431 	return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
432 }
433 
434 static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
435 {
436 	return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
437 }
438 
439 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
440 {
441 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
442 }
443 
444 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
445 {
446 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
447 }
448 
449 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
450 {
451 	return ubq->flags & UBLK_F_USER_COPY;
452 }
453 
454 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
455 {
456 	return ub->dev_info.flags & UBLK_F_USER_COPY;
457 }
458 
459 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
460 {
461 	return ub->dev_info.flags & UBLK_F_ZONED;
462 }
463 
464 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
465 {
466 	return ubq->flags & UBLK_F_ZONED;
467 }
468 
469 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
470 {
471 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
472 }
473 
#ifdef CONFIG_BLK_DEV_ZONED

/* Parameters of a zone-report request, keyed per-request in the xarray */
struct ublk_zoned_report_desc {
	__u64 sector;
	__u32 operation;
	__u32 nr_zones;
};

/* request pointer -> ublk_zoned_report_desc, global across devices */
static DEFINE_XARRAY(ublk_zoned_report_descs);
483 
/* Attach @desc to @req; fails with -EBUSY if an entry already exists */
static int ublk_zoned_insert_report_desc(const struct request *req,
		struct ublk_zoned_report_desc *desc)
{
	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
			    desc, GFP_KERNEL);
}
490 
/* Detach and return the report descriptor for @req (NULL if none) */
static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
		const struct request *req)
{
	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
}
496 
/* Look up the report descriptor for @req without removing it */
static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
		const struct request *req)
{
	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
}
502 
/* Total zone count derived from device size and zone (chunk) size */
static int ublk_get_nr_zones(const struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	/* Zone size is a power of 2 */
	return p->dev_sectors >> ilog2(p->chunk_sectors);
}
510 
/* Re-scan zone layout of the gendisk after (re)configuration */
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return blk_revalidate_disk_zones(ub->ub_disk);
}
515 
516 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
517 {
518 	const struct ublk_param_zoned *p = &ub->params.zoned;
519 	int nr_zones;
520 
521 	if (!ublk_dev_is_zoned(ub))
522 		return -EINVAL;
523 
524 	if (!p->max_zone_append_sectors)
525 		return -EINVAL;
526 
527 	nr_zones = ublk_get_nr_zones(ub);
528 
529 	if (p->max_active_zones > nr_zones)
530 		return -EINVAL;
531 
532 	if (p->max_open_zones > nr_zones)
533 		return -EINVAL;
534 
535 	return 0;
536 }
537 
/* Publish the computed zone count on the gendisk */
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}
542 
543 /* Based on virtblk_alloc_report_buffer */
544 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
545 				      unsigned int nr_zones, size_t *buflen)
546 {
547 	struct request_queue *q = ublk->ub_disk->queue;
548 	size_t bufsize;
549 	void *buf;
550 
551 	nr_zones = min_t(unsigned int, nr_zones,
552 			 ublk->ub_disk->nr_zones);
553 
554 	bufsize = nr_zones * sizeof(struct blk_zone);
555 	bufsize =
556 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
557 
558 	while (bufsize >= sizeof(struct blk_zone)) {
559 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
560 		if (buf) {
561 			*buflen = bufsize;
562 			return buf;
563 		}
564 		bufsize >>= 1;
565 	}
566 
567 	*buflen = 0;
568 	return NULL;
569 }
570 
/*
 * Zone-report callback: forwards REPORT_ZONES to the ublk server in
 * buffer-sized chunks via REQ_OP_DRV_IN requests, then feeds each
 * returned zone to disk_report_zone().
 * Returns the number of reported zones or a negative errno.
 */
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
		      unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct ublk_device *ub = disk->private_data;
	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
	unsigned int done_zones = 0;
	unsigned int max_zones_per_request;
	int ret;
	struct blk_zone *buffer;
	size_t buffer_length;

	/* never report past the end of the disk */
	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
			 nr_zones);

	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
	if (!buffer)
		return -ENOMEM;

	max_zones_per_request = buffer_length / sizeof(struct blk_zone);

	while (done_zones < nr_zones) {
		unsigned int remaining_zones = nr_zones - done_zones;
		unsigned int zones_in_request =
			min_t(unsigned int, remaining_zones, max_zones_per_request);
		struct request *req;
		struct ublk_zoned_report_desc desc;
		blk_status_t status;

		memset(buffer, 0, buffer_length);

		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			goto out;
		}

		/* stash report parameters where ublk_setup_iod_zoned() finds them */
		desc.operation = UBLK_IO_OP_REPORT_ZONES;
		desc.sector = sector;
		desc.nr_zones = zones_in_request;
		ret = ublk_zoned_insert_report_desc(req, &desc);
		if (ret)
			goto free_req;

		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
		if (ret)
			goto erase_desc;

		status = blk_execute_rq(req, 0);
		ret = blk_status_to_errno(status);
erase_desc:
		ublk_zoned_erase_report_desc(req);
free_req:
		blk_mq_free_request(req);
		if (ret)
			goto out;

		for (unsigned int i = 0; i < zones_in_request; i++) {
			struct blk_zone *zone = buffer + i;

			/* A zero length zone means no more zones in this response */
			if (!zone->len)
				break;

			ret = disk_report_zone(disk, zone, i, args);
			if (ret)
				goto out;

			done_zones++;
			sector += zone_size_sectors;

		}
	}

	ret = done_zones;

out:
	kvfree(buffer);
	return ret;
}
651 
/*
 * Translate a zoned block-layer request into the io descriptor shared
 * with the ublk server. REQ_OP_DRV_IN is only valid for zone reports
 * issued by ublk_report_zones() (identified via the stashed descriptor).
 */
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	struct ublk_zoned_report_desc *desc;
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_ZONE_OPEN:
		ublk_op = UBLK_IO_OP_ZONE_OPEN;
		break;
	case REQ_OP_ZONE_CLOSE:
		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
		break;
	case REQ_OP_ZONE_FINISH:
		ublk_op = UBLK_IO_OP_ZONE_FINISH;
		break;
	case REQ_OP_ZONE_RESET:
		ublk_op = UBLK_IO_OP_ZONE_RESET;
		break;
	case REQ_OP_ZONE_APPEND:
		ublk_op = UBLK_IO_OP_ZONE_APPEND;
		break;
	case REQ_OP_ZONE_RESET_ALL:
		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
		break;
	case REQ_OP_DRV_IN:
		/* only zone reports go through DRV_IN; reject anything else */
		desc = ublk_zoned_get_report_desc(req);
		if (!desc)
			return BLK_STS_IOERR;
		ublk_op = desc->operation;
		switch (ublk_op) {
		case UBLK_IO_OP_REPORT_ZONES:
			iod->op_flags = ublk_op | ublk_req_build_flags(req);
			iod->nr_zones = desc->nr_zones;
			iod->start_sector = desc->sector;
			return BLK_STS_OK;
		default:
			return BLK_STS_IOERR;
		}
	case REQ_OP_DRV_OUT:
		/* We do not support drv_out */
		return BLK_STS_NOTSUPP;
	default:
		return BLK_STS_IOERR;
	}

	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->buf.addr;

	return BLK_STS_OK;
}
707 
#else

/* zoned support compiled out: no report_zones callback */
#define ublk_report_zones (NULL)

/* zoned params are invalid when CONFIG_BLK_DEV_ZONED is off */
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	return BLK_STS_NOTSUPP;
}

#endif
733 
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
				      bool need_map, struct io_comp_batch *iob);

/* char-device region shared by all ublk char devices */
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
	.name = "ublk-char",
};

/* device-number -> ublk_device map; idr updates guarded by ublk_idr_lock */
static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

/* NOTE(review): appears to serialize control-path commands — confirm scope */
static DEFINE_MUTEX(ublk_ctl_mutex);
747 
748 static struct ublk_batch_fetch_cmd *
749 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
750 {
751 	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
752 
753 	if (fcmd) {
754 		fcmd->cmd = cmd;
755 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
756 	}
757 	return fcmd;
758 }
759 
/* Release a fetch-command wrapper allocated by ublk_batch_alloc_fcmd() */
static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
{
	kfree(fcmd);
}
764 
/*
 * Clear the active fetch command. WRITE_ONCE presumably pairs with
 * lockless readers of ->active_fcmd — confirm against readers.
 */
static void __ublk_release_fcmd(struct ublk_queue *ubq)
{
	WRITE_ONCE(ubq->active_fcmd, NULL);
}
769 
/*
 * Nothing can move on, so clear ->active_fcmd, and the caller should stop
 * dispatching
 */
static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
					const struct ublk_batch_io_data *data,
					struct ublk_batch_fetch_cmd *fcmd,
					int res)
{
	/* state transition requires evts_lock (see struct ublk_queue) */
	spin_lock(&ubq->evts_lock);
	list_del_init(&fcmd->node);
	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
	__ublk_release_fcmd(ubq);
	spin_unlock(&ubq->evts_lock);

	/* complete the uring_cmd with @res, then drop the wrapper */
	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
	ublk_batch_free_fcmd(fcmd);
}
788 
789 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
790 				     struct io_br_sel *sel,
791 				     unsigned int issue_flags)
792 {
793 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
794 		return -ENOBUFS;
795 	return 0;
796 }
797 
798 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
799 				       void __user *buf, const u16 *tag_buf,
800 				       unsigned int len)
801 {
802 	if (copy_to_user(buf, tag_buf, len))
803 		return -EFAULT;
804 	return len;
805 }
806 
/* upper bound on total ublk devices (one minor each) */
#define UBLK_MAX_UBLKS UBLK_MINORS

/*
 * Max unprivileged ublk devices allowed to add
 *
 * It can be extended to one per-user limit in future or even controlled
 * by cgroup.
 */
static unsigned int unprivileged_ublks_max = 64;
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */

static struct miscdevice ublk_misc;
819 
820 static inline unsigned ublk_pos_to_hwq(loff_t pos)
821 {
822 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
823 		UBLK_QID_BITS_MASK;
824 }
825 
826 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
827 {
828 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
829 }
830 
831 static inline unsigned ublk_pos_to_tag(loff_t pos)
832 {
833 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
834 		UBLK_TAG_BITS_MASK;
835 }
836 
837 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
838 {
839 	const struct ublk_param_basic *p = &ub->params.basic;
840 
841 	if (p->attrs & UBLK_ATTR_READ_ONLY)
842 		set_disk_ro(ub->ub_disk, true);
843 
844 	set_capacity(ub->ub_disk, p->dev_sectors);
845 }
846 
847 static int ublk_integrity_flags(u32 flags)
848 {
849 	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
850 
851 	if (flags & LBMD_PI_CAP_INTEGRITY) {
852 		flags &= ~LBMD_PI_CAP_INTEGRITY;
853 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
854 	}
855 	if (flags & LBMD_PI_CAP_REFTAG) {
856 		flags &= ~LBMD_PI_CAP_REFTAG;
857 		ret_flags |= BLK_INTEGRITY_REF_TAG;
858 	}
859 	return flags ? -EINVAL : ret_flags;
860 }
861 
862 static int ublk_integrity_pi_tuple_size(u8 csum_type)
863 {
864 	switch (csum_type) {
865 	case LBMD_PI_CSUM_NONE:
866 		return 0;
867 	case LBMD_PI_CSUM_IP:
868 	case LBMD_PI_CSUM_CRC16_T10DIF:
869 		return 8;
870 	case LBMD_PI_CSUM_CRC64_NVME:
871 		return 16;
872 	default:
873 		return -EINVAL;
874 	}
875 }
876 
/*
 * Map an LBMD_PI_CSUM_* value to the block layer checksum enum.
 * Unknown values warn once and fall back to CSUM_NONE; callers are
 * expected to have validated csum_type already.
 */
static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
{
	switch (csum_type) {
	case LBMD_PI_CSUM_NONE:
		return BLK_INTEGRITY_CSUM_NONE;
	case LBMD_PI_CSUM_IP:
		return BLK_INTEGRITY_CSUM_IP;
	case LBMD_PI_CSUM_CRC16_T10DIF:
		return BLK_INTEGRITY_CSUM_CRC;
	case LBMD_PI_CSUM_CRC64_NVME:
		return BLK_INTEGRITY_CSUM_CRC64;
	default:
		WARN_ON_ONCE(1);
		return BLK_INTEGRITY_CSUM_NONE;
	}
}
893 
/*
 * Validate the full parameter set supplied by the ublk server before
 * applying it. Returns 0 when consistent, a negative errno otherwise.
 */
static int ublk_validate_params(const struct ublk_device *ub)
{
	/* basic param is the only one which must be set */
	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
		const struct ublk_param_basic *p = &ub->params.basic;

		/* logical block size between 512 bytes and one page */
		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
			return -EINVAL;

		if (p->logical_bs_shift > p->physical_bs_shift)
			return -EINVAL;

		/* an io must fit inside the server's io buffer */
		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
			return -EINVAL;

		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
			return -EINVAL;
	} else
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *p = &ub->params.discard;

		/* So far, only support single segment discard */
		if (p->max_discard_sectors && p->max_discard_segments != 1)
			return -EINVAL;

		if (!p->discard_granularity)
			return -EINVAL;
	}

	/* dev_t is read-only */
	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
		return -EINVAL;

	/* zoned params required iff the device is zoned */
	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		return ublk_dev_param_zoned_validate(ub);
	else if (ublk_dev_is_zoned(ub))
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
		const struct ublk_param_dma_align *p = &ub->params.dma;

		if (p->alignment >= PAGE_SIZE)
			return -EINVAL;

		/* alignment must be a power-of-two mask */
		if (!is_power_of_2(p->alignment + 1))
			return -EINVAL;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
		const struct ublk_param_segment *p = &ub->params.seg;

		if (!is_power_of_2(p->seg_boundary_mask + 1))
			return -EINVAL;

		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
			return -EINVAL;
		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
			return -EINVAL;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
		const struct ublk_param_integrity *p = &ub->params.integrity;
		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
		int flags = ublk_integrity_flags(p->flags);

		if (!ublk_dev_support_integrity(ub))
			return -EINVAL;
		if (flags < 0)
			return flags;
		if (pi_tuple_size < 0)
			return pi_tuple_size;
		if (!p->metadata_size)
			return -EINVAL;
		/* a reference tag requires a checksum to protect it */
		if (p->csum_type == LBMD_PI_CSUM_NONE &&
		    p->flags & LBMD_PI_CAP_REFTAG)
			return -EINVAL;
		/* PI tuple must fit inside the per-interval metadata */
		if (p->pi_offset + pi_tuple_size > p->metadata_size)
			return -EINVAL;
		if (p->interval_exp < SECTOR_SHIFT ||
		    p->interval_exp > ub->params.basic.logical_bs_shift)
			return -EINVAL;
	}

	return 0;
}
981 
/* Apply validated params to the gendisk; call after ublk_validate_params() */
static void ublk_apply_params(struct ublk_device *ub)
{
	ublk_dev_param_basic_apply(ub);

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		ublk_dev_param_zoned_apply(ub);
}
989 
990 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
991 {
992 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
993 		!ublk_support_auto_buf_reg(ubq);
994 }
995 
996 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
997 {
998 	return !ublk_dev_support_user_copy(ub) &&
999 	       !ublk_dev_support_zero_copy(ub) &&
1000 	       !ublk_dev_support_auto_buf_reg(ub);
1001 }
1002 
/* True when request lifetime must be tracked with a reference count */
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
	/*
	 * read()/write() is involved in user copy, so request reference
	 * has to be grabbed
	 *
	 * for zero copy, request buffer need to be registered to io_uring
	 * buffer table, so reference is needed
	 *
	 * For auto buffer register, ublk server still may issue
	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
	 * so reference is required too.
	 */
	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
		ublk_support_auto_buf_reg(ubq);
}
1019 
1020 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
1021 {
1022 	return ublk_dev_support_user_copy(ub) ||
1023 	       ublk_dev_support_zero_copy(ub) ||
1024 	       ublk_dev_support_auto_buf_reg(ub);
1025 }
1026 
1027 /*
1028  * ublk IO Reference Counting Design
1029  * ==================================
1030  *
1031  * For user-copy and zero-copy modes, ublk uses a split reference model with
1032  * two counters that together track IO lifetime:
1033  *
1034  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
1035  *   - io->task_registered_buffers: count of buffers registered on the IO task
1036  *
1037  * Key Invariant:
1038  * --------------
1039  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1040  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1041  * when no active references exist. After IO completion, both counters become
1042  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1043  * task_registered_buffers are 0.
1044  *
1045  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1046  * exit to determine if all references have been released.
1047  *
1048  * Why Split Counters:
1049  * -------------------
1050  * Buffers registered on the IO daemon task can use the lightweight
1051  * task_registered_buffers counter (simple increment/decrement) instead of
1052  * atomic refcount operations. The ublk_io_release() callback checks if
1053  * current == io->task to decide which counter to update.
1054  *
1055  * This optimization only applies before IO completion. At completion,
1056  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1057  * After that, all subsequent buffer unregistrations must use the atomic ref
1058  * since they may be releasing the last reference.
1059  *
1060  * Reference Lifecycle:
1061  * --------------------
1062  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1063  *
1064  * 2. During IO processing:
1065  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1066  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1067  *    - Buffer unregister callback (ublk_io_release):
1068  *      * If on-task: task_registered_buffers--
1069  *      * If off-task: ref-- via ublk_put_req_ref()
1070  *
1071  * 3. ublk_sub_req_ref() at IO completion:
1072  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1073  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1074  *    - This effectively collapses task_registered_buffers into the atomic ref,
1075  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1076  *      buffers that were already counted
1077  *
1078  * Example (zero-copy, register on-task, unregister off-task):
1079  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1080  *   - Register buffer on-task: task_registered_buffers = 1
1081  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1082  *   - Completion via ublk_sub_req_ref():
1083  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1084  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1085  *
1086  * Example (auto buffer registration):
1087  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1088  *
1089  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1090  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1091  *   - Completion via ublk_sub_req_ref():
1092  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1093  *
1094  * Example (zero-copy, ublk server killed):
1095  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1096  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1097  *
1098  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1099  *   - Register buffer on-task: task_registered_buffers = 1
1100  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1101  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1102  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
 *   - Since the sum equals UBLK_REFCOUNT_INIT, both counters are zeroed by
1104  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1105  *     and abort pending requests
1106  *
1107  * Batch IO Special Case:
1108  * ----------------------
1109  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1110  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1111  * task_registered_buffers counter still tracks registered buffers for the
1112  * invariant check, even though the callback doesn't decrement it.
1113  *
1114  * Note: updating task_registered_buffers is protected by io->lock.
1115  */
/*
 * Initialize the request reference count at IO dispatch time.  See the
 * "ublk IO Reference Counting Design" comment above for why the initial
 * value is UBLK_REFCOUNT_INIT rather than 1.
 */
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
		struct ublk_io *io)
{
	/* Only buffer-sharing modes need per-request references */
	if (ublk_need_req_ref(ubq))
		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
}
1122 
/*
 * Try to grab a reference on an in-flight IO; fails (returns false) once
 * the refcount has already dropped to zero, i.e. the IO has completed.
 */
static inline bool ublk_get_req_ref(struct ublk_io *io)
{
	return refcount_inc_not_zero(&io->ref);
}
1127 
/* Drop one reference; completing the request when the last one goes away */
static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
{
	if (!refcount_dec_and_test(&io->ref))
		return;

	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
	__ublk_complete_rq(req, io, false, NULL);
}
1136 
/*
 * Collapse io->task_registered_buffers into the atomic refcount at IO
 * completion time (see the "ublk IO Reference Counting Design" comment
 * above).  Returns true when this drop released the last reference.
 */
static inline bool ublk_sub_req_ref(struct ublk_io *io)
{
	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;

	io->task_registered_buffers = 0;
	return refcount_sub_and_test(sub_refs, &io->ref);
}
1144 
1145 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1146 {
1147 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1148 }
1149 
1150 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1151 {
1152 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1153 }
1154 
/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
	/* Fails (returns NULL) once the cdev's kobject refcount hit zero */
	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
		return ub;
	return NULL;
}
1162 
/* Called in slow path only, keep it noinline for trace purpose */
static noinline void ublk_put_device(struct ublk_device *ub)
{
	/* Drops the reference taken by ublk_get_device() */
	put_device(&ub->cdev_dev);
}
1168 
/* Return the queue object for queue id 'qid' */
static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
		int qid)
{
	return dev->queues[qid];
}
1174 
/* True if the request's bio carries data pages */
static inline bool ublk_rq_has_data(const struct request *rq)
{
	return bio_has_data(rq->bio);
}
1179 
/* Return the per-queue array of IO descriptors shared with the server */
static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
	return ublk_get_queue(ub, q_id)->io_cmd_buf;
}
1185 
1186 static inline int __ublk_queue_cmd_buf_size(int depth)
1187 {
1188 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1189 }
1190 
/* Command buffer size at this device's configured queue depth */
static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
{
	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
}
1195 
/* Command buffer size at the maximum supported queue depth */
static int ublk_max_cmd_buf_size(void)
{
	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
}
1200 
1201 /*
1202  * Should I/O outstanding to the ublk server when it exits be reissued?
1203  * If not, outstanding I/O will get errors.
1204  */
1205 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1206 {
1207 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1208 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1209 }
1210 
1211 /*
1212  * Should I/O issued while there is no ublk server queue? If not, I/O
1213  * issued while there is no ublk server will get errors.
1214  */
1215 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1216 {
1217 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1218 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1219 }
1220 
1221 /*
1222  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1223  * of the device flags for smaller cache footprint - better for fast
1224  * paths.
1225  */
1226 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1227 {
1228 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1229 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1230 }
1231 
1232 /*
1233  * Should ublk devices be stopped (i.e. no recovery possible) when the
1234  * ublk server exits? If not, devices can be used again by a future
1235  * incarnation of a ublk server via the start_recovery/end_recovery
1236  * commands.
1237  */
1238 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1239 {
1240 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1241 }
1242 
1243 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1244 {
1245 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1246 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1247 }
1248 
/* gendisk ->free_disk callback: drop our ublk_device reference */
static void ublk_free_disk(struct gendisk *disk)
{
	struct ublk_device *ub = disk->private_data;

	clear_bit(UB_STATE_USED, &ub->state);
	ublk_put_device(ub);
}
1256 
/* Record current task's uid/gid (mapped into init_user_ns) */
static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
		unsigned int *owner_gid)
{
	kuid_t uid;
	kgid_t gid;

	current_uid_gid(&uid, &gid);

	*owner_uid = from_kuid(&init_user_ns, uid);
	*owner_gid = from_kgid(&init_user_ns, gid);
}
1268 
/*
 * Block device ->open callback.  CAP_SYS_ADMIN may always open; otherwise
 * unprivileged devices are restricted to their owner, and opens may be
 * temporarily refused while ub->block_open is set.
 */
static int ublk_open(struct gendisk *disk, blk_mode_t mode)
{
	struct ublk_device *ub = disk->private_data;

	if (capable(CAP_SYS_ADMIN))
		return 0;

	/*
	 * For an unprivileged device, only the owner may open the disk.
	 * Otherwise a malicious user could deliberately grant the disk's
	 * privileges to other users as a trap.
	 *
	 * This policy is also reasonable because anyone can create an
	 * unprivileged device without needing a grant from anyone else.
	 */
	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		unsigned int curr_uid, curr_gid;

		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);

		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
				ub->dev_info.owner_gid)
			return -EPERM;
	}

	if (ub->block_open)
		return -ENXIO;

	return 0;
}
1300 
/* Block device operations for the exported ublk disk node */
static const struct block_device_operations ub_fops = {
	.owner =	THIS_MODULE,
	.open =		ublk_open,
	.free_disk =	ublk_free_disk,
	.report_zones =	ublk_report_zones,
};
1307 
/*
 * Copy one bvec's worth of data to/from 'uiter', skipping the first
 * '*offset' bytes of the segment walk.  '*done' accumulates bytes copied.
 * Returns false on a short copy (user iterator exhausted), which tells the
 * caller to stop walking segments.
 */
static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
				struct iov_iter *uiter, int dir, size_t *done)
{
	unsigned len;
	void *bv_buf;
	size_t copied;

	/* Still inside the skipped prefix: consume this bvec entirely */
	if (*offset >= bv->bv_len) {
		*offset -= bv->bv_len;
		return true;
	}

	len = bv->bv_len - *offset;
	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
	if (dir == ITER_DEST)
		copied = copy_to_iter(bv_buf, len, uiter);
	else
		copied = copy_from_iter(bv_buf, len, uiter);

	kunmap_local(bv_buf);

	*done += copied;
	if (copied < len)
		return false;

	/* Offset only applies to the first partially-skipped bvec */
	*offset = 0;
	return true;
}
1336 
1337 /*
1338  * Copy data between request pages and io_iter, and 'offset'
1339  * is the start point of linear offset of request.
1340  */
1341 static size_t ublk_copy_user_pages(const struct request *req,
1342 		unsigned offset, struct iov_iter *uiter, int dir)
1343 {
1344 	struct req_iterator iter;
1345 	struct bio_vec bv;
1346 	size_t done = 0;
1347 
1348 	rq_for_each_segment(bv, req, iter) {
1349 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1350 			break;
1351 	}
1352 	return done;
1353 }
1354 
#ifdef CONFIG_BLK_DEV_INTEGRITY
/*
 * Copy integrity (PI) data between the request's integrity bvecs and
 * 'uiter', skipping the first 'offset' bytes.  Returns bytes copied.
 */
static size_t ublk_copy_user_integrity(const struct request *req,
		unsigned offset, struct iov_iter *uiter, int dir)
{
	size_t done = 0;
	struct bio *bio = req->bio;
	struct bvec_iter iter;
	struct bio_vec iv;

	if (!blk_integrity_rq(req))
		return 0;

	bio_for_each_integrity_vec(iv, bio, iter) {
		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
			break;
	}

	return done;
}
#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
/* No-op stub when integrity support is compiled out */
static size_t ublk_copy_user_integrity(const struct request *req,
		unsigned offset, struct iov_iter *uiter, int dir)
{
	return 0;
}
#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1381 
1382 static inline bool ublk_need_map_req(const struct request *req)
1383 {
1384 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1385 }
1386 
1387 static inline bool ublk_need_unmap_req(const struct request *req)
1388 {
1389 	return ublk_rq_has_data(req) &&
1390 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1391 }
1392 
/*
 * Copy WRITE payload from the request's pages into the ublk server's
 * userspace buffer (io->buf.addr).  Returns the number of bytes made
 * available to the server; a no-op for modes that don't copy data.
 */
static unsigned int ublk_map_io(const struct ublk_queue *ubq,
				const struct request *req,
				const struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (!ublk_need_map_io(ubq))
		return rq_bytes;

	/*
	 * no zero copy, we delay copy WRITE request data into ublksrv
	 * context and the big benefit is that pinning pages in current
	 * context is pretty fast, see ublk_pin_user_pages
	 */
	if (ublk_need_map_req(req)) {
		struct iov_iter iter;
		const int dir = ITER_DEST;

		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}
1416 
/*
 * For READ-like requests, copy io->res bytes from the server buffer
 * (io->buf.addr) back into the request's pages.  Returns the number of
 * bytes to complete the request with.
 */
static unsigned int ublk_unmap_io(bool need_map,
		const struct request *req,
		const struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (!need_map)
		return rq_bytes;

	if (ublk_need_unmap_req(req)) {
		struct iov_iter iter;
		const int dir = ITER_SOURCE;

		/* server must not claim more bytes than the request holds */
		WARN_ON_ONCE(io->res > rq_bytes);

		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
		return ublk_copy_user_pages(req, 0, &iter, dir);
	}
	return rq_bytes;
}
1437 
1438 static inline unsigned int ublk_req_build_flags(struct request *req)
1439 {
1440 	unsigned flags = 0;
1441 
1442 	if (req->cmd_flags & REQ_FAILFAST_DEV)
1443 		flags |= UBLK_IO_F_FAILFAST_DEV;
1444 
1445 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1446 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1447 
1448 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1449 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
1450 
1451 	if (req->cmd_flags & REQ_META)
1452 		flags |= UBLK_IO_F_META;
1453 
1454 	if (req->cmd_flags & REQ_FUA)
1455 		flags |= UBLK_IO_F_FUA;
1456 
1457 	if (req->cmd_flags & REQ_NOUNMAP)
1458 		flags |= UBLK_IO_F_NOUNMAP;
1459 
1460 	if (req->cmd_flags & REQ_SWAP)
1461 		flags |= UBLK_IO_F_SWAP;
1462 
1463 	if (blk_integrity_rq(req))
1464 		flags |= UBLK_IO_F_INTEGRITY;
1465 
1466 	return flags;
1467 }
1468 
/*
 * Fill the shared IO descriptor for 'req' so the ublk server can handle
 * it.  Returns BLK_STS_IOERR for unsupported operations, deferring to the
 * zoned path for zoned-only ops.
 */
static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_READ:
		ublk_op = UBLK_IO_OP_READ;
		break;
	case REQ_OP_WRITE:
		ublk_op = UBLK_IO_OP_WRITE;
		break;
	case REQ_OP_FLUSH:
		ublk_op = UBLK_IO_OP_FLUSH;
		break;
	case REQ_OP_DISCARD:
		ublk_op = UBLK_IO_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
		break;
	default:
		if (ublk_queue_is_zoned(ubq))
			return ublk_setup_iod_zoned(ubq, req);
		return BLK_STS_IOERR;
	}

	/* need to translate since kernel may change */
	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);

	/* Try shmem zero-copy match before setting addr */
	if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
		u32 buf_idx, buf_off;

		if (ublk_try_buf_match(ubq->dev, req,
					  &buf_idx, &buf_off)) {
			iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
			iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
			return BLK_STS_OK;
		}
	}

	iod->addr = io->buf.addr;

	return BLK_STS_OK;
}
1518 
/* Access the per-command private data stored inside the io_uring cmd */
static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
}
1524 
/*
 * End a request with softirqs disabled; see the comment in
 * __ublk_complete_rq() for why bio->bi_end_io() must run with BHs off
 * (avoids a deadlock on disk->open_mutex via fput()).
 */
static void ublk_end_request(struct request *req, blk_status_t error)
{
	local_bh_disable();
	blk_mq_end_request(req, error);
	local_bh_enable();
}
1531 
/* todo: handle partial completion */
/*
 * Complete one request on behalf of the ublk server.  io->res carries the
 * server's result: a negative errno fails the request, otherwise it is the
 * number of bytes transferred.  READ-like requests get their data copied
 * back (unmapped) before completion.
 */
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
				      bool need_map, struct io_comp_batch *iob)
{
	unsigned int unmapped_bytes;
	blk_status_t res = BLK_STS_OK;
	bool requeue;

	/* failed read IO if nothing is read */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		res = errno_to_blk_status(io->res);
		goto exit;
	}

	/*
	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them
	 * directly.
	 *
	 * Both the two needn't unmap.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
	    req_op(req) != REQ_OP_DRV_IN)
		goto exit;

	/* shmem zero copy: no data to unmap, pages already shared */
	if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
		goto exit;

	/* for READ request, writing data in iod->addr to rq buffers */
	unmapped_bytes = ublk_unmap_io(need_map, req, io);

	/*
	 * Extremely impossible since we got data filled in just before
	 *
	 * Re-read simply for this unlikely case.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	/*
	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
	 * happens off this path, then that will prevent ublk's blkdev_release()
	 * from being called on current's task work, see fput() implementation.
	 *
	 * Otherwise, ublk server may not provide forward progress in case of
	 * reading the partition table from bdev_open() with disk->open_mutex
	 * held, and causes dead lock as we could already be holding
	 * disk->open_mutex here.
	 *
	 * Preferably we would not be doing IO with a mutex held that is also
	 * used for release, but this work-around will suffice for now.
	 */
	local_bh_disable();
	requeue = blk_update_request(req, BLK_STS_OK, io->res);
	local_bh_enable();
	if (requeue)
		blk_mq_requeue_request(req, true);
	else if (likely(!blk_should_fake_timeout(req->q))) {
		/* batch completion when the caller provided an io_comp_batch */
		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
			return;
		__blk_mq_end_request(req, BLK_STS_OK);
	}

	return;
exit:
	ublk_end_request(req, res);
}
1602 
/*
 * Transition 'io' to server-owned state and stash the request; returns the
 * fetch command which the caller completes to notify the ublk server.
 */
static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
						     struct request *req)
{
	/* read cmd first because req will overwrite it */
	struct io_uring_cmd *cmd = io->cmd;

	/* mark this cmd owned by ublksrv */
	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;

	/*
	 * clear ACTIVE since we are done with this sqe/cmd slot
	 * We can only accept io cmd in case of being not active.
	 */
	io->flags &= ~UBLK_IO_FLAG_ACTIVE;

	io->req = req;
	return cmd;
}
1621 
/* Hand 'req' to the ublk server by completing its pending fetch command */
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
				 int res, unsigned issue_flags)
{
	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(cmd, res, issue_flags);
}
1630 
#define UBLK_REQUEUE_DELAY_MS	3

/*
 * Abort one request we cannot deliver to the server: requeue it when the
 * device is configured to queue IO without a server, otherwise fail it.
 */
static inline void __ublk_abort_rq(struct ublk_queue *ubq,
		struct request *rq)
{
	/* We cannot process this rq so just requeue it. */
	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
		blk_mq_requeue_request(rq, false);
	else
		ublk_end_request(rq, BLK_STS_IOERR);
}
1642 
1643 static void
1644 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1645 {
1646 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1647 
1648 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1649 }
1650 
/* Result of ublk_auto_buf_register() */
enum auto_buf_reg_res {
	AUTO_BUF_REG_FAIL,	/* registration failed, request already ended */
	AUTO_BUF_REG_FALLBACK,	/* failed, server asked to register itself */
	AUTO_BUF_REG_OK,	/* buffer registered successfully */
};
1656 
1657 /*
1658  * Setup io state after auto buffer registration.
1659  *
1660  * Must be called after ublk_auto_buf_register() is done.
1661  * Caller must hold io->lock in batch context.
1662  */
1663 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1664 				   struct request *req, struct ublk_io *io,
1665 				   struct io_uring_cmd *cmd,
1666 				   enum auto_buf_reg_res res)
1667 {
1668 	if (res == AUTO_BUF_REG_OK) {
1669 		io->task_registered_buffers = 1;
1670 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1671 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1672 	}
1673 	ublk_init_req_ref(ubq, io);
1674 	__ublk_prep_compl_io_cmd(io, req);
1675 }
1676 
/*
 * Register request bvec to io_uring for auto buffer registration.
 *
 * On failure, either flag the IO descriptor so the server registers the
 * buffer itself (UBLK_AUTO_BUF_REG_FALLBACK) or fail the request outright.
 */
static enum auto_buf_reg_res
ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
		       struct ublk_io *io, struct io_uring_cmd *cmd,
		       unsigned int issue_flags)
{
	int ret;

	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
				      io->buf.auto_reg.index, issue_flags);
	if (ret) {
		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
			ublk_auto_buf_reg_fallback(ubq, req->tag);
			return AUTO_BUF_REG_FALLBACK;
		}
		/* no fallback allowed: the request is done with IO error */
		ublk_end_request(req, BLK_STS_IOERR);
		return AUTO_BUF_REG_FAIL;
	}

	return AUTO_BUF_REG_OK;
}
1698 
1699 /*
1700  * Dispatch IO to userspace with auto buffer registration.
1701  *
1702  * Only called in non-batch context from task work, io->lock not held.
1703  */
1704 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1705 				   struct request *req, struct ublk_io *io,
1706 				   struct io_uring_cmd *cmd,
1707 				   unsigned int issue_flags)
1708 {
1709 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1710 			issue_flags);
1711 
1712 	if (res != AUTO_BUF_REG_FAIL) {
1713 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1714 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1715 	}
1716 }
1717 
/*
 * Prepare one request for delivery to the ublk server: copy WRITE data
 * into the server buffer if the mode requires it.  Returns false when the
 * request was requeued (nothing could be mapped) and must not be
 * dispatched now.
 */
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
			  struct ublk_io *io)
{
	unsigned mapped_bytes;

	/* shmem zero copy: skip data copy, pages already shared */
	if (ublk_iod_is_shmem_zc(ubq, req->tag))
		return true;

	mapped_bytes = ublk_map_io(ubq, req, io);

	/* partially mapped, update io descriptor */
	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
		/*
		 * Nothing mapped, retry until we succeed.
		 *
		 * We may never succeed in mapping any bytes here because
		 * of OOM. TODO: reserve one buffer with single page pinned
		 * for providing forward progress guarantee.
		 */
		if (unlikely(!mapped_bytes)) {
			blk_mq_requeue_request(req, false);
			blk_mq_delay_kick_requeue_list(req->q,
					UBLK_REQUEUE_DELAY_MS);
			return false;
		}

		/* expose only the mapped portion to the server */
		ublk_get_iod(ubq, req->tag)->nr_sectors =
			mapped_bytes >> 9;
	}

	return true;
}
1751 
/*
 * Deliver one request to the ublk server from task-work context: either
 * via UBLK_IO_RES_NEED_GET_DATA (two-stage write), auto buffer
 * registration, or a plain UBLK_IO_RES_OK completion of the fetch cmd.
 */
static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
{
	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
	int tag = req->tag;
	struct ublk_io *io = &ubq->ios[tag];

	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
			__func__, ubq->q_id, req->tag, io->flags,
			ublk_get_iod(ubq, req->tag)->addr);

	/*
	 * Task is exiting if either:
	 *
	 * (1) current != io->task.
	 * io_uring_cmd_complete_in_task() tries to run task_work
	 * in a workqueue if cmd's task is PF_EXITING.
	 *
	 * (2) current->flags & PF_EXITING.
	 */
	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
		__ublk_abort_rq(ubq, req);
		return;
	}

	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
		/*
		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
		 * and notify it.
		 */
		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
				__func__, ubq->q_id, req->tag, io->flags);
		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
				     issue_flags);
		return;
	}

	if (!ublk_start_io(ubq, req, io))
		return;

	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
	} else {
		ublk_init_req_ref(ubq, io);
		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
	}
}
1800 
/*
 * Prepare one tag's request for batch delivery: start the IO, optionally
 * auto-register its buffer, then mark the IO server-owned under io->lock.
 * Returns false when the tag must not be posted to userspace (request was
 * requeued, or buffer registration failed without fallback).
 */
static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
				       const struct ublk_batch_io_data *data,
				       unsigned short tag)
{
	struct ublk_device *ub = data->ub;
	struct ublk_io *io = &ubq->ios[tag];
	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
	struct io_uring_cmd *cmd = data->cmd;

	if (!ublk_start_io(ubq, req, io))
		return false;

	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
		res = ublk_auto_buf_register(ubq, req, io, cmd,
				data->issue_flags);

		if (res == AUTO_BUF_REG_FAIL)
			return false;
	}

	/* batch context: io->lock serializes state with other contexts */
	ublk_io_lock(io);
	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
	ublk_io_unlock(io);

	return true;
}
1828 
1829 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1830 				     const struct ublk_batch_io_data *data,
1831 				     unsigned short *tag_buf,
1832 				     unsigned int len)
1833 {
1834 	bool has_unused = false;
1835 	unsigned int i;
1836 
1837 	for (i = 0; i < len; i++) {
1838 		unsigned short tag = tag_buf[i];
1839 
1840 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1841 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1842 			has_unused = true;
1843 		}
1844 	}
1845 
1846 	return has_unused;
1847 }
1848 
1849 /*
1850  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1851  * Returns the new length after filtering.
1852  */
1853 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1854 					    unsigned int len)
1855 {
1856 	unsigned int i, j;
1857 
1858 	for (i = 0, j = 0; i < len; i++) {
1859 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1860 			if (i != j)
1861 				tag_buf[j] = tag_buf[i];
1862 			j++;
1863 		}
1864 	}
1865 
1866 	return j;
1867 }
1868 
/*
 * CQE posting failed after IOs were prepared: roll back per-IO state,
 * unregister any auto-registered buffers, and push the tags back into the
 * event fifo so they get re-dispatched later.
 */
static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
		const struct ublk_batch_io_data *data,
		unsigned short *tag_buf, size_t len, int ret)
{
	int i, res;

	/*
	 * Undo prep state for all IOs since userspace never received them.
	 * This restores IOs to pre-prepared state so they can be cleanly
	 * re-prepared when tags are pulled from FIFO again.
	 */
	for (i = 0; i < len; i++) {
		struct ublk_io *io = &ubq->ios[tag_buf[i]];
		int index = -1;

		ublk_io_lock(io);
		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
			index = io->buf.auto_reg.index;
		io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
		io->flags |= UBLK_IO_FLAG_ACTIVE;
		ublk_io_unlock(io);

		/* drop the buffer registration made during prep */
		if (index != -1)
			io_buffer_unregister_bvec(data->cmd, index,
					data->issue_flags);
	}

	res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
		tag_buf, len, &ubq->evts_lock);

	pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
			"tags(%d %zu) ret %d\n", __func__, res, len,
			ret);
}
1903 
/* Max tags pulled from the event fifo per dispatch round */
#define MAX_NR_TAG 128
/*
 * Pull up to MAX_NR_TAG tags from the queue's event fifo, prepare each IO,
 * copy the surviving tags into a provided buffer and post a multishot CQE.
 * The caller keeps looping while the return value is positive.
 */
static int __ublk_batch_dispatch(struct ublk_queue *ubq,
				 const struct ublk_batch_io_data *data,
				 struct ublk_batch_fetch_cmd *fcmd)
{
	const unsigned int tag_sz = sizeof(unsigned short);
	unsigned short tag_buf[MAX_NR_TAG];
	struct io_br_sel sel;
	size_t len = 0;
	bool needs_filter;
	int ret;

	WARN_ON_ONCE(data->cmd != fcmd->cmd);

	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
					 data->issue_flags);
	if (sel.val < 0)
		return sel.val;
	if (!sel.addr)
		return -ENOBUFS;

	/* single reader needn't lock and sizeof(kfifo element) is 2 bytes */
	len = min(len, sizeof(tag_buf)) / tag_sz;
	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);

	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
	/* Filter out unused tags before posting to userspace */
	if (unlikely(needs_filter)) {
		int new_len = ublk_filter_unused_tags(tag_buf, len);

		/* return actual length if all are failed or requeued */
		if (!new_len) {
			/* release the selected buffer */
			sel.val = 0;
			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
						&sel, data->issue_flags));
			return len;
		}
		len = new_len;
	}

	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
	if (unlikely(ret < 0))
		ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
	return ret;
}
1951 
/*
 * Try to take exclusive ownership of a fetch command for dispatching.
 * Returns NULL when another context already owns one, or when none are
 * queued.  Caller holds ubq->evts_lock.
 */
static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
		struct ublk_queue *ubq)
{
	struct ublk_batch_fetch_cmd *fcmd;

	lockdep_assert_held(&ubq->evts_lock);

	/*
	 * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd.
	 *
	 * The pair is the smp_mb() in ublk_batch_dispatch().
	 *
	 * If ubq->active_fcmd is observed as non-NULL, the new added tags
	 * can be visible in ublk_batch_dispatch() with the barrier pairing.
	 */
	smp_mb();
	if (READ_ONCE(ubq->active_fcmd)) {
		fcmd = NULL;
	} else {
		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
				struct ublk_batch_fetch_cmd, node);
		WRITE_ONCE(ubq->active_fcmd, fcmd);
	}
	return fcmd;
}
1977 
/* Task-work callback: drain queued events using the active fetch command */
static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
	struct ublk_batch_io_data data = {
		.ub = pdu->ubq->dev,
		.cmd = fcmd->cmd,
		.issue_flags = issue_flags,
	};

	/* we must have been scheduled as the owner of this fetch command */
	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);

	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
}
1994 
/*
 * Drain the queue's event fifo with 'fcmd', then release ownership.  If
 * new tags raced in after the release, re-acquire a fetch command and
 * either continue inline (bounded to avoid lockup) or punt to task work.
 */
static void
ublk_batch_dispatch(struct ublk_queue *ubq,
		    const struct ublk_batch_io_data *data,
		    struct ublk_batch_fetch_cmd *fcmd)
{
	struct ublk_batch_fetch_cmd *new_fcmd;
	unsigned tried = 0;
	int ret = 0;

again:
	while (!ublk_io_evts_empty(ubq)) {
		ret = __ublk_batch_dispatch(ubq, data, fcmd);
		if (ret <= 0)
			break;
	}

	if (ret < 0) {
		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
		return;
	}

	__ublk_release_fcmd(ubq);
	/*
	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
	 * checking ubq->evts_fifo.
	 *
	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
	 */
	smp_mb();
	if (likely(ublk_io_evts_empty(ubq)))
		return;

	spin_lock(&ubq->evts_lock);
	new_fcmd = __ublk_acquire_fcmd(ubq);
	spin_unlock(&ubq->evts_lock);

	if (!new_fcmd)
		return;

	/* Avoid lockup by allowing to handle at most 32 batches */
	if (new_fcmd == fcmd && tried++ < 32)
		goto again;

	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
}
2040 
/* Task-work callback for dispatching a single request (non-batch mode) */
static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct ublk_queue *ubq = pdu->ubq;

	ublk_dispatch_req(ubq, pdu->req);
}
2049 
2050 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
2051 {
2052 	unsigned short tag = rq->tag;
2053 	struct ublk_batch_fetch_cmd *fcmd = NULL;
2054 
2055 	spin_lock(&ubq->evts_lock);
2056 	kfifo_put(&ubq->evts_fifo, tag);
2057 	if (last)
2058 		fcmd = __ublk_acquire_fcmd(ubq);
2059 	spin_unlock(&ubq->evts_lock);
2060 
2061 	if (fcmd)
2062 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2063 }
2064 
2065 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2066 {
2067 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2068 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2069 
2070 	pdu->req = rq;
2071 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2072 }
2073 
2074 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2075 {
2076 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2077 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2078 	struct request *rq = pdu->req_list;
2079 	struct request *next;
2080 
2081 	do {
2082 		next = rq->rq_next;
2083 		rq->rq_next = NULL;
2084 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2085 		rq = next;
2086 	} while (rq);
2087 }
2088 
2089 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2090 {
2091 	struct io_uring_cmd *cmd = io->cmd;
2092 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2093 
2094 	pdu->req_list = rq_list_peek(l);
2095 	rq_list_init(l);
2096 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2097 }
2098 
/*
 * blk-mq timeout handler.
 *
 * Privileged devices just get their timer reset.  For an unprivileged
 * device, SIGKILL is sent to the server tgid so a stuck daemon cannot
 * pin requests forever.
 */
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
	pid_t tgid = ubq->dev->ublksrv_tgid;
	struct task_struct *p;
	struct pid *pid;

	/* Privileged device: keep waiting */
	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
		return BLK_EH_RESET_TIMER;

	/* No server attached yet, nothing to kill */
	if (unlikely(!tgid))
		return BLK_EH_RESET_TIMER;

	/* Look up the server task under RCU and kill it */
	rcu_read_lock();
	pid = find_vpid(tgid);
	p = pid_task(pid, PIDTYPE_PID);
	if (p)
		send_sig(SIGKILL, p, 0);
	rcu_read_unlock();
	return BLK_EH_DONE;
}
2120 
/*
 * Prepare one request for dispatch to the ublk server: check the
 * fail/abort/cancel states, fill the io descriptor, and start the
 * request.  Returns BLK_STS_OK when the request may be queued.
 */
static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
				  bool check_cancel)
{
	blk_status_t res;

	if (unlikely(READ_ONCE(ubq->fail_io)))
		return BLK_STS_TARGET;

	/* With recovery feature enabled, force_abort is set in
	 * ublk_stop_dev() before calling del_gendisk(). We have to
	 * abort all requeued and new rqs here to let del_gendisk()
	 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
	 * to avoid UAF on io_uring ctx.
	 *
	 * Note: force_abort is guaranteed to be seen because it is set
	 * before request queue is unquiesced.
	 */
	if (ublk_nosrv_should_queue_io(ubq) &&
	    unlikely(READ_ONCE(ubq->force_abort)))
		return BLK_STS_IOERR;

	if (check_cancel && unlikely(ubq->canceling))
		return BLK_STS_IOERR;

	/* fill iod to slot in io cmd buffer */
	res = ublk_setup_iod(ubq, rq);
	if (unlikely(res != BLK_STS_OK))
		return BLK_STS_IOERR;

	blk_mq_start_request(rq);
	return BLK_STS_OK;
}
2153 
2154 /*
2155  * Common helper for queue_rq that handles request preparation and
2156  * cancellation checks. Returns status and sets should_queue to indicate
2157  * whether the caller should proceed with queuing the request.
2158  */
2159 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2160 						   struct request *rq,
2161 						   bool *should_queue)
2162 {
2163 	blk_status_t res;
2164 
2165 	res = ublk_prep_req(ubq, rq, false);
2166 	if (res != BLK_STS_OK) {
2167 		*should_queue = false;
2168 		return res;
2169 	}
2170 
2171 	/*
2172 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2173 	 * is dealt with, otherwise this request may not be failed in case
2174 	 * of recovery, and cause hang when deleting disk
2175 	 */
2176 	if (unlikely(ubq->canceling)) {
2177 		*should_queue = false;
2178 		__ublk_abort_rq(ubq, rq);
2179 		return BLK_STS_OK;
2180 	}
2181 
2182 	*should_queue = true;
2183 	return BLK_STS_OK;
2184 }
2185 
2186 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2187 		const struct blk_mq_queue_data *bd)
2188 {
2189 	struct ublk_queue *ubq = hctx->driver_data;
2190 	struct request *rq = bd->rq;
2191 	bool should_queue;
2192 	blk_status_t res;
2193 
2194 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2195 	if (!should_queue)
2196 		return res;
2197 
2198 	ublk_queue_cmd(ubq, rq);
2199 	return BLK_STS_OK;
2200 }
2201 
2202 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2203 		const struct blk_mq_queue_data *bd)
2204 {
2205 	struct ublk_queue *ubq = hctx->driver_data;
2206 	struct request *rq = bd->rq;
2207 	bool should_queue;
2208 	blk_status_t res;
2209 
2210 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2211 	if (!should_queue)
2212 		return res;
2213 
2214 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2215 	return BLK_STS_OK;
2216 }
2217 
2218 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2219 					     const struct ublk_io *io2)
2220 {
2221 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2222 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2223 		(io->task == io2->task);
2224 }
2225 
2226 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2227 {
2228 	struct ublk_queue *ubq = hctx->driver_data;
2229 	struct ublk_batch_fetch_cmd *fcmd;
2230 
2231 	spin_lock(&ubq->evts_lock);
2232 	fcmd = __ublk_acquire_fcmd(ubq);
2233 	spin_unlock(&ubq->evts_lock);
2234 
2235 	if (fcmd)
2236 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2237 }
2238 
/*
 * ->queue_rqs() for per-io command mode: group consecutive prepared
 * requests that share the same batch key (daemon task + io_uring ctx,
 * see ublk_belong_to_same_batch()) into one task-work submission.
 * Requests that fail preparation are returned in @rqlist for requeue.
 */
static void ublk_queue_rqs(struct rq_list *rqlist)
{
	struct rq_list requeue_list = { };
	struct rq_list submit_list = { };
	struct ublk_io *io = NULL;
	struct request *req;

	while ((req = rq_list_pop(rqlist))) {
		struct ublk_queue *this_q = req->mq_hctx->driver_data;
		struct ublk_io *this_io = &this_q->ios[req->tag];

		/* Not dispatchable now: hand back to caller for requeue */
		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
			rq_list_add_tail(&requeue_list, req);
			continue;
		}

		/* Batch key changed: flush what we collected so far */
		if (io && !ublk_belong_to_same_batch(io, this_io) &&
				!rq_list_empty(&submit_list))
			ublk_queue_cmd_list(io, &submit_list);
		io = this_io;
		rq_list_add_tail(&submit_list, req);
	}

	if (!rq_list_empty(&submit_list))
		ublk_queue_cmd_list(io, &submit_list);
	*rqlist = requeue_list;
}
2266 
/*
 * Push the tags of all requests in @l into the queue's event fifo and,
 * if an idle fetch command is available, kick it via task work.
 *
 * Tags are staged in a fixed on-stack array and flushed to the kfifo in
 * chunks of at most MAX_NR_TAG, bounding stack use while minimizing
 * kfifo operations under evts_lock.
 */
static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
{
	unsigned short tags[MAX_NR_TAG];
	struct ublk_batch_fetch_cmd *fcmd;
	struct request *rq;
	unsigned cnt = 0;

	spin_lock(&ubq->evts_lock);
	rq_list_for_each(l, rq) {
		tags[cnt++] = (unsigned short)rq->tag;
		if (cnt >= MAX_NR_TAG) {
			kfifo_in(&ubq->evts_fifo, tags, cnt);
			cnt = 0;
		}
	}
	/* flush the final partial chunk */
	if (cnt)
		kfifo_in(&ubq->evts_fifo, tags, cnt);
	fcmd = __ublk_acquire_fcmd(ubq);
	spin_unlock(&ubq->evts_lock);

	rq_list_init(l);
	if (fcmd)
		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
}
2291 
/*
 * ->queue_rqs() for batch mode: like ublk_queue_rqs(), but the batch
 * key is simply the ublk queue, since events funnel through the per-
 * queue fifo rather than per-io commands.
 */
static void ublk_batch_queue_rqs(struct rq_list *rqlist)
{
	struct rq_list requeue_list = { };
	struct rq_list submit_list = { };
	struct ublk_queue *ubq = NULL;
	struct request *req;

	while ((req = rq_list_pop(rqlist))) {
		struct ublk_queue *this_q = req->mq_hctx->driver_data;

		/* Not dispatchable now: hand back to caller for requeue */
		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
			rq_list_add_tail(&requeue_list, req);
			continue;
		}

		/* Queue changed: flush the batch collected for the old one */
		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
			ublk_batch_queue_cmd_list(ubq, &submit_list);
		ubq = this_q;
		rq_list_add_tail(&submit_list, req);
	}

	if (!rq_list_empty(&submit_list))
		ublk_batch_queue_cmd_list(ubq, &submit_list);
	*rqlist = requeue_list;
}
2317 
2318 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2319 		unsigned int hctx_idx)
2320 {
2321 	struct ublk_device *ub = driver_data;
2322 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2323 
2324 	hctx->driver_data = ubq;
2325 	return 0;
2326 }
2327 
/* blk-mq ops for per-io FETCH/COMMIT command mode */
static const struct blk_mq_ops ublk_mq_ops = {
	.queue_rq       = ublk_queue_rq,
	.queue_rqs      = ublk_queue_rqs,
	.init_hctx	= ublk_init_hctx,
	.timeout	= ublk_timeout,
};
2334 
/* blk-mq ops for batch (event fifo + fetch command) mode */
static const struct blk_mq_ops ublk_batch_mq_ops = {
	.commit_rqs	= ublk_commit_rqs,
	.queue_rq       = ublk_batch_queue_rq,
	.queue_rqs      = ublk_batch_queue_rqs,
	.init_hctx	= ublk_init_hctx,
	.timeout	= ublk_timeout,
};
2342 
/*
 * Reset per-queue io state so a new server daemon can attach after the
 * previous one went away (device release / recovery path).
 */
static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int i;

	ubq->nr_io_ready = 0;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		/*
		 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
		 * io->cmd
		 */
		io->flags &= UBLK_IO_FLAG_CANCELED;
		io->cmd = NULL;
		io->buf.addr = 0;

		/*
		 * old task is PF_EXITING, put it now
		 *
		 * It could be NULL in case of closing one quiesced
		 * device.
		 */
		if (io->task) {
			put_task_struct(io->task);
			io->task = NULL;
		}

		/* no io should hold a reference at this point */
		WARN_ON_ONCE(refcount_read(&io->ref));
		WARN_ON_ONCE(io->task_registered_buffers);
	}
}
2375 
2376 static int ublk_ch_open(struct inode *inode, struct file *filp)
2377 {
2378 	struct ublk_device *ub = container_of(inode->i_cdev,
2379 			struct ublk_device, cdev);
2380 
2381 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2382 		return -EBUSY;
2383 	filp->private_data = ub;
2384 	ub->ublksrv_tgid = current->tgid;
2385 	return 0;
2386 }
2387 
2388 static void ublk_reset_ch_dev(struct ublk_device *ub)
2389 {
2390 	int i;
2391 
2392 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2393 		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2394 
2395 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2396 	ub->mm = NULL;
2397 	ub->nr_queue_ready = 0;
2398 	ub->unprivileged_daemons = false;
2399 	ub->ublksrv_tgid = -1;
2400 }
2401 
2402 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2403 {
2404 	struct gendisk *disk;
2405 
2406 	spin_lock(&ub->lock);
2407 	disk = ub->ub_disk;
2408 	if (disk)
2409 		get_device(disk_to_dev(disk));
2410 	spin_unlock(&ub->lock);
2411 
2412 	return disk;
2413 }
2414 
/* Drop a disk reference taken via ublk_get_disk(); NULL is a no-op. */
static void ublk_put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
2420 
/*
 * Deferred partition scan: clear GD_SUPPRESS_PART_SCAN and rescan the
 * disk under open_mutex.  Runs from workqueue context after the device
 * goes live.
 */
static void ublk_partition_scan_work(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, partition_scan_work);
	/* Hold disk reference to prevent UAF during concurrent teardown */
	struct gendisk *disk = ublk_get_disk(ub);

	if (!disk)
		return;

	/* the flag must still be set; this work only runs once per setup */
	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
					     &disk->state)))
		goto out;

	mutex_lock(&disk->open_mutex);
	bdev_disk_changed(disk, false);
	mutex_unlock(&disk->open_mutex);
out:
	ublk_put_disk(disk);
}
2441 
2442 /*
2443  * Use this function to ensure that ->canceling is consistently set for
2444  * the device and all queues. Do not set these flags directly.
2445  *
2446  * Caller must ensure that:
2447  * - cancel_mutex is held. This ensures that there is no concurrent
2448  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2449  * - there are no concurrent reads of ubq->canceling from the queue_rq
2450  *   path. This can be done by quiescing the queue, or through other
2451  *   means.
2452  */
2453 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2454 	__must_hold(&ub->cancel_mutex)
2455 {
2456 	int i;
2457 
2458 	ub->canceling = canceling;
2459 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2460 		ublk_get_queue(ub, i)->canceling = canceling;
2461 }
2462 
/*
 * Check whether any io still holds an active reference (e.g. a kernel
 * bvec buffer still registered in some io_uring).  Returns true if so;
 * otherwise resets every io's refcount and registered-buffer count to
 * zero and returns false.  Only meaningful for devices that use request
 * references.
 */
static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
{
	int i, j;

	if (!ublk_dev_need_req_ref(ub))
		return false;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
		struct ublk_queue *ubq = ublk_get_queue(ub, i);

		for (j = 0; j < ubq->q_depth; j++) {
			struct ublk_io *io = &ubq->ios[j];
			unsigned int refs = refcount_read(&io->ref) +
				io->task_registered_buffers;

			/*
			 * UBLK_REFCOUNT_INIT or zero means no active
			 * reference
			 */
			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
				return true;

			/* reset to zero if the io hasn't active references */
			refcount_set(&io->ref, 0);
			io->task_registered_buffers = 0;
		}
	}
	return false;
}
2492 
/*
 * Deferred char-device release: waits out active io references, aborts
 * everything outstanding to the dead server, then transitions the
 * device to its "no server" state (stopped, quiesced or fail-io
 * depending on recovery flags).
 */
static void ublk_ch_release_work_fn(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, exit_work.work);
	struct gendisk *disk;
	int i;

	/*
	 * For zero-copy and auto buffer register modes, I/O references
	 * might not be dropped naturally when the daemon is killed, but
	 * io_uring guarantees that registered bvec kernel buffers are
	 * unregistered finally when freeing io_uring context, then the
	 * active references are dropped.
	 *
	 * Wait until active references are dropped for avoiding use-after-free
	 *
	 * registered buffer may be unregistered in io_ring's release handler,
	 * so have to wait by scheduling work function for avoiding the two
	 * file release dependency.
	 */
	if (ublk_check_and_reset_active_ref(ub)) {
		schedule_delayed_work(&ub->exit_work, 1);
		return;
	}

	/*
	 * disk isn't attached yet, either device isn't live, or it has
	 * been removed already, so we needn't to do anything
	 */
	disk = ublk_get_disk(ub);
	if (!disk)
		goto out;

	/*
	 * All uring_cmd are done now, so abort any request outstanding to
	 * the ublk server
	 *
	 * This can be done in lockless way because ublk server has been
	 * gone
	 *
	 * More importantly, we have to provide forward progress guarantee
	 * without holding ub->mutex, otherwise control task grabbing
	 * ub->mutex triggers deadlock
	 *
	 * All requests may be inflight, so ->canceling may not be set, set
	 * it now.
	 */
	mutex_lock(&ub->cancel_mutex);
	ublk_set_canceling(ub, true);
	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_abort_queue(ub, ublk_get_queue(ub, i));
	mutex_unlock(&ub->cancel_mutex);
	blk_mq_kick_requeue_list(disk->queue);

	/*
	 * All inflight requests have been completed or requeued and any new
	 * request will be failed or requeued via `->canceling` now, so it is
	 * fine to grab ub->mutex now.
	 */
	mutex_lock(&ub->mutex);

	/* double check after grabbing lock */
	if (!ub->ub_disk)
		goto unlock;

	/*
	 * Transition the device to the nosrv state. What exactly this
	 * means depends on the recovery flags
	 */
	if (ublk_nosrv_should_stop_dev(ub)) {
		/*
		 * Allow any pending/future I/O to pass through quickly
		 * with an error. This is needed because del_gendisk
		 * waits for all pending I/O to complete
		 */
		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);

		ublk_stop_dev_unlocked(ub);
	} else {
		if (ublk_nosrv_dev_should_queue_io(ub)) {
			/* ->canceling is set and all requests are aborted */
			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
		} else {
			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
		}
	}
unlock:
	mutex_unlock(&ub->mutex);
	ublk_put_disk(disk);

	/* all uring_cmd has been done now, reset device & ubq */
	ublk_reset_ch_dev(ub);
out:
	clear_bit(UB_STATE_OPEN, &ub->state);

	/* put the reference grabbed in ublk_ch_release() */
	ublk_put_device(ub);
}
2594 
/*
 * Char device release: the heavy lifting is deferred to a work item
 * (see ublk_ch_release_work_fn()) so release can't deadlock against
 * io_uring's own file release path.
 */
static int ublk_ch_release(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = filp->private_data;

	/*
	 * Grab ublk device reference, so it won't be gone until we are
	 * really released from work function.
	 */
	ublk_get_device(ub);

	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
	schedule_delayed_work(&ub->exit_work, 0);
	return 0;
}
2609 
2610 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct ublk_device *ub = filp->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned max_sz = ublk_max_cmd_buf_size();
	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
	int q_id, ret = 0;

	/* Only one address space may map the cmd buffers (the first mapper) */
	spin_lock(&ub->lock);
	if (!ub->mm)
		ub->mm = current->mm;
	if (current->mm != ub->mm)
		ret = -EINVAL;
	spin_unlock(&ub->lock);

	if (ret)
		return ret;

	/* the daemon gets a read-only view of the iod array */
	if (vma->vm_flags & VM_WRITE)
		return -EPERM;

	/* offset must land inside the per-queue cmd buffer window */
	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
		return -EINVAL;

	/* queue id is encoded in the mmap offset */
	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
			__func__, q_id, current->pid, vma->vm_start,
			phys_off, (unsigned long)sz);

	if (sz != ublk_queue_cmd_buf_size(ub))
		return -EINVAL;

	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
2647 
/*
 * Fail one request whose server has gone: requeue it when the device is
 * configured to recover, otherwise complete it with -EIO.
 */
static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
		struct request *req)
{
	/* in per-io mode an ACTIVE io can't have an outstanding request */
	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
			io->flags & UBLK_IO_FLAG_ACTIVE);

	if (ublk_nosrv_should_reissue_outstanding(ub))
		blk_mq_requeue_request(req, false);
	else {
		io->res = -EIO;
		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
	}
}
2661 
2662 /*
2663  * Request tag may just be filled to event kfifo, not get chance to
2664  * dispatch, abort these requests too
2665  */
static void ublk_abort_batch_queue(struct ublk_device *ub,
				   struct ublk_queue *ubq)
{
	unsigned short tag;

	/* drain every tag still sitting in the event fifo */
	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
		struct request *req = blk_mq_tag_to_rq(
				ub->tag_set.tags[ubq->q_id], tag);

		/* a fifo'd tag must map to a started request */
		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
			__ublk_fail_req(ub, &ubq->ios[tag], req);
	}
}
2679 
2680 /*
2681  * Called from ublk char device release handler, when any uring_cmd is
2682  * done, meantime request queue is "quiesced" since all inflight requests
2683  * can't be completed because ublk server is dead.
2684  *
2685  * So no one can hold our request IO reference any more, simply ignore the
2686  * reference, and complete the request immediately
2687  */
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int i;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		/* only ios currently handed to the server have a request */
		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
			__ublk_fail_req(ub, io, io->req);
	}

	/* batch mode also queues tags in the event fifo; fail those too */
	if (ublk_support_batch_io(ubq))
		ublk_abort_batch_queue(ub, ubq);
}
2702 
/*
 * Begin device-wide cancel: set ->canceling on the device and all
 * queues while the request queue is quiesced, so queue_rq can never
 * observe a stale value and touch a completed uring_cmd.
 */
static void ublk_start_cancel(struct ublk_device *ub)
{
	struct gendisk *disk = ublk_get_disk(ub);

	/* Our disk has been dead */
	if (!disk)
		return;

	mutex_lock(&ub->cancel_mutex);
	/* someone already started cancel; nothing to do */
	if (ub->canceling)
		goto out;
	/*
	 * Now we are serialized with ublk_queue_rq()
	 *
	 * Make sure that ubq->canceling is set when queue is frozen,
	 * because ublk_queue_rq() has to rely on this flag for avoiding to
	 * touch completed uring_cmd
	 */
	blk_mq_quiesce_queue(disk->queue);
	ublk_set_canceling(ub, true);
	blk_mq_unquiesce_queue(disk->queue);
out:
	mutex_unlock(&ub->cancel_mutex);
	ublk_put_disk(disk);
}
2728 
/*
 * Complete the per-io uring_cmd of (@ubq, @tag) with UBLK_IO_RES_ABORT,
 * but only when the command is still owned by the driver (ACTIVE) and
 * no started request is attached to the tag.
 */
static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
		unsigned int issue_flags)
{
	struct ublk_io *io = &ubq->ios[tag];
	struct ublk_device *ub = ubq->dev;
	struct request *req;
	bool done;

	/* command already completed or owned by the server */
	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
		return;

	/*
	 * Don't try to cancel this command if the request is started for
	 * avoiding race between io_uring_cmd_done() and
	 * io_uring_cmd_complete_in_task().
	 *
	 * Either the started request will be aborted via __ublk_abort_rq(),
	 * then this uring_cmd is canceled next time, or it will be done in
	 * task work function ublk_dispatch_req() because io_uring guarantees
	 * that ublk_dispatch_req() is always called
	 */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
	if (req && blk_mq_request_started(req) && req->tag == tag)
		return;

	/* mark CANCELED exactly once under cancel_lock */
	spin_lock(&ubq->cancel_lock);
	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
	if (!done)
		io->flags |= UBLK_IO_FLAG_CANCELED;
	spin_unlock(&ubq->cancel_lock);

	if (!done)
		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
}
2763 
2764 /*
2765  * Cancel a batch fetch command if it hasn't been claimed by another path.
2766  *
2767  * An fcmd can only be cancelled if:
2768  * 1. It's not the active_fcmd (which is currently being processed)
2769  * 2. It's still on the list (!list_empty check) - once removed from the list,
2770  *    the fcmd is considered claimed and will be freed by whoever removed it
2771  *
2772  * Use list_del_init() so subsequent list_empty() checks work correctly.
2773  */
static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
				  struct ublk_batch_fetch_cmd *fcmd,
				  unsigned int issue_flags)
{
	bool done;

	/* claim the fcmd: must be idle and still linked on a list */
	spin_lock(&ubq->evts_lock);
	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
	if (done)
		list_del_init(&fcmd->node);
	spin_unlock(&ubq->evts_lock);

	/* we claimed it, so we complete and free it */
	if (done) {
		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
		ublk_batch_free_fcmd(fcmd);
	}
}
2791 
/*
 * Cancel all idle fetch commands of @ubq.  The active fcmd (if any) is
 * put back on fcmd_head so its current owner finishes it; the rest are
 * completed with UBLK_IO_RES_ABORT here.  force_abort is set under
 * evts_lock so future queuing sees it.
 */
static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
{
	struct ublk_batch_fetch_cmd *fcmd;
	LIST_HEAD(fcmd_list);

	spin_lock(&ubq->evts_lock);
	ubq->force_abort = true;
	list_splice_init(&ubq->fcmd_head, &fcmd_list);
	fcmd = READ_ONCE(ubq->active_fcmd);
	if (fcmd)
		list_move(&fcmd->node, &ubq->fcmd_head);
	spin_unlock(&ubq->evts_lock);

	while (!list_empty(&fcmd_list)) {
		fcmd = list_first_entry(&fcmd_list,
				struct ublk_batch_fetch_cmd, node);
		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
	}
}
2811 
2812 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2813 				 unsigned int issue_flags)
2814 {
2815 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2816 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2817 	struct ublk_queue *ubq = pdu->ubq;
2818 
2819 	ublk_start_cancel(ubq->dev);
2820 
2821 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2822 }
2823 
2824 /*
2825  * The ublk char device won't be closed when calling cancel fn, so both
2826  * ublk device and queue are guaranteed to be live
2827  *
2828  * Two-stage cancel:
2829  *
2830  * - make every active uring_cmd done in ->cancel_fn()
2831  *
2832  * - aborting inflight ublk IO requests in ublk char device release handler,
2833  *   which depends on 1st stage because device can only be closed iff all
2834  *   uring_cmd are done
2835  *
2836  * Do _not_ try to acquire ub->mutex before all inflight requests are
2837  * aborted, otherwise deadlock may be caused.
2838  */
static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
	struct ublk_queue *ubq = pdu->ubq;
	struct task_struct *task;
	struct ublk_io *io;

	/* sanity: the pdu must reference a valid queue and tag */
	if (WARN_ON_ONCE(!ubq))
		return;

	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
		return;

	/* the cancel must run from the io's own daemon task */
	task = io_uring_cmd_get_task(cmd);
	io = &ubq->ios[pdu->tag];
	if (WARN_ON_ONCE(task && task != io->task))
		return;

	/* stage 1: put the whole device into canceling mode */
	ublk_start_cancel(ubq->dev);

	WARN_ON_ONCE(io->cmd != cmd);
	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
}
2863 
2864 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2865 {
2866 	return ubq->nr_io_ready == ubq->q_depth;
2867 }
2868 
2869 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2870 {
2871 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2872 }
2873 
2874 static void ublk_cancel_queue(struct ublk_queue *ubq)
2875 {
2876 	int i;
2877 
2878 	if (ublk_support_batch_io(ubq)) {
2879 		ublk_batch_cancel_queue(ubq);
2880 		return;
2881 	}
2882 
2883 	for (i = 0; i < ubq->q_depth; i++)
2884 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2885 }
2886 
2887 /* Cancel all pending commands, must be called after del_gendisk() returns */
2888 static void ublk_cancel_dev(struct ublk_device *ub)
2889 {
2890 	int i;
2891 
2892 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2893 		ublk_cancel_queue(ublk_get_queue(ub, i));
2894 }
2895 
2896 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2897 {
2898 	bool *idle = data;
2899 
2900 	if (blk_mq_request_started(rq)) {
2901 		*idle = false;
2902 		return false;
2903 	}
2904 	return true;
2905 }
2906 
2907 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2908 {
2909 	bool idle;
2910 
2911 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2912 	while (true) {
2913 		idle = true;
2914 		blk_mq_tagset_busy_iter(&ub->tag_set,
2915 				ublk_check_inflight_rq, &idle);
2916 		if (idle)
2917 			break;
2918 		msleep(UBLK_REQUEUE_DELAY_MS);
2919 	}
2920 }
2921 
/*
 * Force all queues into abort mode so pending and future requests fail
 * fast instead of waiting for the (gone) server; lets del_gendisk()
 * make progress.
 */
static void ublk_force_abort_dev(struct ublk_device *ub)
{
	int i;

	pr_devel("%s: force abort ub: dev_id %d state %s\n",
			__func__, ub->dev_info.dev_id,
			ub->dev_info.state == UBLK_S_DEV_LIVE ?
			"LIVE" : "QUIESCED");
	blk_mq_quiesce_queue(ub->ub_disk->queue);
	/* wait for inflight requests to drain while quiesced */
	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
		ublk_wait_tagset_rqs_idle(ub);

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_get_queue(ub, i)->force_abort = true;
	blk_mq_unquiesce_queue(ub->ub_disk->queue);
	/* We may have requeued some rqs in ublk_quiesce_queue() */
	blk_mq_kick_requeue_list(ub->ub_disk->queue);
}
2940 
/*
 * Detach the gendisk under ub->lock and mark the device dead.  Returns
 * the detached disk; the caller owns the final put_disk().
 */
static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
{
	struct gendisk *disk;

	/* Sync with ublk_abort_queue() by holding the lock */
	spin_lock(&ub->lock);
	disk = ub->ub_disk;
	ub->dev_info.state = UBLK_S_DEV_DEAD;
	ub->dev_info.ublksrv_pid = -1;
	ub->ub_disk = NULL;
	spin_unlock(&ub->lock);

	return disk;
}
2955 
2956 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2957 	__must_hold(&ub->mutex)
2958 {
2959 	struct gendisk *disk;
2960 
2961 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2962 		return;
2963 
2964 	if (ublk_nosrv_dev_should_queue_io(ub))
2965 		ublk_force_abort_dev(ub);
2966 	del_gendisk(ub->ub_disk);
2967 	disk = ublk_detach_disk(ub);
2968 	put_disk(disk);
2969 }
2970 
/* Stop the disk, stop partition scanning, then cancel pending commands. */
static void ublk_stop_dev(struct ublk_device *ub)
{
	mutex_lock(&ub->mutex);
	ublk_stop_dev_unlocked(ub);
	mutex_unlock(&ub->mutex);
	cancel_work_sync(&ub->partition_scan_work);
	ublk_cancel_dev(ub);
}
2979 
/* Clear one io's CANCELED flag under cancel_lock (new command arrived). */
static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
{
	/* UBLK_IO_FLAG_CANCELED can be cleared now */
	spin_lock(&ubq->cancel_lock);
	io->flags &= ~UBLK_IO_FLAG_CANCELED;
	spin_unlock(&ubq->cancel_lock);
}
2987 
2988 /* reset per-queue io flags */
static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
{
	/* ->canceling is cleared under cancel_lock; ->fail_io outside it */
	spin_lock(&ubq->cancel_lock);
	ubq->canceling = false;
	spin_unlock(&ubq->cancel_lock);
	ubq->fail_io = false;
}
2996 
2997 /* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
	struct ublk_io *io)
	__must_hold(&ub->mutex)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	/* remember whether any daemon lacks CAP_SYS_ADMIN */
	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
		ub->unprivileged_daemons = true;

	ubq->nr_io_ready++;
	ublk_reset_io_flags(ubq, io);

	/* Check if this specific queue is now fully ready */
	if (ublk_queue_ready(ubq)) {
		ub->nr_queue_ready++;

		/*
		 * Reset queue flags as soon as this queue is ready.
		 * This clears the canceling flag, allowing batch FETCH commands
		 * to succeed during recovery without waiting for all queues.
		 */
		ublk_queue_reset_io_flags(ubq);
	}

	/* Check if all queues are ready */
	if (ublk_dev_ready(ub)) {
		/*
		 * All queues ready - clear device-level canceling flag
		 * and complete the recovery/initialization.
		 */
		mutex_lock(&ub->cancel_mutex);
		ub->canceling = false;
		mutex_unlock(&ub->cancel_mutex);
		complete_all(&ub->completion);
	}
}
3034 
3035 static inline int ublk_check_cmd_op(u32 cmd_op)
3036 {
3037 	u32 ioc_type = _IOC_TYPE(cmd_op);
3038 
3039 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
3040 		return -EOPNOTSUPP;
3041 
3042 	if (ioc_type != 'u' && ioc_type != 0)
3043 		return -EOPNOTSUPP;
3044 
3045 	return 0;
3046 }
3047 
3048 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
3049 {
3050 	struct ublk_auto_buf_reg buf;
3051 
3052 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
3053 
3054 	if (buf.reserved0 || buf.reserved1)
3055 		return -EINVAL;
3056 
3057 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
3058 		return -EINVAL;
3059 	io->buf.auto_reg = buf;
3060 	return 0;
3061 }
3062 
/*
 * Clear a previous auto buffer registration; report the buffer index to
 * unregister via *buf_idx only when this cmd shares the registering
 * io_ring ctx.
 */
static void ublk_clear_auto_buf_reg(struct ublk_io *io,
				    struct io_uring_cmd *cmd,
				    u16 *buf_idx)
{
	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;

		/*
		 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
		 * `io_ring_ctx`.
		 *
		 * If this uring_cmd's io_ring_ctx isn't same with the
		 * one for registering the buffer, it is ublk server's
		 * responsibility for unregistering the buffer, otherwise
		 * this ublk request gets stuck.
		 */
		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
			*buf_idx = io->buf.auto_reg.index;
	}
}
3084 
/*
 * Tear down the previous auto registration (possibly returning the old
 * buffer index in *buf_idx), then apply the new registration from @cmd.
 */
static int ublk_handle_auto_buf_reg(struct ublk_io *io,
				    struct io_uring_cmd *cmd,
				    u16 *buf_idx)
{
	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
	return ublk_set_auto_buf_reg(io, cmd);
}
3092 
/* Once we return, `io->req` can't be used any more */
static inline struct request *
ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
{
	/*
	 * Snapshot the request before storing io->cmd: storing the cmd can
	 * clobber io->req (see the comment in __ublk_check_and_get_req()).
	 */
	struct request *req = io->req;

	io->cmd = cmd;
	io->flags |= UBLK_IO_FLAG_ACTIVE;
	/* now this cmd slot is owned by ublk driver */
	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;

	return req;
}
3106 
3107 static inline int
3108 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3109 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3110 		   u16 *buf_idx)
3111 {
3112 	if (ublk_dev_support_auto_buf_reg(ub))
3113 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3114 
3115 	io->buf.addr = buf_addr;
3116 	return 0;
3117 }
3118 
/*
 * Stash queue/tag into the uring_cmd's pdu so the cancel path can locate
 * the io, then mark the command cancelable in io_uring.
 */
static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
				    unsigned int issue_flags,
				    struct ublk_queue *ubq, unsigned int tag)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

	/*
	 * Safe to refer to @ubq since ublk_queue won't be died until its
	 * commands are completed
	 */
	pdu->ubq = ubq;
	pdu->tag = tag;
	io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
3133 
/* Release callback passed to io_buffer_register_bvec() */
static void ublk_io_release(void *priv)
{
	struct request *rq = priv;
	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
	struct ublk_io *io = &ubq->ios[rq->tag];

	/*
	 * On the daemon task, prefer consuming the task-local registration
	 * count over the shared request refcount.
	 *
	 * task_registered_buffers may be 0 if buffers were registered off task
	 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
	 */
	if (current == io->task && io->task_registered_buffers)
		io->task_registered_buffers--;
	else
		ublk_put_req_ref(io, rq);
}
3149 
3150 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3151 				struct ublk_device *ub,
3152 				u16 q_id, u16 tag,
3153 				struct ublk_io *io,
3154 				unsigned int index, unsigned int issue_flags)
3155 {
3156 	struct request *req;
3157 	int ret;
3158 
3159 	if (!ublk_dev_support_zero_copy(ub))
3160 		return -EINVAL;
3161 
3162 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3163 	if (!req)
3164 		return -EINVAL;
3165 
3166 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3167 				      issue_flags);
3168 	if (ret) {
3169 		ublk_put_req_ref(io, req);
3170 		return ret;
3171 	}
3172 
3173 	return 0;
3174 }
3175 
/*
 * Register an io buffer from the daemon task. Uses the task-local
 * task_registered_buffers counter so ublk_io_release() can avoid the
 * shared request refcount when running on the same task.
 */
static int
ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
			    struct ublk_device *ub,
			    u16 q_id, u16 tag, struct ublk_io *io,
			    unsigned index, unsigned issue_flags)
{
	unsigned new_registered_buffers;
	struct request *req = io->req;
	int ret;

	/*
	 * Ensure there are still references for ublk_sub_req_ref() to release.
	 * If not, fall back on the thread-safe buffer registration.
	 */
	new_registered_buffers = io->task_registered_buffers + 1;
	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
					    issue_flags);

	if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
		return -EINVAL;

	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
				      issue_flags);
	if (ret)
		return ret;

	/* record the registration for the fast path in ublk_io_release() */
	io->task_registered_buffers = new_registered_buffers;
	return 0;
}
3206 
/*
 * Unregister a previously registered zero-copy buffer; @index selects the
 * io_uring buffer slot to drop.
 */
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
				  const struct ublk_device *ub,
				  unsigned int index, unsigned int issue_flags)
{
	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
		return -EINVAL;

	return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
3216 
3217 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3218 {
3219 	if (ublk_dev_need_map_io(ub)) {
3220 		/*
3221 		 * FETCH_RQ has to provide IO buffer if NEED GET
3222 		 * DATA is not enabled
3223 		 */
3224 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3225 			return -EINVAL;
3226 	} else if (buf_addr) {
3227 		/* User copy requires addr to be unset */
3228 		return -EINVAL;
3229 	}
3230 	return 0;
3231 }
3232 
/*
 * Core of FETCH handling: bind @cmd to the io slot and record the serving
 * task. Caller must hold the relevant lock (ub->mutex or io lock).
 */
static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
			struct ublk_io *io, u16 q_id)
{
	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
	if (ublk_dev_ready(ub))
		return -EBUSY;

	/* allow each command to be FETCHed at most once */
	if (io->flags & UBLK_IO_FLAG_ACTIVE)
		return -EINVAL;

	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);

	ublk_fill_io_cmd(io, cmd);

	/* in batch mode the io isn't pinned to one daemon task */
	if (ublk_dev_support_batch_io(ub))
		WRITE_ONCE(io->task, NULL);
	else
		WRITE_ONCE(io->task, get_task_struct(current));

	return 0;
}
3255 
/*
 * Handle UBLK_IO_FETCH_REQ: bind the uring_cmd to the io slot, set up its
 * buffer, and account the io as ready under ub->mutex.
 */
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
{
	int ret;

	/*
	 * When handling FETCH command for setting up ublk uring queue,
	 * ub->mutex is the innermost lock, and we won't block for handling
	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
	 */
	mutex_lock(&ub->mutex);
	ret = __ublk_fetch(cmd, ub, io, q_id);
	if (!ret)
		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
	if (!ret)
		ublk_mark_io_ready(ub, q_id, io);
	mutex_unlock(&ub->mutex);
	return ret;
}
3275 
3276 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3277 				       struct ublk_io *io, __u64 buf_addr)
3278 {
3279 	struct request *req = io->req;
3280 
3281 	if (ublk_dev_need_map_io(ub)) {
3282 		/*
3283 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3284 		 * NEED GET DATA is not enabled or it is Read IO.
3285 		 */
3286 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3287 					req_op(req) == REQ_OP_READ))
3288 			return -EINVAL;
3289 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3290 		/*
3291 		 * User copy requires addr to be unset when command is
3292 		 * not zone append
3293 		 */
3294 		return -EINVAL;
3295 	}
3296 
3297 	return 0;
3298 }
3299 
3300 static bool ublk_need_complete_req(const struct ublk_device *ub,
3301 				   struct ublk_io *io)
3302 {
3303 	if (ublk_dev_need_req_ref(ub))
3304 		return ublk_sub_req_ref(io);
3305 	return true;
3306 }
3307 
/*
 * Handle UBLK_IO_NEED_GET_DATA: pick up the (possibly new) buffer address
 * supplied by the ublk server and restart the io. Returns the result of
 * ublk_start_io().
 */
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
			  struct request *req)
{
	/*
	 * We have handled UBLK_IO_NEED_GET_DATA command,
	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
	 * do the copy work.
	 */
	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
	/* update iod->addr because ublksrv may have passed a new io buffer */
	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
			__func__, ubq->q_id, req->tag, io->flags,
			ublk_get_iod(ubq, req->tag)->addr);

	return ublk_start_io(ubq, req, io);
}
3325 
/*
 * Main dispatch for per-io uring_cmds issued on the ublk char device.
 * Validates queue/tag/task ownership, then handles FETCH, buffer
 * (un)registration, COMMIT_AND_FETCH and NEED_GET_DATA commands.
 * Returns -EIOCBQUEUED when the uring_cmd stays queued for later
 * completion.
 */
static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	/* May point to userspace-mapped memory */
	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
							       struct ublksrv_io_cmd);
	u16 buf_idx = UBLK_INVALID_BUF_IDX;
	struct ublk_device *ub = cmd->file->private_data;
	struct ublk_queue *ubq;
	struct ublk_io *io = NULL;
	u32 cmd_op = cmd->cmd_op;
	u16 q_id = READ_ONCE(ub_src->q_id);
	u16 tag = READ_ONCE(ub_src->tag);
	s32 result = READ_ONCE(ub_src->result);
	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
	struct request *req;
	int ret;
	bool compl;

	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);

	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
			__func__, cmd->cmd_op, q_id, tag, result);

	ret = ublk_check_cmd_op(cmd_op);
	if (ret)
		goto out;

	/*
	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
	 * so no need to validate the q_id, tag, or task
	 */
	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);

	ret = -EINVAL;
	if (q_id >= ub->dev_info.nr_hw_queues)
		goto out;

	ubq = ublk_get_queue(ub, q_id);

	if (tag >= ub->dev_info.queue_depth)
		goto out;

	io = &ubq->ios[tag];
	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
		ret = ublk_check_fetch_buf(ub, addr);
		if (ret)
			goto out;
		ret = ublk_fetch(cmd, ub, io, addr, q_id);
		if (ret)
			goto out;

		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
		return -EIOCBQUEUED;
	}

	/* all remaining commands except buffer registration are task-bound */
	if (READ_ONCE(io->task) != current) {
		/*
		 * ublk_register_io_buf() accesses only the io's refcount,
		 * so can be handled on any task
		 */
		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
						    addr, issue_flags);

		goto out;
	}

	/* there is pending io cmd, something must be wrong */
	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
		ret = -EBUSY;
		goto out;
	}

	/*
	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
	 */
	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
		goto out;

	switch (_IOC_NR(cmd_op)) {
	case UBLK_IO_REGISTER_IO_BUF:
		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
						   issue_flags);
	case UBLK_IO_COMMIT_AND_FETCH_REQ:
		ret = ublk_check_commit_and_fetch(ub, io, addr);
		if (ret)
			goto out;
		io->res = result;
		req = ublk_fill_io_cmd(io, cmd);
		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
		/* drop the auto-registered buffer from the old cycle, if any */
		if (buf_idx != UBLK_INVALID_BUF_IDX)
			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
		compl = ublk_need_complete_req(ub, io);

		/* can't touch 'ublk_io' any more */
		if (req_op(req) == REQ_OP_ZONE_APPEND)
			req->__sector = addr;
		if (compl)
			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);

		if (ret)
			goto out;
		break;
	case UBLK_IO_NEED_GET_DATA:
		/*
		 * ublk_get_data() may fail and fallback to requeue, so keep
		 * uring_cmd active first and prepare for handling new requeued
		 * request
		 */
		req = ublk_fill_io_cmd(io, cmd);
		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
		WARN_ON_ONCE(ret);
		if (likely(ublk_get_data(ubq, io, req))) {
			__ublk_prep_compl_io_cmd(io, req);
			return UBLK_IO_RES_OK;
		}
		break;
	default:
		goto out;
	}
	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
	return -EIOCBQUEUED;

 out:
	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
			__func__, cmd_op, tag, ret, io ? io->flags : 0);
	return ret;
}
3458 
/*
 * Look up the request for (q_id, tag) and take a reference on @io.
 * Returns the request with a reference held, or NULL if the request is
 * not started, has no data, or the reference can't be taken.
 */
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		u16 q_id, u16 tag, struct ublk_io *io)
{
	struct request *req;

	/*
	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
	 * which would overwrite it with io->cmd
	 */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
	if (!req)
		return NULL;

	if (!ublk_get_req_ref(io))
		return NULL;

	/* validate only after holding the ref so the request can't go away */
	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
		goto fail_put;

	if (!ublk_rq_has_data(req))
		goto fail_put;

	return req;
fail_put:
	ublk_put_req_ref(io, req);
	return NULL;
}
3486 
3487 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3488 {
3489 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3490 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3491 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3492 
3493 	if (ret != -EIOCBQUEUED)
3494 		io_uring_cmd_done(cmd, ret, issue_flags);
3495 }
3496 
/* ->uring_cmd() handler for the per-io command interface */
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
		return 0;
	}

	/* well-implemented server won't run into unlocked */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		/* punt to task work where the ring lock is held */
		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
		return -EIOCBQUEUED;
	}

	return ublk_ch_uring_cmd_local(cmd, issue_flags);
}
3512 
3513 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3514 					const struct ublk_elem_header *elem)
3515 {
3516 	const void *buf = elem;
3517 
3518 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3519 		return *(const __u64 *)(buf + sizeof(*elem));
3520 	return 0;
3521 }
3522 
3523 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3524 					const struct ublk_elem_header *elem)
3525 {
3526 	const void *buf = elem;
3527 
3528 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3529 		return *(const __u64 *)(buf + sizeof(*elem) +
3530 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3531 	return -1;
3532 }
3533 
3534 static struct ublk_auto_buf_reg
3535 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3536 			const struct ublk_elem_header *elem)
3537 {
3538 	struct ublk_auto_buf_reg reg = {
3539 		.index = elem->buf_index,
3540 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3541 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3542 	};
3543 
3544 	return reg;
3545 }
3546 
3547 /*
3548  * 48 can hold any type of buffer element(8, 16 and 24 bytes) because
3549  * it is the least common multiple(LCM) of 8, 16 and 24
3550  */
3551 #define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
3552 struct ublk_batch_io_iter {
3553 	void __user *uaddr;
3554 	unsigned done, total;
3555 	unsigned char elem_bytes;
3556 	/* copy to this buffer from user space */
3557 	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3558 };
3559 
/*
 * Walk @bytes worth of elements in iter->buf, invoking @cb on each after
 * validating its tag. iter->done is advanced by the bytes actually
 * consumed, even on failure, so callers can revert partial progress.
 */
static inline int
__ublk_walk_cmd_buf(struct ublk_queue *ubq,
		    struct ublk_batch_io_iter *iter,
		    const struct ublk_batch_io_data *data,
		    unsigned bytes,
		    int (*cb)(struct ublk_queue *q,
			    const struct ublk_batch_io_data *data,
			    const struct ublk_elem_header *elem))
{
	unsigned int i;
	int ret = 0;

	for (i = 0; i < bytes; i += iter->elem_bytes) {
		const struct ublk_elem_header *elem =
			(const struct ublk_elem_header *)&iter->buf[i];

		/* tag must index a valid io slot */
		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
			ret = -EINVAL;
			break;
		}

		ret = cb(ubq, data, elem);
		if (unlikely(ret))
			break;
	}

	iter->done += i;
	return ret;
}
3589 
/*
 * Copy the batch command buffer from userspace in chunks and apply @cb to
 * every element. Stops at the first failure; iter->done then reflects how
 * far processing got.
 */
static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
			     const struct ublk_batch_io_data *data,
			     int (*cb)(struct ublk_queue *q,
				     const struct ublk_batch_io_data *data,
				     const struct ublk_elem_header *elem))
{
	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
	int ret = 0;

	while (iter->done < iter->total) {
		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);

		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
			pr_warn("ublk%d: read batch cmd buffer failed\n",
					data->ub->dev_info.dev_id);
			return -EFAULT;
		}

		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
		if (ret)
			return ret;
	}
	return 0;
}
3614 
/*
 * Undo one element of a failed UBLK_U_IO_PREP_IO_CMDS walk: drop the
 * ready accounting and reset the io's flags. Called with ub->mutex held.
 */
static int ublk_batch_unprep_io(struct ublk_queue *ubq,
				const struct ublk_batch_io_data *data,
				const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];

	/*
	 * If queue was ready before this decrement, it won't be anymore,
	 * so we need to decrement the queue ready count and restore the
	 * canceling flag to prevent new requests from being queued.
	 */
	if (ublk_queue_ready(ubq)) {
		data->ub->nr_queue_ready--;
		spin_lock(&ubq->cancel_lock);
		ubq->canceling = true;
		spin_unlock(&ubq->cancel_lock);
	}
	ubq->nr_io_ready--;

	ublk_io_lock(io);
	io->flags = 0;
	ublk_io_unlock(io);
	return 0;
}
3639 
/*
 * Roll back a partially-applied prep command by walking the already
 * processed elements again with ublk_batch_unprep_io().
 */
static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
				       const struct ublk_batch_io_data *data)
{
	int ret;

	/* Re-process only what we've already processed, starting from beginning */
	iter->total = iter->done;
	iter->done = 0;

	/* the unprep callback can't fail, so any error here is a bug */
	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
	WARN_ON_ONCE(ret);
}
3652 
/*
 * Prepare one io slot from a UBLK_U_IO_PREP_IO_CMDS element: pick up the
 * buffer description, bind the command via __ublk_fetch() and mark the io
 * ready. Called with ub->mutex held.
 */
static int ublk_batch_prep_io(struct ublk_queue *ubq,
			      const struct ublk_batch_io_data *data,
			      const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];
	const struct ublk_batch_io *uc = &data->header;
	union ublk_io_buf buf = { 0 };
	int ret;

	if (ublk_dev_support_auto_buf_reg(data->ub))
		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
	else if (ublk_dev_need_map_io(data->ub)) {
		buf.addr = ublk_batch_buf_addr(uc, elem);

		ret = ublk_check_fetch_buf(data->ub, buf.addr);
		if (ret)
			return ret;
	}

	ublk_io_lock(io);
	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
	if (!ret)
		io->buf = buf;
	ublk_io_unlock(io);

	/* ready accounting is done outside the io lock, under ub->mutex */
	if (!ret)
		ublk_mark_io_ready(data->ub, ubq->q_id, io);

	return ret;
}
3683 
/*
 * Handle UBLK_U_IO_PREP_IO_CMDS: walk the user element buffer and prepare
 * each io; on failure revert any partially prepared elements so the
 * command is all-or-nothing.
 */
static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
{
	const struct ublk_batch_io *uc = &data->header;
	struct io_uring_cmd *cmd = data->cmd;
	struct ublk_batch_io_iter iter = {
		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
		.total = uc->nr_elem * uc->elem_bytes,
		.elem_bytes = uc->elem_bytes,
	};
	int ret;

	mutex_lock(&data->ub->mutex);
	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);

	if (ret && iter.done)
		ublk_batch_revert_prep_cmd(&iter, data);
	mutex_unlock(&data->ub->mutex);
	return ret;
}
3703 
3704 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3705 				      struct ublk_io *io,
3706 				      union ublk_io_buf *buf)
3707 {
3708 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3709 		return -EBUSY;
3710 
3711 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3712 	if (ublk_need_map_io(ubq) && !buf->addr)
3713 		return -EINVAL;
3714 	return 0;
3715 }
3716 
/*
 * Commit one io from a UBLK_U_IO_COMMIT_IO_CMDS element: record the
 * result and buffer, rebind the command, release any auto-registered
 * buffer, and complete the request when the last reference drops.
 */
static int ublk_batch_commit_io(struct ublk_queue *ubq,
				const struct ublk_batch_io_data *data,
				const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];
	const struct ublk_batch_io *uc = &data->header;
	u16 buf_idx = UBLK_INVALID_BUF_IDX;
	union ublk_io_buf buf = { 0 };
	struct request *req = NULL;
	bool auto_reg = false;
	bool compl = false;
	int ret;

	if (ublk_dev_support_auto_buf_reg(data->ub)) {
		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
		auto_reg = true;
	} else if (ublk_dev_need_map_io(data->ub))
		buf.addr = ublk_batch_buf_addr(uc, elem);

	/* state transitions happen under the io lock */
	ublk_io_lock(io);
	ret = ublk_batch_commit_io_check(ubq, io, &buf);
	if (!ret) {
		io->res = elem->result;
		io->buf = buf;
		req = ublk_fill_io_cmd(io, data->cmd);

		if (auto_reg)
			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
		compl = ublk_need_complete_req(data->ub, io);
	}
	ublk_io_unlock(io);

	if (unlikely(ret)) {
		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
			__func__, data->ub->dev_info.dev_id, ubq->q_id,
			elem->tag, ret);
		return ret;
	}

	/* unregister/complete outside the io lock */
	if (buf_idx != UBLK_INVALID_BUF_IDX)
		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
	if (req_op(req) == REQ_OP_ZONE_APPEND)
		req->__sector = ublk_batch_zone_lba(uc, elem);
	if (compl)
		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
	return 0;
}
3764 
/*
 * Handle UBLK_U_IO_COMMIT_IO_CMDS: commit each element, batching request
 * completions via an io_comp_batch. Returns the number of bytes processed
 * if any element was handled, otherwise the error code.
 */
static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
{
	const struct ublk_batch_io *uc = &data->header;
	struct io_uring_cmd *cmd = data->cmd;
	struct ublk_batch_io_iter iter = {
		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
		.total = uc->nr_elem * uc->elem_bytes,
		.elem_bytes = uc->elem_bytes,
	};
	DEFINE_IO_COMP_BATCH(iob);
	int ret;

	data->iob = &iob;
	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);

	/* flush any completions accumulated in the batch */
	if (iob.complete)
		iob.complete(&iob);

	return iter.done == 0 ? ret : iter.done;
}
3785 
static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
{
	/* size of the fixed per-element header, before optional u64 fields */
	unsigned elem_bytes = sizeof(struct ublk_elem_header);

	if (uc->flags & ~UBLK_BATCH_F_ALL)
		return -EINVAL;

	/*
	 * UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index, so it
	 * can't be combined with UBLK_BATCH_F_HAS_BUF_ADDR, which carries a
	 * buffer address instead
	 */
	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
		return -EINVAL;

	/* each optional field (zone LBA, buffer address) adds one u64 */
	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
	if (uc->elem_bytes != elem_bytes)
		return -EINVAL;
	return 0;
}
3804 
/* Validate a prep/commit batch command header against device capabilities */
static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
{
	const struct ublk_batch_io *uc = &data->header;

	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
		return -EINVAL;

	/* a batch can't carry more elements than the queue has io slots */
	if (uc->nr_elem > data->ub->dev_info.queue_depth)
		return -E2BIG;

	/* zone LBAs only make sense on zoned devices */
	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
			!ublk_dev_is_zoned(data->ub))
		return -EINVAL;

	/* buffer addresses only make sense in map-io mode */
	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
			!ublk_dev_need_map_io(data->ub))
		return -EINVAL;

	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
			!ublk_dev_support_auto_buf_reg(data->ub))
		return -EINVAL;

	return ublk_check_batch_cmd_flags(uc);
}
3829 
/*
 * Attach a fetch command to the queue's fcmd list and, if another fetch
 * command became acquirable, kick off batch dispatch. Returns
 * -EIOCBQUEUED on success (the cmd stays queued) or -ENODEV if the queue
 * is being aborted/canceled.
 */
static int ublk_batch_attach(struct ublk_queue *ubq,
			     struct ublk_batch_io_data *data,
			     struct ublk_batch_fetch_cmd *fcmd)
{
	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
	bool free = false;
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);

	spin_lock(&ubq->evts_lock);
	if (unlikely(ubq->force_abort || ubq->canceling)) {
		free = true;
	} else {
		list_add_tail(&fcmd->node, &ubq->fcmd_head);
		new_fcmd = __ublk_acquire_fcmd(ubq);
	}
	spin_unlock(&ubq->evts_lock);

	if (unlikely(free)) {
		ublk_batch_free_fcmd(fcmd);
		return -ENODEV;
	}

	pdu->ubq = ubq;
	pdu->fcmd = fcmd;
	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);

	if (!new_fcmd)
		goto out;

	/*
	 * If the two fetch commands are originated from same io_ring_ctx,
	 * run batch dispatch directly. Otherwise, schedule task work for
	 * doing it.
	 */
	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
			io_uring_cmd_ctx_handle(fcmd->cmd)) {
		data->cmd = new_fcmd->cmd;
		ublk_batch_dispatch(ubq, data, new_fcmd);
	} else {
		io_uring_cmd_complete_in_task(new_fcmd->cmd,
				ublk_batch_tw_cb);
	}
out:
	return -EIOCBQUEUED;
}
3875 
3876 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3877 {
3878 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3879 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3880 
3881 	if (!fcmd)
3882 		return -ENOMEM;
3883 
3884 	return ublk_batch_attach(ubq, data, fcmd);
3885 }
3886 
3887 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3888 {
3889 	const struct ublk_batch_io *uc = &data->header;
3890 
3891 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3892 		return -EINVAL;
3893 
3894 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3895 		return -EINVAL;
3896 
3897 	if (uc->elem_bytes != sizeof(__u16))
3898 		return -EINVAL;
3899 
3900 	if (uc->flags != 0)
3901 		return -EINVAL;
3902 
3903 	return 0;
3904 }
3905 
/*
 * Handle the non-batch commands still allowed on the batch char device:
 * zero-copy buffer registration and unregistration.
 */
static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
				     unsigned int issue_flags)
{
	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
							       struct ublksrv_io_cmd);
	struct ublk_device *ub = cmd->file->private_data;
	unsigned tag = READ_ONCE(ub_cmd->tag);
	unsigned q_id = READ_ONCE(ub_cmd->q_id);
	unsigned index = READ_ONCE(ub_cmd->addr);
	struct ublk_queue *ubq;
	struct ublk_io *io;

	/* unregister doesn't touch the queue or io, so no validation needed */
	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);

	if (q_id >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (tag >= ub->dev_info.queue_depth)
		return -EINVAL;

	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
		return -EOPNOTSUPP;

	ubq = ublk_get_queue(ub, q_id);
	io = &ubq->ios[tag];
	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
			issue_flags);
}
3935 
/*
 * ->uring_cmd() handler for the batch-io char device: validates the batch
 * header copied from the SQE, then dispatches prep/commit/fetch commands.
 */
static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
				       unsigned int issue_flags)
{
	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
							  struct ublk_batch_io);
	struct ublk_device *ub = cmd->file->private_data;
	/* snapshot the header: the SQE may be in userspace-mapped memory */
	struct ublk_batch_io_data data = {
		.ub  = ub,
		.cmd = cmd,
		.header = (struct ublk_batch_io) {
			.q_id = READ_ONCE(uc->q_id),
			.flags = READ_ONCE(uc->flags),
			.nr_elem = READ_ONCE(uc->nr_elem),
			.elem_bytes = READ_ONCE(uc->elem_bytes),
		},
		.issue_flags = issue_flags,
	};
	u32 cmd_op = cmd->cmd_op;
	int ret = -EINVAL;

	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
		ublk_batch_cancel_fn(cmd, issue_flags);
		return 0;
	}

	switch (cmd_op) {
	case UBLK_U_IO_PREP_IO_CMDS:
		ret = ublk_check_batch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_prep_cmd(&data);
		break;
	case UBLK_U_IO_COMMIT_IO_CMDS:
		ret = ublk_check_batch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_commit_cmd(&data);
		break;
	case UBLK_U_IO_FETCH_IO_CMDS:
		ret = ublk_validate_batch_fetch_cmd(&data);
		if (ret)
			goto out;
		ret = ublk_handle_batch_fetch_cmd(&data);
		break;
	default:
		/* fall back to buffer (un)registration commands */
		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
		break;
	}
out:
	return ret;
}
3987 
3988 static inline bool ublk_check_ubuf_dir(const struct request *req,
3989 		int ubuf_dir)
3990 {
3991 	/* copy ubuf to request pages */
3992 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
3993 	    ubuf_dir == ITER_SOURCE)
3994 		return true;
3995 
3996 	/* copy request pages to ubuf */
3997 	if ((req_op(req) == REQ_OP_WRITE ||
3998 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
3999 	    ubuf_dir == ITER_DEST)
4000 		return true;
4001 
4002 	return false;
4003 }
4004 
/*
 * Implement the user-copy data path: the ublk server reads/writes request
 * (or integrity) payload through the char device, with queue/tag/offset
 * and the integrity flag encoded in the file position.
 */
static ssize_t
ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
{
	struct ublk_device *ub = iocb->ki_filp->private_data;
	struct ublk_queue *ubq;
	struct request *req;
	struct ublk_io *io;
	unsigned data_len;
	bool is_integrity;
	bool on_daemon;
	size_t buf_off;
	u16 tag, q_id;
	ssize_t ret;

	if (!user_backed_iter(iter))
		return -EACCES;

	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		return -EACCES;

	/* decode request coordinates from the file position */
	tag = ublk_pos_to_tag(iocb->ki_pos);
	q_id = ublk_pos_to_hwq(iocb->ki_pos);
	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);

	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
		return -EINVAL;

	if (q_id >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	ubq = ublk_get_queue(ub, q_id);
	if (!ublk_dev_support_user_copy(ub))
		return -EACCES;

	if (tag >= ub->dev_info.queue_depth)
		return -EINVAL;

	io = &ubq->ios[tag];
	on_daemon = current == READ_ONCE(io->task);
	if (on_daemon) {
		/* On daemon, io can't be completed concurrently, so skip ref */
		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			return -EINVAL;

		req = io->req;
		if (!ublk_rq_has_data(req))
			return -EINVAL;
	} else {
		/* off daemon: pin the request with a reference */
		req = __ublk_check_and_get_req(ub, q_id, tag, io);
		if (!req)
			return -EINVAL;
	}

	if (is_integrity) {
		struct blk_integrity *bi = &req->q->limits.integrity;

		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
	} else {
		data_len = blk_rq_bytes(req);
	}
	if (buf_off > data_len) {
		ret = -EINVAL;
		goto out;
	}

	/* direction must match the request's data flow */
	if (!ublk_check_ubuf_dir(req, dir)) {
		ret = -EACCES;
		goto out;
	}

	if (is_integrity)
		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
	else
		ret = ublk_copy_user_pages(req, buf_off, iter, dir);

out:
	if (!on_daemon)
		ublk_put_req_ref(io, req);
	return ret;
}
4086 
/* ->read_iter(): copy request pages out to the ublk server's buffer */
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	return ublk_user_copy(iocb, to, ITER_DEST);
}
4091 
/* ->write_iter(): copy the ublk server's buffer into request pages */
static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return ublk_user_copy(iocb, from, ITER_SOURCE);
}
4096 
/* char device ops for the per-io command interface */
static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};
4106 
/* char device ops for the batch-io command interface */
static const struct file_operations ublk_ch_batch_io_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.read_iter = ublk_ch_read_iter,
	.write_iter = ublk_ch_write_iter,
	.uring_cmd = ublk_ch_batch_io_uring_cmd,
	.mmap = ublk_ch_mmap,
};
4116 
/* Free a queue's per-io task refs, command buffer and event state */
static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int size, i;

	size = ublk_queue_cmd_buf_size(ub);

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];
		if (io->task)
			put_task_struct(io->task);
		/* no references or registered buffers may outlive the queue */
		WARN_ON_ONCE(refcount_read(&io->ref));
		WARN_ON_ONCE(io->task_registered_buffers);
	}

	if (ubq->io_cmd_buf)
		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));

	if (ublk_dev_support_batch_io(ub))
		ublk_io_evts_deinit(ubq);

	kvfree(ubq);
}
4139 
4140 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4141 {
4142 	struct ublk_queue *ubq = ub->queues[q_id];
4143 
4144 	if (!ubq)
4145 		return;
4146 
4147 	__ublk_deinit_queue(ub, ubq);
4148 	ub->queues[q_id] = NULL;
4149 }
4150 
4151 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4152 {
4153 	unsigned int cpu;
4154 
4155 	/* Find first CPU mapped to this queue */
4156 	for_each_possible_cpu(cpu) {
4157 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4158 			return cpu_to_node(cpu);
4159 	}
4160 
4161 	return NUMA_NO_NODE;
4162 }
4163 
/*
 * Allocate and initialise one queue: the struct ublk_queue (with its
 * trailing per-tag ios[] array), the io command buffer, and — for
 * UBLK_F_BATCH_IO devices — the io event state.  Memory is placed on
 * the NUMA node serving the queue's CPUs.
 *
 * Returns 0 on success, negative errno on failure; nothing is left
 * allocated on failure.
 */
static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
	int depth = ub->dev_info.queue_depth;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
	struct ublk_queue *ubq;
	struct page *page;
	int numa_node;
	int size, i, ret;

	/* Determine NUMA node based on queue's CPU affinity */
	numa_node = ublk_get_queue_numa_node(ub, q_id);

	/* Allocate queue structure on local NUMA node */
	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
			    numa_node);
	if (!ubq)
		return -ENOMEM;

	spin_lock_init(&ubq->cancel_lock);
	ubq->flags = ub->dev_info.flags;
	ubq->q_id = q_id;
	ubq->q_depth = depth;
	size = ublk_queue_cmd_buf_size(ub);

	/* Allocate I/O command buffer on local NUMA node */
	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
	if (!page) {
		/* free directly: nothing else has been set up yet */
		kvfree(ubq);
		return -ENOMEM;
	}
	ubq->io_cmd_buf = page_address(page);

	for (i = 0; i < ubq->q_depth; i++)
		spin_lock_init(&ubq->ios[i].lock);

	if (ublk_dev_support_batch_io(ub)) {
		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
		if (ret)
			goto fail;
		INIT_LIST_HEAD(&ubq->fcmd_head);
	}
	ub->queues[q_id] = ubq;
	ubq->dev = ub;

	return 0;
fail:
	/* releases the command buffer and ubq itself */
	__ublk_deinit_queue(ub, ubq);
	return ret;
}
4213 
4214 static void ublk_deinit_queues(struct ublk_device *ub)
4215 {
4216 	int i;
4217 
4218 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4219 		ublk_deinit_queue(ub, i);
4220 }
4221 
4222 static int ublk_init_queues(struct ublk_device *ub)
4223 {
4224 	int i, ret;
4225 
4226 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4227 		ret = ublk_init_queue(ub, i);
4228 		if (ret)
4229 			goto fail;
4230 	}
4231 
4232 	init_completion(&ub->completion);
4233 	return 0;
4234 
4235  fail:
4236 	ublk_deinit_queues(ub);
4237 	return ret;
4238 }
4239 
4240 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4241 {
4242 	int i = idx;
4243 	int err;
4244 
4245 	spin_lock(&ublk_idr_lock);
4246 	/* allocate id, if @id >= 0, we're requesting that specific id */
4247 	if (i >= 0) {
4248 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4249 		if (err == -ENOSPC)
4250 			err = -EEXIST;
4251 	} else {
4252 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4253 				GFP_NOWAIT);
4254 	}
4255 	spin_unlock(&ublk_idr_lock);
4256 
4257 	if (err >= 0)
4258 		ub->ub_number = err;
4259 
4260 	return err;
4261 }
4262 
/*
 * Return the device index to the idr and wake waiters (e.g. DEL_DEV
 * callers in ublk_ctrl_del_dev() waiting for the index to be reusable).
 */
static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}
4270 
/*
 * Final release callback of the char device's struct device: runs when
 * the last reference is dropped and frees everything the device owns.
 * The teardown order mirrors setup in reverse (buffers, tag set,
 * queues, index, locks, then the structure itself).
 */
static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	ublk_buf_cleanup(ub);
	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
}
4283 
/*
 * Register the per-device char device /dev/ublkc<N> so the ublk server
 * can attach.  On failure, put_device() triggers ublk_cdev_rel(), which
 * frees the whole device — callers must not clean up again.
 */
static int ublk_add_chdev(struct ublk_device *ub)
{
	struct device *dev = &ub->cdev_dev;
	int minor = ub->ub_number;
	int ret;

	dev->parent = ublk_misc.this_device;
	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
	dev->class = &ublk_chr_class;
	dev->release = ublk_cdev_rel;
	device_initialize(dev);

	ret = dev_set_name(dev, "ublkc%d", minor);
	if (ret)
		goto fail;

	/* batch-io devices get the batch-aware uring_cmd handler */
	if (ublk_dev_support_batch_io(ub))
		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
	else
		cdev_init(&ub->cdev, &ublk_ch_fops);
	ret = cdev_device_add(&ub->cdev, dev);
	if (ret)
		goto fail;

	/* accounting for the unprivileged-device limit; under ublk_ctl_mutex */
	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
		unprivileged_ublks_added++;
	return 0;
 fail:
	put_device(dev);
	return ret;
}
4315 
4316 /* align max io buffer size with PAGE_SIZE */
4317 static void ublk_align_max_io_size(struct ublk_device *ub)
4318 {
4319 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4320 
4321 	ub->dev_info.max_io_buf_bytes =
4322 		round_down(max_io_bytes, PAGE_SIZE);
4323 }
4324 
4325 static int ublk_add_tag_set(struct ublk_device *ub)
4326 {
4327 	if (ublk_dev_support_batch_io(ub))
4328 		ub->tag_set.ops = &ublk_batch_mq_ops;
4329 	else
4330 		ub->tag_set.ops = &ublk_mq_ops;
4331 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4332 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4333 	ub->tag_set.numa_node = NUMA_NO_NODE;
4334 	ub->tag_set.driver_data = ub;
4335 	return blk_mq_alloc_tag_set(&ub->tag_set);
4336 }
4337 
/*
 * Stop the device and remove its char device.  The unprivileged flag
 * is sampled before ublk_put_device() because the put may free @ub.
 * Counter update relies on the caller holding ublk_ctl_mutex.
 */
static void ublk_remove(struct ublk_device *ub)
{
	bool unprivileged;

	ublk_stop_dev(ub);
	cdev_device_del(&ub->cdev, &ub->cdev_dev);
	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
	ublk_put_device(ub);

	if (unprivileged)
		unprivileged_ublks_added--;
}
4350 
4351 static struct ublk_device *ublk_get_device_from_id(int idx)
4352 {
4353 	struct ublk_device *ub = NULL;
4354 
4355 	if (idx < 0)
4356 		return NULL;
4357 
4358 	spin_lock(&ublk_idr_lock);
4359 	ub = idr_find(&ublk_index_idr, idx);
4360 	if (ub)
4361 		ub = ublk_get_device(ub);
4362 	spin_unlock(&ublk_idr_lock);
4363 
4364 	return ub;
4365 }
4366 
/*
 * Check that @ublksrv_pid (a pid in the caller's pid namespace)
 * resolves to the recorded server tgid.  find_vpid() requires RCU
 * read-side protection.
 */
static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
{
	rcu_read_lock();
	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
	rcu_read_unlock();

	return ub->ublksrv_tgid == ublksrv_pid;
}
4375 
/*
 * UBLK_CMD_START_DEV: build queue limits from the validated params,
 * allocate the gendisk and bring the device live.  header->data[0]
 * carries the server pid in the caller's pid namespace.  Blocks until
 * ub->completion fires (completed elsewhere once the server has set up
 * all queues — see the ublk_dev_ready() re-check below).
 */
static int ublk_ctrl_start_dev(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	const struct ublk_param_basic *p = &ub->params.basic;
	int ublksrv_pid = (int)header->data[0];
	/* translate basic params into block-layer queue limits */
	struct queue_limits lim = {
		.logical_block_size	= 1 << p->logical_bs_shift,
		.physical_block_size	= 1 << p->physical_bs_shift,
		.io_min			= 1 << p->io_min_shift,
		.io_opt			= 1 << p->io_opt_shift,
		.max_hw_sectors		= p->max_sectors,
		.chunk_sectors		= p->chunk_sectors,
		.virt_boundary_mask	= p->virt_boundary_mask,
		.max_segments		= USHRT_MAX,
		.max_segment_size	= UINT_MAX,
		.dma_alignment		= 3,
	};
	struct gendisk *disk;
	int ret = -EINVAL;

	if (ublksrv_pid <= 0)
		return -EINVAL;
	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *pd = &ub->params.discard;

		lim.discard_alignment = pd->discard_alignment;
		lim.discard_granularity = pd->discard_granularity;
		lim.max_hw_discard_sectors = pd->max_discard_sectors;
		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
		lim.max_discard_segments = pd->max_discard_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
		const struct ublk_param_zoned *p = &ub->params.zoned;

		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
			return -EOPNOTSUPP;

		lim.features |= BLK_FEAT_ZONED;
		lim.max_active_zones = p->max_active_zones;
		lim.max_open_zones =  p->max_open_zones;
		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
	}

	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
		lim.features |= BLK_FEAT_WRITE_CACHE;
		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
			lim.features |= BLK_FEAT_FUA;
	}

	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
		lim.features |= BLK_FEAT_ROTATIONAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
		lim.dma_alignment = ub->params.dma.alignment;

	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
		lim.max_segment_size = ub->params.seg.max_segment_size;
		lim.max_segments = ub->params.seg.max_segments;
	}

	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
		const struct ublk_param_integrity *p = &ub->params.integrity;
		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);

		lim.max_integrity_segments =
			p->max_integrity_segments ?: USHRT_MAX;
		lim.integrity = (struct blk_integrity) {
			.flags = ublk_integrity_flags(p->flags),
			.csum_type = ublk_integrity_csum_type(p->csum_type),
			.metadata_size = p->metadata_size,
			.pi_offset = p->pi_offset,
			.interval_exp = p->interval_exp,
			.tag_size = p->tag_size,
			.pi_tuple_size = pi_tuple_size,
		};
	}

	/* wait for the server to make the device ready; interruptible */
	if (wait_for_completion_interruptible(&ub->completion) != 0)
		return -EINTR;

	if (!ublk_validate_user_pid(ub, ublksrv_pid))
		return -EINVAL;

	mutex_lock(&ub->mutex);
	/* device may become not ready in case of F_BATCH */
	if (!ublk_dev_ready(ub)) {
		ret = -EINVAL;
		goto out_unlock;
	}
	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
	    test_bit(UB_STATE_USED, &ub->state)) {
		ret = -EEXIST;
		goto out_unlock;
	}

	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_unlock;
	}
	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
	disk->fops = &ub_fops;
	disk->private_data = ub;

	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
	ub->ub_disk = disk;

	ublk_apply_params(ub);

	/*
	 * Suppress partition scan to avoid potential IO hang.
	 *
	 * If ublk server error occurs during partition scan, the IO may
	 * wait while holding ub->mutex, which can deadlock with other
	 * operations that need the mutex. Defer partition scan to async
	 * work.
	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
	 * permanently.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);

	/* extra ref is owned by the live disk; dropped on teardown */
	ublk_get_device(ub);
	ub->dev_info.state = UBLK_S_DEV_LIVE;

	if (ublk_dev_is_zoned(ub)) {
		ret = ublk_revalidate_disk_zones(ub);
		if (ret)
			goto out_put_cdev;
	}

	ret = add_disk(disk);
	if (ret)
		goto out_put_cdev;

	set_bit(UB_STATE_USED, &ub->state);

	/* Skip partition scan if disabled by user */
	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
		/* Not clear for unprivileged daemons, see comment above */
		if (!ub->unprivileged_daemons)
			clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
	} else {
		/* Schedule async partition scan for trusted daemons */
		if (!ub->unprivileged_daemons)
			schedule_work(&ub->partition_scan_work);
	}

	/* success falls through with ret == 0: the cleanups below are skipped */
out_put_cdev:
	if (ret) {
		ublk_detach_disk(ub);
		ublk_put_device(ub);
	}
	if (ret)
		put_disk(disk);
out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}
4539 
/*
 * UBLK_CMD_GET_QUEUE_AFFINITY: copy the cpumask of CPUs mapped to
 * queue header->data[0] into the user buffer at header->addr.  The
 * buffer must cover nr_cpu_ids bits and be a multiple of
 * sizeof(unsigned long); any tail beyond cpumask_size() is zeroed.
 */
static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	cpumask_var_t cpumask;
	unsigned long queue;
	unsigned int retlen;
	unsigned int i;
	int ret;

	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
		return -EINVAL;
	if (header->len & (sizeof(unsigned long)-1))
		return -EINVAL;
	if (!header->addr)
		return -EINVAL;

	queue = header->data[0];
	if (queue >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
		return -ENOMEM;

	/* gather every possible CPU blk-mq maps to this queue */
	for_each_possible_cpu(i) {
		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
			cpumask_set_cpu(i, cpumask);
	}

	ret = -EFAULT;
	retlen = min_t(unsigned short, header->len, cpumask_size());
	if (copy_to_user(argp, cpumask, retlen))
		goto out_free_cpumask;
	/* zero the remainder so userspace never sees stale bits */
	if (retlen != header->len &&
	    clear_user(argp + retlen, header->len - retlen))
		goto out_free_cpumask;

	ret = 0;
out_free_cpumask:
	free_cpumask_var(cpumask);
	return ret;
}
4582 
/* Debug helper: dump the key fields of a ctrl dev_info at pr_devel level */
static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags);
	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
			info->nr_hw_queues, info->queue_depth);
}
4590 
/*
 * UBLK_CMD_ADD_DEV: validate the userspace-supplied dev_info, negotiate
 * feature flags, allocate the device (index, tag set, queues) and
 * register the /dev/ublkc<N> char device.  The negotiated dev_info is
 * copied back to userspace before the char device is added.
 */
static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
			__func__, header->queue_id);
		return -EINVAL;
	}

	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
		return -EINVAL;

	/* only CAP_SYS_ADMIN may create a privileged (trusted) device */
	if (capable(CAP_SYS_ADMIN))
		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
		return -EPERM;

	/* forbid nonsense combinations of recovery flags */
	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
	case 0:
	case UBLK_F_USER_RECOVERY:
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
		break;
	default:
		pr_warn("%s: invalid recovery flags %llx\n", __func__,
			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
		return -EINVAL;
	}

	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
		return -EINVAL;
	}

	/*
	 * unprivileged device can't be trusted, but RECOVERY and
	 * RECOVERY_REISSUE still may hang error handling, so can't
	 * support recovery features for unprivileged ublk now
	 *
	 * TODO: provide forward progress for RECOVERY handler, so that
	 * unprivileged device can benefit from it
	 */
	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
				UBLK_F_USER_RECOVERY);

		/*
		 * For USER_COPY, we depend on userspace to fill request
		 * buffer by pwrite() to ublk char device, which can't be
		 * used for unprivileged device
		 *
		 * Same with zero copy or auto buffer register.
		 */
		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
					UBLK_F_AUTO_BUF_REG))
			return -EINVAL;
	}

	/* User copy is required to access integrity buffer */
	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
		return -EINVAL;

	/* the created device is always owned by current user */
	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);

	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
			__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
		pr_warn("%s: dev id is too large. Max supported is %d\n",
			__func__, UBLK_MAX_UBLKS - 1);
		return -EINVAL;
	}

	ublk_dump_dev_info(&info);

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	/* enforce the global cap on unprivileged devices */
	ret = -EACCES;
	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
	    unprivileged_ublks_added >= unprivileged_ublks_max)
		goto out_unlock;

	ret = -ENOMEM;
	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->lock);
	mutex_init(&ub->cancel_mutex);
	mt_init(&ub->buf_tree);
	ida_init(&ub->buf_ida);
	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);

	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * 64bit flags will be copied back to userspace as feature
	 * negotiation result, so have to clear flags which driver
	 * doesn't support yet, then userspace can get correct flags
	 * (features) to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	/* advertise features the driver always provides */
	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
		UBLK_F_URING_CMD_COMP_IN_TASK |
		UBLK_F_PER_IO_DAEMON |
		UBLK_F_BUF_REG_OFF_DAEMON |
		UBLK_F_SAFE_STOP_DEV;

	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;

	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
				UBLK_F_AUTO_BUF_REG))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/*
	 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
	 * returning write_append_lba, which is only allowed in case of
	 * user copy or zero copy
	 */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	/* no point in more queues than possible CPUs */
	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_tag_set;

	/* return the negotiated info (flags, dev_id) to userspace */
	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_deinit_queues;

	/*
	 * Add the char dev so that ublksrv daemon can be setup.
	 * ublk_add_chdev() will cleanup everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}
4786 
4787 static inline bool ublk_idr_freed(int id)
4788 {
4789 	void *ptr;
4790 
4791 	spin_lock(&ublk_idr_lock);
4792 	ptr = idr_find(&ublk_index_idr, id);
4793 	spin_unlock(&ublk_idr_lock);
4794 
4795 	return ptr == NULL;
4796 }
4797 
/*
 * UBLK_CMD_DEL_DEV[_ASYNC]: remove the device once (guarded by
 * UB_STATE_DELETED) and drop the caller's reference.  With @wait the
 * call only returns after the device index is free again so userspace
 * can reuse it immediately.
 */
static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
{
	struct ublk_device *ub = *p_ub;
	int idx = ub->ub_number;
	int ret;

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
		ublk_remove(ub);
		set_bit(UB_STATE_DELETED, &ub->state);
	}

	/* Mark the reference as consumed */
	*p_ub = NULL;
	ublk_put_device(ub);
	mutex_unlock(&ublk_ctl_mutex);

	/*
	 * Wait until the idr is removed, then it can be reused after
	 * DEL_DEV command is returned.
	 *
	 * If we return because of user interrupt, future delete command
	 * may come:
	 *
	 * - the device number isn't freed, this device won't or needn't
	 *   be deleted again, since UB_STATE_DELETED is set, and device
	 *   will be released after the last reference is dropped
	 *
	 * - the device number is freed already, we will not find this
	 *   device via ublk_get_device_from_id()
	 */
	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
		return -EINTR;
	return 0;
}
4836 
/* Debug helper: trace an incoming control command at pr_devel level */
static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
				      const struct ublksrv_ctrl_cmd *header)
{
	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}
4844 
/* UBLK_CMD_STOP_DEV: unconditionally stop the device */
static void ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
}
4849 
/*
 * Safe stop (UBLK_F_SAFE_STOP_DEV path): refuse with -EBUSY while the
 * block device still has openers; otherwise block further opens and
 * stop the device.  disk->open_mutex must be dropped before stopping
 * because del_gendisk() reacquires it.
 */
static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
{
	struct gendisk *disk;
	int ret = 0;

	disk = ublk_get_disk(ub);
	if (!disk)
		return -ENODEV;

	mutex_lock(&disk->open_mutex);
	if (disk_openers(disk) > 0) {
		ret = -EBUSY;
		goto unlock;
	}
	ub->block_open = true;
	/* release open_mutex as del_gendisk() will reacquire it */
	mutex_unlock(&disk->open_mutex);

	ublk_ctrl_stop_dev(ub);
	goto out;

unlock:
	mutex_unlock(&disk->open_mutex);
out:
	ublk_put_disk(disk);
	return ret;
}
4877 
/*
 * UBLK_CMD_GET_DEV_INFO: copy dev_info to userspace, translating the
 * stored server tgid (recorded in init_pid_ns) into the caller's pid
 * namespace; -1 is reported when the server task no longer exists or
 * is invisible in that namespace.
 */
static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	struct task_struct *p;
	struct pid *pid;
	struct ublksrv_ctrl_dev_info dev_info;
	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
	void __user *argp = (void __user *)(unsigned long)header->addr;

	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
		return -EINVAL;

	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
	dev_info.ublksrv_pid = -1;

	if (init_ublksrv_tgid > 0) {
		/* RCU protects the pid/task lookup */
		rcu_read_lock();
		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
		p = pid_task(pid, PIDTYPE_TGID);
		if (p) {
			int vnr = task_tgid_vnr(p);

			if (vnr)
				dev_info.ublksrv_pid = vnr;
		}
		rcu_read_unlock();
	}

	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
		return -EFAULT;

	return 0;
}
4911 
4912 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4913 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4914 {
4915 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4916 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4917 
4918 	if (ub->ub_disk) {
4919 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4920 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4921 	} else {
4922 		ub->params.devt.disk_major = 0;
4923 		ub->params.devt.disk_minor = 0;
4924 	}
4925 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4926 }
4927 
4928 static int ublk_ctrl_get_params(struct ublk_device *ub,
4929 		const struct ublksrv_ctrl_cmd *header)
4930 {
4931 	void __user *argp = (void __user *)(unsigned long)header->addr;
4932 	struct ublk_params_header ph;
4933 	int ret;
4934 
4935 	if (header->len <= sizeof(ph) || !header->addr)
4936 		return -EINVAL;
4937 
4938 	if (copy_from_user(&ph, argp, sizeof(ph)))
4939 		return -EFAULT;
4940 
4941 	if (ph.len > header->len || !ph.len)
4942 		return -EINVAL;
4943 
4944 	if (ph.len > sizeof(struct ublk_params))
4945 		ph.len = sizeof(struct ublk_params);
4946 
4947 	mutex_lock(&ub->mutex);
4948 	ublk_ctrl_fill_params_devt(ub);
4949 	if (copy_to_user(argp, &ub->params, ph.len))
4950 		ret = -EFAULT;
4951 	else
4952 		ret = 0;
4953 	mutex_unlock(&ub->mutex);
4954 
4955 	return ret;
4956 }
4957 
/*
 * UBLK_CMD_SET_PARAMS: overwrite ub->params from userspace.  Only
 * allowed before the device is started (UB_STATE_USED); on validation
 * failure the stored types are cleared so a later START_DEV fails.
 */
static int ublk_ctrl_set_params(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	int ret = -EFAULT;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len || !ph.types)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	mutex_lock(&ub->mutex);
	if (test_bit(UB_STATE_USED, &ub->state)) {
		/*
		 * Parameters can only be changed when device hasn't
		 * been started yet
		 */
		ret = -EACCES;
	} else if (copy_from_user(&ub->params, argp, ph.len)) {
		ret = -EFAULT;
	} else {
		/* clear all we don't support yet */
		ub->params.types &= UBLK_PARAM_TYPE_ALL;
		ret = ublk_validate_params(ub);
		if (ret)
			ub->params.types = 0;
	}
	mutex_unlock(&ub->mutex);

	return ret;
}
4997 
/*
 * UBLK_CMD_START_USER_RECOVERY: arm ub->completion so a new server can
 * re-fetch all io commands; only valid once the old server is fully
 * gone and the device is in a recoverable state.
 */
static int ublk_ctrl_start_recovery(struct ublk_device *ub)
{
	int ret = -EINVAL;

	mutex_lock(&ub->mutex);
	if (ublk_nosrv_should_stop_dev(ub))
		goto out_unlock;
	/*
	 * START_RECOVERY is only allowed after:
	 *
	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
	 *     released.
	 *
	 * and one of the following holds
	 *
	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
	 *     (a)has quiesced request queue
	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
	 *     (d)has completed/canceled all ioucmds owned by the dying process
	 *
	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
	 *     quiesced, but all I/O is being immediately errored
	 */
	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
		ret = -EBUSY;
		goto out_unlock;
	}
	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
	init_completion(&ub->completion);
	ret = 0;
 out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}
5034 
/*
 * UBLK_CMD_END_USER_RECOVERY: wait until the new server has fetched
 * all io commands (ub->completion), validate its pid, then mark the
 * device live again and kick the requeue list so requeued requests are
 * re-dispatched.
 */
static int ublk_ctrl_end_recovery(struct ublk_device *ub,
		const struct ublksrv_ctrl_cmd *header)
{
	int ublksrv_pid = (int)header->data[0];
	int ret = -EINVAL;

	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
		 header->dev_id);

	if (wait_for_completion_interruptible(&ub->completion))
		return -EINTR;

	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
		 header->dev_id);

	if (!ublk_validate_user_pid(ub, ublksrv_pid))
		return -EINVAL;

	mutex_lock(&ub->mutex);
	if (ublk_nosrv_should_stop_dev(ub))
		goto out_unlock;

	if (!ublk_dev_in_recoverable_state(ub)) {
		ret = -EBUSY;
		goto out_unlock;
	}
	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
	ub->dev_info.state = UBLK_S_DEV_LIVE;
	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
			__func__, ublksrv_pid, header->dev_id);
	blk_mq_kick_requeue_list(ub->ub_disk->queue);
	ret = 0;
 out_unlock:
	mutex_unlock(&ub->mutex);
	return ret;
}
5071 
5072 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5073 {
5074 	void __user *argp = (void __user *)(unsigned long)header->addr;
5075 	u64 features = UBLK_F_ALL;
5076 
5077 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
5078 		return -EINVAL;
5079 
5080 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5081 		return -EFAULT;
5082 
5083 	return 0;
5084 }
5085 
5086 static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5087 {
5088 	struct ublk_param_basic *p = &ub->params.basic;
5089 	u64 new_size = header->data[0];
5090 	int ret = 0;
5091 
5092 	mutex_lock(&ub->mutex);
5093 	if (!ub->ub_disk) {
5094 		ret = -ENODEV;
5095 		goto out;
5096 	}
5097 	p->dev_sectors = new_size;
5098 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5099 out:
5100 	mutex_unlock(&ub->mutex);
5101 	return ret;
5102 }
5103 
/* iteration context for ublk_count_busy_req() */
struct count_busy {
	const struct ublk_queue *ubq;	/* queue being examined */
	unsigned int nr_busy;		/* matching requests counted so far */
};
5108 
/*
 * blk_mq_tagset_busy_iter() callback: count requests that belong to
 * the target queue and are not marked started.
 * NOTE(review): "busy" here counts *not-started* requests
 * (!blk_mq_request_started) — confirm this matches the intent of
 * ubq_has_idle_io() before changing the predicate.
 */
static bool ublk_count_busy_req(struct request *rq, void *data)
{
	struct count_busy *idle = data;

	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
		idle->nr_busy += 1;
	return true;	/* keep iterating over the whole tag set */
}
5117 
/* uring_cmd is guaranteed to be active if the associated request is idle */
static bool ubq_has_idle_io(const struct ublk_queue *ubq)
{
	struct count_busy data = {
		.ubq = ubq,
	};

	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
	/* fewer busy requests than tags means at least one io is idle */
	return data.nr_busy < ubq->q_depth;
}
5128 
/* Wait until each hw queue has at least one idle IO */
static int ublk_wait_for_idle_io(struct ublk_device *ub,
				 unsigned int timeout_ms)
{
	unsigned int elapsed = 0;
	int ret;

	/*
	 * For UBLK_F_BATCH_IO ublk server can get notified with existing
	 * or new fetch command, so needn't wait any more
	 */
	if (ublk_dev_support_batch_io(ub))
		return 0;

	/* poll every UBLK_REQUEUE_DELAY_MS until timeout or a signal */
	while (elapsed < timeout_ms && !signal_pending(current)) {
		unsigned int queues_cancelable = 0;
		int i;

		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
			struct ublk_queue *ubq = ublk_get_queue(ub, i);

			queues_cancelable += !!ubq_has_idle_io(ubq);
		}

		/*
		 * Each queue needs at least one active command for
		 * notifying ublk server
		 */
		if (queues_cancelable == ub->dev_info.nr_hw_queues)
			break;

		msleep(UBLK_REQUEUE_DELAY_MS);
		elapsed += UBLK_REQUEUE_DELAY_MS;
	}

	/* distinguish signal, timeout and success */
	if (signal_pending(current))
		ret = -EINTR;
	else if (elapsed >= timeout_ms)
		ret = -EBUSY;
	else
		ret = 0;

	return ret;
}
5173 
5174 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5175 				 const struct ublksrv_ctrl_cmd *header)
5176 {
5177 	/* zero means wait forever */
5178 	u64 timeout_ms = header->data[0];
5179 	struct gendisk *disk;
5180 	int ret = -ENODEV;
5181 
5182 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5183 		return -EOPNOTSUPP;
5184 
5185 	mutex_lock(&ub->mutex);
5186 	disk = ublk_get_disk(ub);
5187 	if (!disk)
5188 		goto unlock;
5189 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5190 		goto put_disk;
5191 
5192 	ret = 0;
5193 	/* already in expected state */
5194 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5195 		goto put_disk;
5196 
5197 	/* Mark the device as canceling */
5198 	mutex_lock(&ub->cancel_mutex);
5199 	blk_mq_quiesce_queue(disk->queue);
5200 	ublk_set_canceling(ub, true);
5201 	blk_mq_unquiesce_queue(disk->queue);
5202 	mutex_unlock(&ub->cancel_mutex);
5203 
5204 	if (!timeout_ms)
5205 		timeout_ms = UINT_MAX;
5206 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5207 
5208 put_disk:
5209 	ublk_put_disk(disk);
5210 unlock:
5211 	mutex_unlock(&ub->mutex);
5212 
5213 	/* Cancel pending uring_cmd */
5214 	if (!ret)
5215 		ublk_cancel_dev(ub);
5216 	return ret;
5217 }
5218 
5219 /*
5220  * All control commands are sent via /dev/ublk-control, so we have to check
5221  * the destination device's permission
5222  */
5223 static int ublk_char_dev_permission(struct ublk_device *ub,
5224 		const char *dev_path, int mask)
5225 {
5226 	int err;
5227 	struct path path;
5228 	struct kstat stat;
5229 
5230 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5231 	if (err)
5232 		return err;
5233 
5234 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5235 	if (err)
5236 		goto exit;
5237 
5238 	err = -EPERM;
5239 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5240 		goto exit;
5241 
5242 	err = inode_permission(&nop_mnt_idmap,
5243 			d_backing_inode(path.dentry), mask);
5244 exit:
5245 	path_put(&path);
5246 	return err;
5247 }
5248 
5249 /*
5250  * Lock for maple tree modification: acquire ub->mutex, then freeze queue
5251  * if device is started. If device is not yet started, only mutex is
5252  * needed since no I/O path can access the tree.
5253  *
5254  * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
5255  * already holds ub->mutex when calling del_gendisk() which freezes the queue.
 */
static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
{
	unsigned int memflags = 0;

	mutex_lock(&ub->mutex);
	/*
	 * Only freeze when a disk exists: without one there is no I/O path
	 * that can walk the buffer tree, so the mutex alone suffices.
	 */
	if (ub->ub_disk)
		memflags = blk_mq_freeze_queue(ub->ub_disk->queue);

	/* returned memflags must be passed back to ublk_unlock_buf_tree() */
	return memflags;
}
5267 
/* Counterpart of ublk_lock_buf_tree(): unfreeze (if frozen) then drop mutex */
static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
{
	/* ub->ub_disk can't change here since ub->mutex is still held */
	if (ub->ub_disk)
		blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
	mutex_unlock(&ub->mutex);
}
5274 
/* Erase coalesced PFN ranges from the maple tree matching buf_index */
static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
{
	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
	struct ublk_buf_range *range;

	mas_lock(&mas);
	/*
	 * mas_erase() operates on the entry the iterator currently points
	 * at, so erasing inside mas_for_each() is safe; only entries with
	 * a matching buf_index are removed and freed.
	 */
	mas_for_each(&mas, range, ULONG_MAX) {
		if (range->buf_index == buf_index) {
			mas_erase(&mas);
			kfree(range);
		}
	}
	mas_unlock(&mas);
}
5290 
/*
 * Insert the pinned pages of one buffer into the PFN-keyed maple tree.
 * Runs of physically-consecutive pages are coalesced into a single
 * ublk_buf_range entry spanning [pfn, pfn + run_len - 1].
 */
static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
			       struct page **pages, unsigned long nr_pages,
			       int index, unsigned short flags)
{
	unsigned long i;
	int ret;

	for (i = 0; i < nr_pages; i++) {
		unsigned long pfn = page_to_pfn(pages[i]);
		unsigned long start = i;
		struct ublk_buf_range *range;

		/* Find run of consecutive PFNs */
		while (i + 1 < nr_pages &&
		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
			i++;

		range = kzalloc(sizeof(*range), GFP_KERNEL);
		if (!range) {
			ret = -ENOMEM;
			goto unwind;
		}
		range->buf_index = index;
		range->flags = flags;
		/* byte offset of this run from the start of the user buffer */
		range->base_offset = start << PAGE_SHIFT;

		/* may fail with -EEXIST if a PFN is already registered */
		ret = mtree_insert_range(&ub->buf_tree, pfn,
					 pfn + (i - start),
					 range, GFP_KERNEL);
		if (ret) {
			kfree(range);
			goto unwind;
		}
	}
	return 0;

unwind:
	/* drop every range already inserted for this buffer index */
	ublk_buf_erase_ranges(ub, index);
	return ret;
}
5331 
5332 /*
5333  * Register a shared memory buffer for zero-copy I/O.
5334  * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
5335  * internally. Returns buffer index (>= 0) on success.
5336  */
5337 static int ublk_ctrl_reg_buf(struct ublk_device *ub,
5338 			     struct ublksrv_ctrl_cmd *header)
5339 {
5340 	void __user *argp = (void __user *)(unsigned long)header->addr;
5341 	struct ublk_shmem_buf_reg buf_reg;
5342 	unsigned long nr_pages;
5343 	struct page **pages = NULL;
5344 	unsigned int gup_flags;
5345 	unsigned int memflags;
5346 	long pinned;
5347 	int index;
5348 	int ret;
5349 
5350 	if (!ublk_dev_support_shmem_zc(ub))
5351 		return -EOPNOTSUPP;
5352 
5353 	memset(&buf_reg, 0, sizeof(buf_reg));
5354 	if (copy_from_user(&buf_reg, argp,
5355 			   min_t(size_t, header->len, sizeof(buf_reg))))
5356 		return -EFAULT;
5357 
5358 	if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
5359 		return -EINVAL;
5360 
5361 	if (buf_reg.reserved)
5362 		return -EINVAL;
5363 
5364 	if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
5365 	    !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
5366 		return -EINVAL;
5367 
5368 	nr_pages = buf_reg.len >> PAGE_SHIFT;
5369 
5370 	/* Pin pages before any locks (may sleep) */
5371 	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
5372 	if (!pages)
5373 		return -ENOMEM;
5374 
5375 	gup_flags = FOLL_LONGTERM;
5376 	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
5377 		gup_flags |= FOLL_WRITE;
5378 
5379 	pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
5380 	if (pinned < 0) {
5381 		ret = pinned;
5382 		goto err_free_pages;
5383 	}
5384 	if (pinned != nr_pages) {
5385 		ret = -EFAULT;
5386 		goto err_unpin;
5387 	}
5388 
5389 	memflags = ublk_lock_buf_tree(ub);
5390 
5391 	index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
5392 	if (index < 0) {
5393 		ret = index;
5394 		goto err_unlock;
5395 	}
5396 
5397 	ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
5398 	if (ret) {
5399 		ida_free(&ub->buf_ida, index);
5400 		goto err_unlock;
5401 	}
5402 
5403 	ublk_unlock_buf_tree(ub, memflags);
5404 	kvfree(pages);
5405 	return index;
5406 
5407 err_unlock:
5408 	ublk_unlock_buf_tree(ub, memflags);
5409 err_unpin:
5410 	unpin_user_pages(pages, pinned);
5411 err_free_pages:
5412 	kvfree(pages);
5413 	return ret;
5414 }
5415 
/*
 * Erase all PFN ranges belonging to buf_index and unpin their pages.
 * Returns 0 if at least one range was found, -ENOENT otherwise.
 */
static int __ublk_ctrl_unreg_buf(struct ublk_device *ub, int buf_index)
{
	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
	struct ublk_buf_range *range;
	struct page *pages[32];
	int ret = -ENOENT;

	mas_lock(&mas);
	mas_for_each(&mas, range, ULONG_MAX) {
		unsigned long base, nr, off;

		if (range->buf_index != buf_index)
			continue;

		ret = 0;
		/* range spans PFNs [mas.index, mas.last] inclusive */
		base = mas.index;
		nr = mas.last - base + 1;
		mas_erase(&mas);

		/* unpin in fixed-size batches via a small on-stack array */
		for (off = 0; off < nr; ) {
			unsigned int batch = min_t(unsigned long,
						   nr - off, 32);
			unsigned int j;

			for (j = 0; j < batch; j++)
				pages[j] = pfn_to_page(base + off + j);
			unpin_user_pages(pages, batch);
			off += batch;
		}
		kfree(range);
	}
	mas_unlock(&mas);

	return ret;
}
5451 
5452 static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
5453 			       struct ublksrv_ctrl_cmd *header)
5454 {
5455 	int index = (int)header->data[0];
5456 	unsigned int memflags;
5457 	int ret;
5458 
5459 	if (!ublk_dev_support_shmem_zc(ub))
5460 		return -EOPNOTSUPP;
5461 
5462 	if (index < 0 || index > USHRT_MAX)
5463 		return -EINVAL;
5464 
5465 	memflags = ublk_lock_buf_tree(ub);
5466 
5467 	ret = __ublk_ctrl_unreg_buf(ub, index);
5468 	if (!ret)
5469 		ida_free(&ub->buf_ida, index);
5470 
5471 	ublk_unlock_buf_tree(ub, memflags);
5472 	return ret;
5473 }
5474 
/*
 * Release every registered buffer: unpin all pages, free all range
 * entries, then destroy the tree and the index allocator.
 *
 * NOTE(review): iterates without mas_lock() — assumes this runs on device
 * teardown with no concurrent tree users; confirm against callers.
 */
static void ublk_buf_cleanup(struct ublk_device *ub)
{
	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
	struct ublk_buf_range *range;
	struct page *pages[32];

	mas_for_each(&mas, range, ULONG_MAX) {
		unsigned long base = mas.index;
		unsigned long nr = mas.last - base + 1;
		unsigned long off;

		/* unpin in fixed-size batches via a small on-stack array */
		for (off = 0; off < nr; ) {
			unsigned int batch = min_t(unsigned long,
						   nr - off, 32);
			unsigned int j;

			for (j = 0; j < batch; j++)
				pages[j] = pfn_to_page(base + off + j);
			unpin_user_pages(pages, batch);
			off += batch;
		}
		kfree(range);
	}
	/* entries were freed above; destroy only releases tree nodes */
	mtree_destroy(&ub->buf_tree);
	ida_destroy(&ub->buf_ida);
}
5501 
/* Check if request pages match a registered shared memory buffer */
static bool ublk_try_buf_match(struct ublk_device *ub,
				   struct request *rq,
				   u32 *buf_idx, u32 *buf_off)
{
	struct req_iterator iter;
	struct bio_vec bv;
	int index = -1;
	unsigned long expected_offset = 0;
	bool first = true;

	/*
	 * Every bvec must (a) lie fully inside one registered PFN range,
	 * (b) belong to the same buffer index as the first bvec, and
	 * (c) be contiguous in buffer-offset space with the previous bvec.
	 */
	rq_for_each_bvec(bv, rq, iter) {
		unsigned long pfn = page_to_pfn(bv.bv_page);
		unsigned long end_pfn = pfn +
			((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
		struct ublk_buf_range *range;
		unsigned long off;
		MA_STATE(mas, &ub->buf_tree, pfn, pfn);

		range = mas_walk(&mas);
		if (!range)
			return false;

		/* verify all pages in this bvec fall within the range */
		if (end_pfn > mas.last)
			return false;

		/* translate PFN position into a byte offset in the buffer */
		off = range->base_offset +
			(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;

		if (first) {
			/* Read-only buffer can't serve READ (kernel writes) */
			if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
			    req_op(rq) != REQ_OP_WRITE)
				return false;
			index = range->buf_index;
			expected_offset = off;
			*buf_off = off;
			first = false;
		} else {
			if (range->buf_index != index)
				return false;
			if (off != expected_offset)
				return false;
		}
		expected_offset += bv.bv_len;
	}

	/* request with no data bvecs can't match any buffer */
	if (first)
		return false;

	*buf_idx = index;
	return true;
}
5556 
5557 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5558 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5559 {
5560 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5561 	void __user *argp = (void __user *)(unsigned long)header->addr;
5562 	char *dev_path = NULL;
5563 	int ret = 0;
5564 	int mask;
5565 
5566 	if (!unprivileged) {
5567 		if (!capable(CAP_SYS_ADMIN))
5568 			return -EPERM;
5569 		/*
5570 		 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
5571 		 * char_dev_path in payload too, since userspace may not
5572 		 * know if the specified device is created as unprivileged
5573 		 * mode.
5574 		 */
5575 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5576 			return 0;
5577 	}
5578 
5579 	/*
5580 	 * User has to provide the char device path for unprivileged ublk
5581 	 *
5582 	 * header->addr always points to the dev path buffer, and
5583 	 * header->dev_path_len records length of dev path buffer.
5584 	 */
5585 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5586 		return -EINVAL;
5587 
5588 	if (header->len < header->dev_path_len)
5589 		return -EINVAL;
5590 
5591 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5592 	if (IS_ERR(dev_path))
5593 		return PTR_ERR(dev_path);
5594 
5595 	ret = -EINVAL;
5596 	switch (_IOC_NR(cmd_op)) {
5597 	case UBLK_CMD_GET_DEV_INFO:
5598 	case UBLK_CMD_GET_DEV_INFO2:
5599 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5600 	case UBLK_CMD_GET_PARAMS:
5601 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5602 		mask = MAY_READ;
5603 		break;
5604 	case UBLK_CMD_START_DEV:
5605 	case UBLK_CMD_STOP_DEV:
5606 	case UBLK_CMD_ADD_DEV:
5607 	case UBLK_CMD_DEL_DEV:
5608 	case UBLK_CMD_SET_PARAMS:
5609 	case UBLK_CMD_START_USER_RECOVERY:
5610 	case UBLK_CMD_END_USER_RECOVERY:
5611 	case UBLK_CMD_UPDATE_SIZE:
5612 	case UBLK_CMD_QUIESCE_DEV:
5613 	case UBLK_CMD_TRY_STOP_DEV:
5614 	case UBLK_CMD_REG_BUF:
5615 	case UBLK_CMD_UNREG_BUF:
5616 		mask = MAY_READ | MAY_WRITE;
5617 		break;
5618 	default:
5619 		goto exit;
5620 	}
5621 
5622 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5623 	if (!ret) {
5624 		header->len -= header->dev_path_len;
5625 		header->addr += header->dev_path_len;
5626 	}
5627 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5628 			__func__, ub->ub_number, cmd_op,
5629 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5630 			dev_path, ret);
5631 exit:
5632 	kfree(dev_path);
5633 	return ret;
5634 }
5635 
5636 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5637 {
5638 	switch (_IOC_NR(cmd_op)) {
5639 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5640 	case UBLK_CMD_GET_DEV_INFO:
5641 	case UBLK_CMD_GET_DEV_INFO2:
5642 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5643 		return false;
5644 	default:
5645 		return true;
5646 	}
5647 }
5648 
/*
 * Entry point for all /dev/ublk-control uring_cmds: snapshot the SQE
 * payload, validate the opcode, check permission against the target
 * device, then dispatch to the per-command handler.
 */
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	/* May point to userspace-mapped memory */
	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
								    struct ublksrv_ctrl_cmd);
	struct ublksrv_ctrl_cmd header;
	struct ublk_device *ub = NULL;
	u32 cmd_op = cmd->cmd_op;
	int ret = -EINVAL;

	/* sleeping commands must be retried from a blocking context */
	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
	    issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	if (!(issue_flags & IO_URING_F_SQE128))
		return -EINVAL;

	/*
	 * Copy each field once with READ_ONCE(): the SQE may live in
	 * userspace-mapped memory, so don't re-read it after validation.
	 */
	header.dev_id = READ_ONCE(ub_src->dev_id);
	header.queue_id = READ_ONCE(ub_src->queue_id);
	header.len = READ_ONCE(ub_src->len);
	header.addr = READ_ONCE(ub_src->addr);
	header.data[0] = READ_ONCE(ub_src->data[0]);
	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
	ublk_ctrl_cmd_dump(cmd_op, &header);

	ret = ublk_check_cmd_op(cmd_op);
	if (ret)
		goto out;

	/* GET_FEATURES is device-independent, handle it before lookup */
	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
		ret = ublk_ctrl_get_features(&header);
		goto out;
	}

	/* every command except ADD_DEV targets an existing device */
	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
		ret = -ENODEV;
		ub = ublk_get_device_from_id(header.dev_id);
		if (!ub)
			goto out;

		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
		if (ret)
			goto put_dev;
	}

	switch (_IOC_NR(cmd_op)) {
	case UBLK_CMD_START_DEV:
		ret = ublk_ctrl_start_dev(ub, &header);
		break;
	case UBLK_CMD_STOP_DEV:
		ublk_ctrl_stop_dev(ub);
		ret = 0;
		break;
	case UBLK_CMD_GET_DEV_INFO:
	case UBLK_CMD_GET_DEV_INFO2:
		ret = ublk_ctrl_get_dev_info(ub, &header);
		break;
	case UBLK_CMD_ADD_DEV:
		ret = ublk_ctrl_add_dev(&header);
		break;
	case UBLK_CMD_DEL_DEV:
		ret = ublk_ctrl_del_dev(&ub, true);
		break;
	case UBLK_CMD_DEL_DEV_ASYNC:
		ret = ublk_ctrl_del_dev(&ub, false);
		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
		ret = ublk_ctrl_get_queue_affinity(ub, &header);
		break;
	case UBLK_CMD_GET_PARAMS:
		ret = ublk_ctrl_get_params(ub, &header);
		break;
	case UBLK_CMD_SET_PARAMS:
		ret = ublk_ctrl_set_params(ub, &header);
		break;
	case UBLK_CMD_START_USER_RECOVERY:
		ret = ublk_ctrl_start_recovery(ub);
		break;
	case UBLK_CMD_END_USER_RECOVERY:
		ret = ublk_ctrl_end_recovery(ub, &header);
		break;
	case UBLK_CMD_UPDATE_SIZE:
		ret = ublk_ctrl_set_size(ub, &header);
		break;
	case UBLK_CMD_QUIESCE_DEV:
		ret = ublk_ctrl_quiesce_dev(ub, &header);
		break;
	case UBLK_CMD_TRY_STOP_DEV:
		ret = ublk_ctrl_try_stop_dev(ub);
		break;
	case UBLK_CMD_REG_BUF:
		ret = ublk_ctrl_reg_buf(ub, &header);
		break;
	case UBLK_CMD_UNREG_BUF:
		ret = ublk_ctrl_unreg_buf(ub, &header);
		break;
	default:
		ret = -EOPNOTSUPP;
		break;
	}

 put_dev:
	if (ub)
		ublk_put_device(ub);
 out:
	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
			__func__, ret, cmd_op, header.dev_id, header.queue_id);
	return ret;
}
5759 
/* file_operations of the /dev/ublk-control misc device (uring_cmd only) */
static const struct file_operations ublk_ctl_fops = {
	.open		= nonseekable_open,
	.uring_cmd      = ublk_ctrl_uring_cmd,
	.owner		= THIS_MODULE,
	.llseek		= noop_llseek,
};
5766 
/* misc device backing /dev/ublk-control, the single control entry point */
static struct miscdevice ublk_misc = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "ublk-control",
	.fops		= &ublk_ctl_fops,
};
5772 
/*
 * Module init: register the control misc device, reserve the char device
 * region for per-device nodes, and register the char device class.
 */
static int __init ublk_init(void)
{
	int ret;

	/* guard against u64 wrap of the per-tag buffer offset space */
	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
	/*
	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
	 */
	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
		     UBLKSRV_IO_INTEGRITY_FLAG);
	/* auto_buf_reg is passed via a fixed 8-byte SQE field */
	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);

	init_waitqueue_head(&ublk_idr_wq);

	ret = misc_register(&ublk_misc);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
	if (ret)
		goto unregister_mis;

	ret = class_register(&ublk_chr_class);
	if (ret)
		goto free_chrdev_region;

	return 0;

	/* unwind in reverse registration order */
free_chrdev_region:
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
unregister_mis:
	misc_deregister(&ublk_misc);
	return ret;
}
5809 
/* Module exit: tear down every remaining device, then undo ublk_init() */
static void __exit ublk_exit(void)
{
	struct ublk_device *ub;
	int id;

	/* remove all devices still present in the idr */
	idr_for_each_entry(&ublk_index_idr, ub, id)
		ublk_remove(ub);

	class_unregister(&ublk_chr_class);
	misc_deregister(&ublk_misc);

	idr_destroy(&ublk_index_idr);
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
}
5824 
/* register module entry/exit points */
module_init(ublk_init);
module_exit(ublk_exit);
5827 
/* ublks_max param setter: clamp the new value to [0, UBLK_MAX_UBLKS] */
static int ublk_set_max_unprivileged_ublks(const char *buf,
					   const struct kernel_param *kp)
{
	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
}
5833 
/* ublks_max param getter: emit the current limit for sysfs */
static int ublk_get_max_unprivileged_ublks(char *buf,
					   const struct kernel_param *kp)
{
	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
}
5839 
/* custom param ops so the limit is range-checked on write */
static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
	.set = ublk_set_max_unprivileged_ublks,
	.get = ublk_get_max_unprivileged_ublks,
};
5844 
/* expose the unprivileged device limit as a writable module parameter */
module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
		&unprivileged_ublks_max, 0644);
MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");

MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
MODULE_DESCRIPTION("Userspace block device");
MODULE_LICENSE("GPL");
5852