xref: /linux/drivers/block/ublk_drv.c (revision 0c00ed308d0559fc216be0442a3df124e9e13533)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - block device which IO is handled from userspace
4  *
5  * Take full use of io_uring passthrough command for communicating with
6  * ublk userspace daemon(ublksrvd) for handling basic IO request.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <uapi/linux/fs.h>
50 #include <uapi/linux/ublk_cmd.h>
51 
52 #define UBLK_MINORS		(1U << MINORBITS)
53 
54 #define UBLK_INVALID_BUF_IDX 	((u16)-1)
55 
56 /* private ioctl command mirror */
57 #define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
58 #define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
59 #define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
60 #define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
61 
62 #define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
63 #define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
64 
65 /* All UBLK_F_* have to be included into UBLK_F_ALL */
66 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
67 		| UBLK_F_URING_CMD_COMP_IN_TASK \
68 		| UBLK_F_NEED_GET_DATA \
69 		| UBLK_F_USER_RECOVERY \
70 		| UBLK_F_USER_RECOVERY_REISSUE \
71 		| UBLK_F_UNPRIVILEGED_DEV \
72 		| UBLK_F_CMD_IOCTL_ENCODE \
73 		| UBLK_F_USER_COPY \
74 		| UBLK_F_ZONED \
75 		| UBLK_F_USER_RECOVERY_FAIL_IO \
76 		| UBLK_F_UPDATE_SIZE \
77 		| UBLK_F_AUTO_BUF_REG \
78 		| UBLK_F_QUIESCE \
79 		| UBLK_F_PER_IO_DAEMON \
80 		| UBLK_F_BUF_REG_OFF_DAEMON \
81 		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
82 		| UBLK_F_SAFE_STOP_DEV \
83 		| UBLK_F_BATCH_IO \
84 		| UBLK_F_NO_AUTO_PART_SCAN)
85 
86 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
87 		| UBLK_F_USER_RECOVERY_REISSUE \
88 		| UBLK_F_USER_RECOVERY_FAIL_IO)
89 
90 /* All UBLK_PARAM_TYPE_* should be included here */
91 #define UBLK_PARAM_TYPE_ALL                                \
92 	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
93 	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
94 	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
95 	 UBLK_PARAM_TYPE_INTEGRITY)
96 
97 #define UBLK_BATCH_F_ALL  \
98 	(UBLK_BATCH_F_HAS_ZONE_LBA | \
99 	 UBLK_BATCH_F_HAS_BUF_ADDR | \
100 	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
101 
102 /* ublk batch fetch uring_cmd */
103 struct ublk_batch_fetch_cmd {
104 	struct list_head node;
105 	struct io_uring_cmd *cmd;
106 	unsigned short buf_group;
107 };
108 
109 struct ublk_uring_cmd_pdu {
110 	/*
111 	 * Store requests in same batch temporarily for queuing them to
112 	 * daemon context.
113 	 *
114 	 * It should have been stored to request payload, but we do want
115 	 * to avoid extra pre-allocation, and uring_cmd payload is always
116 	 * free for us
117 	 */
118 	union {
119 		struct request *req;
120 		struct request *req_list;
121 	};
122 
123 	/*
124 	 * The following two are valid in this cmd whole lifetime, and
125 	 * setup in ublk uring_cmd handler
126 	 */
127 	struct ublk_queue *ubq;
128 
129 	union {
130 		u16 tag;
131 		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
132 	};
133 };
134 
135 struct ublk_batch_io_data {
136 	struct ublk_device *ub;
137 	struct io_uring_cmd *cmd;
138 	struct ublk_batch_io header;
139 	unsigned int issue_flags;
140 	struct io_comp_batch *iob;
141 };
142 
143 /*
144  * io command is active: sqe cmd is received, and its cqe isn't done
145  *
146  * If the flag is set, the io command is owned by ublk driver, and waited
147  * for incoming blk-mq request from the ublk block device.
148  *
149  * If the flag is cleared, the io command will be completed, and owned by
150  * ublk server.
151  */
152 #define UBLK_IO_FLAG_ACTIVE	0x01
153 
154 /*
155  * IO command is completed via cqe, and it is being handled by ublksrv, and
156  * not committed yet
157  *
158  * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
159  * cross verification
160  */
161 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
162 
163 /*
164  * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
165  * get data buffer address from ublksrv.
166  *
167  * Then, bio data could be copied into this data buffer for a WRITE request
168  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
169  */
170 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
171 
172 /*
173  * request buffer is registered automatically, so we have to unregister it
174  * before completing this request.
175  *
176  * io_uring will unregister buffer automatically for us during exiting.
177  */
178 #define UBLK_IO_FLAG_AUTO_BUF_REG 	0x10
179 
180 /* atomic RW with ubq->cancel_lock */
181 #define UBLK_IO_FLAG_CANCELED	0x80000000
182 
183 /*
184  * Initialize refcount to a large number to include any registered buffers.
185  * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
186  * any buffers registered on the io daemon task.
187  */
188 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
189 
190 /* used for UBLK_F_BATCH_IO only */
191 #define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
192 
193 union ublk_io_buf {
194 	__u64	addr;
195 	struct ublk_auto_buf_reg auto_reg;
196 };
197 
198 struct ublk_io {
199 	union ublk_io_buf buf;
200 	unsigned int flags;
201 	int res;
202 
203 	union {
204 		/* valid if UBLK_IO_FLAG_ACTIVE is set */
205 		struct io_uring_cmd *cmd;
206 		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
207 		struct request *req;
208 	};
209 
210 	struct task_struct *task;
211 
212 	/*
213 	 * The number of uses of this I/O by the ublk server
214 	 * if user copy or zero copy are enabled:
215 	 * - UBLK_REFCOUNT_INIT from dispatch to the server
216 	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
217 	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
218 	 * - 1 for each io_uring registered buffer not registered on task
219 	 * The I/O can only be completed once all references are dropped.
220 	 * User copy and buffer registration operations are only permitted
221 	 * if the reference count is nonzero.
222 	 */
223 	refcount_t ref;
224 	/* Count of buffers registered on task and not yet unregistered */
225 	unsigned task_registered_buffers;
226 
227 	void *buf_ctx_handle;
228 	spinlock_t lock;
229 } ____cacheline_aligned_in_smp;
230 
231 struct ublk_queue {
232 	int q_id;
233 	int q_depth;
234 
235 	unsigned long flags;
236 	struct ublksrv_io_desc *io_cmd_buf;
237 
238 	bool force_abort;
239 	bool canceling;
240 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
241 	spinlock_t		cancel_lock;
242 	struct ublk_device *dev;
243 	u32 nr_io_ready;
244 
245 	/*
246 	 * For supporting UBLK_F_BATCH_IO only.
247 	 *
248 	 * Inflight ublk request tag is saved in this fifo
249 	 *
250 	 * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(),
251 	 * so lock is required for storing request tag to fifo
252 	 *
253 	 * Make sure just one reader for fetching request from task work
254 	 * function to ublk server, so no need to grab the lock in reader
255 	 * side.
256 	 *
257 	 * Batch I/O State Management:
258 	 *
259 	 * The batch I/O system uses implicit state management based on the
260 	 * combination of three key variables below.
261 	 *
262 	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
263 	 *   No fetch commands available, events queue in evts_fifo
264 	 *
265 	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
266 	 *   Fetch commands available but none processing events
267 	 *
268 	 * - ACTIVE: active_fcmd
269 	 *   One fetch command actively processing events from evts_fifo
270 	 *
271 	 * Key Invariants:
272 	 * - At most one active_fcmd at any time (single reader)
273 	 * - active_fcmd is always from fcmd_head list when non-NULL
274 	 * - evts_fifo can be read locklessly by the single active reader
275 	 * - All state transitions require evts_lock protection
276 	 * - Multiple writers to evts_fifo require lock protection
277 	 */
278 	struct {
279 		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
280 		spinlock_t evts_lock;
281 
282 		/* List of fetch commands available to process events */
283 		struct list_head fcmd_head;
284 
285 		/* Currently active fetch command (NULL = none active) */
286 		struct ublk_batch_fetch_cmd  *active_fcmd;
287 	}____cacheline_aligned_in_smp;
288 
289 	struct ublk_io ios[] __counted_by(q_depth);
290 };
291 
292 struct ublk_device {
293 	struct gendisk		*ub_disk;
294 
295 	struct ublksrv_ctrl_dev_info	dev_info;
296 
297 	struct blk_mq_tag_set	tag_set;
298 
299 	struct cdev		cdev;
300 	struct device		cdev_dev;
301 
302 #define UB_STATE_OPEN		0
303 #define UB_STATE_USED		1
304 #define UB_STATE_DELETED	2
305 	unsigned long		state;
306 	int			ub_number;
307 
308 	struct mutex		mutex;
309 
310 	spinlock_t		lock;
311 	struct mm_struct	*mm;
312 
313 	struct ublk_params	params;
314 
315 	struct completion	completion;
316 	u32			nr_queue_ready;
317 	bool 			unprivileged_daemons;
318 	struct mutex cancel_mutex;
319 	bool canceling;
320 	pid_t 	ublksrv_tgid;
321 	struct delayed_work	exit_work;
322 	struct work_struct	partition_scan_work;
323 
324 	bool			block_open; /* protected by open_mutex */
325 
326 	struct ublk_queue       *queues[];
327 };
328 
329 /* header of ublk_params */
330 struct ublk_params_header {
331 	__u32	len;
332 	__u32	types;
333 };
334 
335 static void ublk_io_release(void *priv);
336 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
337 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
338 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
339 		u16 q_id, u16 tag, struct ublk_io *io);
340 static inline unsigned int ublk_req_build_flags(struct request *req);
341 static void ublk_batch_dispatch(struct ublk_queue *ubq,
342 				const struct ublk_batch_io_data *data,
343 				struct ublk_batch_fetch_cmd *fcmd);
344 
345 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
346 {
347 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
348 }
349 
350 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
351 {
352 	return ubq->flags & UBLK_F_BATCH_IO;
353 }
354 
355 static inline void ublk_io_lock(struct ublk_io *io)
356 {
357 	spin_lock(&io->lock);
358 }
359 
360 static inline void ublk_io_unlock(struct ublk_io *io)
361 {
362 	spin_unlock(&io->lock);
363 }
364 
365 /* Initialize the event queue */
366 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
367 				    int numa_node)
368 {
369 	spin_lock_init(&q->evts_lock);
370 	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
371 }
372 
373 /* Check if event queue is empty */
374 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
375 {
376 	return kfifo_is_empty(&q->evts_fifo);
377 }
378 
379 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
380 {
381 	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
382 	kfifo_free(&q->evts_fifo);
383 }
384 
385 static inline struct ublksrv_io_desc *
386 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
387 {
388 	return &ubq->io_cmd_buf[tag];
389 }
390 
391 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
392 {
393 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
394 }
395 
396 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
397 {
398 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
399 }
400 
401 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
402 {
403 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
404 }
405 
406 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
407 {
408 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
409 }
410 
411 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
412 {
413 	return ubq->flags & UBLK_F_USER_COPY;
414 }
415 
416 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
417 {
418 	return ub->dev_info.flags & UBLK_F_USER_COPY;
419 }
420 
421 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
422 {
423 	return ub->dev_info.flags & UBLK_F_ZONED;
424 }
425 
426 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
427 {
428 	return ubq->flags & UBLK_F_ZONED;
429 }
430 
431 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
432 {
433 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
434 }
435 
436 #ifdef CONFIG_BLK_DEV_ZONED
437 
438 struct ublk_zoned_report_desc {
439 	__u64 sector;
440 	__u32 operation;
441 	__u32 nr_zones;
442 };
443 
444 static DEFINE_XARRAY(ublk_zoned_report_descs);
445 
446 static int ublk_zoned_insert_report_desc(const struct request *req,
447 		struct ublk_zoned_report_desc *desc)
448 {
449 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
450 			    desc, GFP_KERNEL);
451 }
452 
453 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
454 		const struct request *req)
455 {
456 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
457 }
458 
459 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
460 		const struct request *req)
461 {
462 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
463 }
464 
465 static int ublk_get_nr_zones(const struct ublk_device *ub)
466 {
467 	const struct ublk_param_basic *p = &ub->params.basic;
468 
469 	/* Zone size is a power of 2 */
470 	return p->dev_sectors >> ilog2(p->chunk_sectors);
471 }
472 
473 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
474 {
475 	return blk_revalidate_disk_zones(ub->ub_disk);
476 }
477 
478 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
479 {
480 	const struct ublk_param_zoned *p = &ub->params.zoned;
481 	int nr_zones;
482 
483 	if (!ublk_dev_is_zoned(ub))
484 		return -EINVAL;
485 
486 	if (!p->max_zone_append_sectors)
487 		return -EINVAL;
488 
489 	nr_zones = ublk_get_nr_zones(ub);
490 
491 	if (p->max_active_zones > nr_zones)
492 		return -EINVAL;
493 
494 	if (p->max_open_zones > nr_zones)
495 		return -EINVAL;
496 
497 	return 0;
498 }
499 
500 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
501 {
502 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
503 }
504 
505 /* Based on virtblk_alloc_report_buffer */
506 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
507 				      unsigned int nr_zones, size_t *buflen)
508 {
509 	struct request_queue *q = ublk->ub_disk->queue;
510 	size_t bufsize;
511 	void *buf;
512 
513 	nr_zones = min_t(unsigned int, nr_zones,
514 			 ublk->ub_disk->nr_zones);
515 
516 	bufsize = nr_zones * sizeof(struct blk_zone);
517 	bufsize =
518 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
519 
520 	while (bufsize >= sizeof(struct blk_zone)) {
521 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
522 		if (buf) {
523 			*buflen = bufsize;
524 			return buf;
525 		}
526 		bufsize >>= 1;
527 	}
528 
529 	*buflen = 0;
530 	return NULL;
531 }
532 
533 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
534 		      unsigned int nr_zones, struct blk_report_zones_args *args)
535 {
536 	struct ublk_device *ub = disk->private_data;
537 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
538 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
539 	unsigned int done_zones = 0;
540 	unsigned int max_zones_per_request;
541 	int ret;
542 	struct blk_zone *buffer;
543 	size_t buffer_length;
544 
545 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
546 			 nr_zones);
547 
548 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
549 	if (!buffer)
550 		return -ENOMEM;
551 
552 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
553 
554 	while (done_zones < nr_zones) {
555 		unsigned int remaining_zones = nr_zones - done_zones;
556 		unsigned int zones_in_request =
557 			min_t(unsigned int, remaining_zones, max_zones_per_request);
558 		struct request *req;
559 		struct ublk_zoned_report_desc desc;
560 		blk_status_t status;
561 
562 		memset(buffer, 0, buffer_length);
563 
564 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
565 		if (IS_ERR(req)) {
566 			ret = PTR_ERR(req);
567 			goto out;
568 		}
569 
570 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
571 		desc.sector = sector;
572 		desc.nr_zones = zones_in_request;
573 		ret = ublk_zoned_insert_report_desc(req, &desc);
574 		if (ret)
575 			goto free_req;
576 
577 		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
578 		if (ret)
579 			goto erase_desc;
580 
581 		status = blk_execute_rq(req, 0);
582 		ret = blk_status_to_errno(status);
583 erase_desc:
584 		ublk_zoned_erase_report_desc(req);
585 free_req:
586 		blk_mq_free_request(req);
587 		if (ret)
588 			goto out;
589 
590 		for (unsigned int i = 0; i < zones_in_request; i++) {
591 			struct blk_zone *zone = buffer + i;
592 
593 			/* A zero length zone means no more zones in this response */
594 			if (!zone->len)
595 				break;
596 
597 			ret = disk_report_zone(disk, zone, i, args);
598 			if (ret)
599 				goto out;
600 
601 			done_zones++;
602 			sector += zone_size_sectors;
603 
604 		}
605 	}
606 
607 	ret = done_zones;
608 
609 out:
610 	kvfree(buffer);
611 	return ret;
612 }
613 
614 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
615 					 struct request *req)
616 {
617 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
618 	struct ublk_io *io = &ubq->ios[req->tag];
619 	struct ublk_zoned_report_desc *desc;
620 	u32 ublk_op;
621 
622 	switch (req_op(req)) {
623 	case REQ_OP_ZONE_OPEN:
624 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
625 		break;
626 	case REQ_OP_ZONE_CLOSE:
627 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
628 		break;
629 	case REQ_OP_ZONE_FINISH:
630 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
631 		break;
632 	case REQ_OP_ZONE_RESET:
633 		ublk_op = UBLK_IO_OP_ZONE_RESET;
634 		break;
635 	case REQ_OP_ZONE_APPEND:
636 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
637 		break;
638 	case REQ_OP_ZONE_RESET_ALL:
639 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
640 		break;
641 	case REQ_OP_DRV_IN:
642 		desc = ublk_zoned_get_report_desc(req);
643 		if (!desc)
644 			return BLK_STS_IOERR;
645 		ublk_op = desc->operation;
646 		switch (ublk_op) {
647 		case UBLK_IO_OP_REPORT_ZONES:
648 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
649 			iod->nr_zones = desc->nr_zones;
650 			iod->start_sector = desc->sector;
651 			return BLK_STS_OK;
652 		default:
653 			return BLK_STS_IOERR;
654 		}
655 	case REQ_OP_DRV_OUT:
656 		/* We do not support drv_out */
657 		return BLK_STS_NOTSUPP;
658 	default:
659 		return BLK_STS_IOERR;
660 	}
661 
662 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
663 	iod->nr_sectors = blk_rq_sectors(req);
664 	iod->start_sector = blk_rq_pos(req);
665 	iod->addr = io->buf.addr;
666 
667 	return BLK_STS_OK;
668 }
669 
670 #else
671 
672 #define ublk_report_zones (NULL)
673 
674 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
675 {
676 	return -EOPNOTSUPP;
677 }
678 
679 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
680 {
681 }
682 
683 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
684 {
685 	return 0;
686 }
687 
688 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
689 					 struct request *req)
690 {
691 	return BLK_STS_NOTSUPP;
692 }
693 
694 #endif
695 
696 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
697 				      bool need_map, struct io_comp_batch *iob);
698 
699 static dev_t ublk_chr_devt;
700 static const struct class ublk_chr_class = {
701 	.name = "ublk-char",
702 };
703 
704 static DEFINE_IDR(ublk_index_idr);
705 static DEFINE_SPINLOCK(ublk_idr_lock);
706 static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
707 
708 static DEFINE_MUTEX(ublk_ctl_mutex);
709 
710 static struct ublk_batch_fetch_cmd *
711 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
712 {
713 	struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO);
714 
715 	if (fcmd) {
716 		fcmd->cmd = cmd;
717 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
718 	}
719 	return fcmd;
720 }
721 
722 static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
723 {
724 	kfree(fcmd);
725 }
726 
727 static void __ublk_release_fcmd(struct ublk_queue *ubq)
728 {
729 	WRITE_ONCE(ubq->active_fcmd, NULL);
730 }
731 
732 /*
733  * Nothing can move on, so clear ->active_fcmd, and the caller should stop
734  * dispatching
735  */
736 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
737 					const struct ublk_batch_io_data *data,
738 					struct ublk_batch_fetch_cmd *fcmd,
739 					int res)
740 {
741 	spin_lock(&ubq->evts_lock);
742 	list_del_init(&fcmd->node);
743 	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
744 	__ublk_release_fcmd(ubq);
745 	spin_unlock(&ubq->evts_lock);
746 
747 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
748 	ublk_batch_free_fcmd(fcmd);
749 }
750 
751 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
752 				     struct io_br_sel *sel,
753 				     unsigned int issue_flags)
754 {
755 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
756 		return -ENOBUFS;
757 	return 0;
758 }
759 
760 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
761 				       void __user *buf, const u16 *tag_buf,
762 				       unsigned int len)
763 {
764 	if (copy_to_user(buf, tag_buf, len))
765 		return -EFAULT;
766 	return len;
767 }
768 
769 #define UBLK_MAX_UBLKS UBLK_MINORS
770 
771 /*
772  * Max unprivileged ublk devices allowed to add
773  *
774  * It can be extended to one per-user limit in future or even controlled
775  * by cgroup.
776  */
777 static unsigned int unprivileged_ublks_max = 64;
778 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
779 
780 static struct miscdevice ublk_misc;
781 
782 static inline unsigned ublk_pos_to_hwq(loff_t pos)
783 {
784 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
785 		UBLK_QID_BITS_MASK;
786 }
787 
788 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
789 {
790 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
791 }
792 
793 static inline unsigned ublk_pos_to_tag(loff_t pos)
794 {
795 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
796 		UBLK_TAG_BITS_MASK;
797 }
798 
799 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
800 {
801 	const struct ublk_param_basic *p = &ub->params.basic;
802 
803 	if (p->attrs & UBLK_ATTR_READ_ONLY)
804 		set_disk_ro(ub->ub_disk, true);
805 
806 	set_capacity(ub->ub_disk, p->dev_sectors);
807 }
808 
809 static int ublk_integrity_flags(u32 flags)
810 {
811 	int ret_flags = 0;
812 
813 	if (flags & LBMD_PI_CAP_INTEGRITY) {
814 		flags &= ~LBMD_PI_CAP_INTEGRITY;
815 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
816 	}
817 	if (flags & LBMD_PI_CAP_REFTAG) {
818 		flags &= ~LBMD_PI_CAP_REFTAG;
819 		ret_flags |= BLK_INTEGRITY_REF_TAG;
820 	}
821 	return flags ? -EINVAL : ret_flags;
822 }
823 
824 static int ublk_integrity_pi_tuple_size(u8 csum_type)
825 {
826 	switch (csum_type) {
827 	case LBMD_PI_CSUM_NONE:
828 		return 0;
829 	case LBMD_PI_CSUM_IP:
830 	case LBMD_PI_CSUM_CRC16_T10DIF:
831 		return 8;
832 	case LBMD_PI_CSUM_CRC64_NVME:
833 		return 16;
834 	default:
835 		return -EINVAL;
836 	}
837 }
838 
839 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
840 {
841 	switch (csum_type) {
842 	case LBMD_PI_CSUM_NONE:
843 		return BLK_INTEGRITY_CSUM_NONE;
844 	case LBMD_PI_CSUM_IP:
845 		return BLK_INTEGRITY_CSUM_IP;
846 	case LBMD_PI_CSUM_CRC16_T10DIF:
847 		return BLK_INTEGRITY_CSUM_CRC;
848 	case LBMD_PI_CSUM_CRC64_NVME:
849 		return BLK_INTEGRITY_CSUM_CRC64;
850 	default:
851 		WARN_ON_ONCE(1);
852 		return BLK_INTEGRITY_CSUM_NONE;
853 	}
854 }
855 
856 static int ublk_validate_params(const struct ublk_device *ub)
857 {
858 	/* basic param is the only one which must be set */
859 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
860 		const struct ublk_param_basic *p = &ub->params.basic;
861 
862 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
863 			return -EINVAL;
864 
865 		if (p->logical_bs_shift > p->physical_bs_shift)
866 			return -EINVAL;
867 
868 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
869 			return -EINVAL;
870 
871 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
872 			return -EINVAL;
873 	} else
874 		return -EINVAL;
875 
876 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
877 		const struct ublk_param_discard *p = &ub->params.discard;
878 
879 		/* So far, only support single segment discard */
880 		if (p->max_discard_sectors && p->max_discard_segments != 1)
881 			return -EINVAL;
882 
883 		if (!p->discard_granularity)
884 			return -EINVAL;
885 	}
886 
887 	/* dev_t is read-only */
888 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
889 		return -EINVAL;
890 
891 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
892 		return ublk_dev_param_zoned_validate(ub);
893 	else if (ublk_dev_is_zoned(ub))
894 		return -EINVAL;
895 
896 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
897 		const struct ublk_param_dma_align *p = &ub->params.dma;
898 
899 		if (p->alignment >= PAGE_SIZE)
900 			return -EINVAL;
901 
902 		if (!is_power_of_2(p->alignment + 1))
903 			return -EINVAL;
904 	}
905 
906 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
907 		const struct ublk_param_segment *p = &ub->params.seg;
908 
909 		if (!is_power_of_2(p->seg_boundary_mask + 1))
910 			return -EINVAL;
911 
912 		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
913 			return -EINVAL;
914 		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
915 			return -EINVAL;
916 	}
917 
918 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
919 		const struct ublk_param_integrity *p = &ub->params.integrity;
920 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
921 		int flags = ublk_integrity_flags(p->flags);
922 
923 		if (!ublk_dev_support_integrity(ub))
924 			return -EINVAL;
925 		if (flags < 0)
926 			return flags;
927 		if (pi_tuple_size < 0)
928 			return pi_tuple_size;
929 		if (!p->metadata_size)
930 			return -EINVAL;
931 		if (p->csum_type == LBMD_PI_CSUM_NONE &&
932 		    p->flags & LBMD_PI_CAP_REFTAG)
933 			return -EINVAL;
934 		if (p->pi_offset + pi_tuple_size > p->metadata_size)
935 			return -EINVAL;
936 		if (p->interval_exp < SECTOR_SHIFT ||
937 		    p->interval_exp > ub->params.basic.logical_bs_shift)
938 			return -EINVAL;
939 	}
940 
941 	return 0;
942 }
943 
944 static void ublk_apply_params(struct ublk_device *ub)
945 {
946 	ublk_dev_param_basic_apply(ub);
947 
948 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
949 		ublk_dev_param_zoned_apply(ub);
950 }
951 
952 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
953 {
954 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
955 		!ublk_support_auto_buf_reg(ubq);
956 }
957 
958 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
959 {
960 	return !ublk_dev_support_user_copy(ub) &&
961 	       !ublk_dev_support_zero_copy(ub) &&
962 	       !ublk_dev_support_auto_buf_reg(ub);
963 }
964 
965 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
966 {
967 	/*
968 	 * read()/write() is involved in user copy, so request reference
969 	 * has to be grabbed
970 	 *
971 	 * for zero copy, request buffer need to be registered to io_uring
972 	 * buffer table, so reference is needed
973 	 *
974 	 * For auto buffer register, ublk server still may issue
975 	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
976 	 * so reference is required too.
977 	 */
978 	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
979 		ublk_support_auto_buf_reg(ubq);
980 }
981 
982 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
983 {
984 	return ublk_dev_support_user_copy(ub) ||
985 	       ublk_dev_support_zero_copy(ub) ||
986 	       ublk_dev_support_auto_buf_reg(ub);
987 }
988 
989 /*
990  * ublk IO Reference Counting Design
991  * ==================================
992  *
993  * For user-copy and zero-copy modes, ublk uses a split reference model with
994  * two counters that together track IO lifetime:
995  *
996  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
997  *   - io->task_registered_buffers: count of buffers registered on the IO task
998  *
999  * Key Invariant:
1000  * --------------
1001  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1002  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1003  * when no active references exist. After IO completion, both counters become
1004  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1005  * task_registered_buffers are 0.
1006  *
1007  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1008  * exit to determine if all references have been released.
1009  *
1010  * Why Split Counters:
1011  * -------------------
1012  * Buffers registered on the IO daemon task can use the lightweight
1013  * task_registered_buffers counter (simple increment/decrement) instead of
1014  * atomic refcount operations. The ublk_io_release() callback checks if
1015  * current == io->task to decide which counter to update.
1016  *
1017  * This optimization only applies before IO completion. At completion,
1018  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1019  * After that, all subsequent buffer unregistrations must use the atomic ref
1020  * since they may be releasing the last reference.
1021  *
1022  * Reference Lifecycle:
1023  * --------------------
1024  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1025  *
1026  * 2. During IO processing:
1027  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1028  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1029  *    - Buffer unregister callback (ublk_io_release):
1030  *      * If on-task: task_registered_buffers--
1031  *      * If off-task: ref-- via ublk_put_req_ref()
1032  *
1033  * 3. ublk_sub_req_ref() at IO completion:
1034  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1035  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1036  *    - This effectively collapses task_registered_buffers into the atomic ref,
1037  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1038  *      buffers that were already counted
1039  *
1040  * Example (zero-copy, register on-task, unregister off-task):
1041  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1042  *   - Register buffer on-task: task_registered_buffers = 1
1043  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1044  *   - Completion via ublk_sub_req_ref():
1045  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1046  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1047  *
1048  * Example (auto buffer registration):
1049  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1050  *
1051  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1052  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1053  *   - Completion via ublk_sub_req_ref():
1054  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1055  *
1056  * Example (zero-copy, ublk server killed):
1057  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1058  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1059  *
1060  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1061  *   - Register buffer on-task: task_registered_buffers = 1
1062  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1063  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1064  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1065  *   - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by
1066  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1067  *     and abort pending requests
1068  *
1069  * Batch IO Special Case:
1070  * ----------------------
1071  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1072  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1073  * task_registered_buffers counter still tracks registered buffers for the
1074  * invariant check, even though the callback doesn't decrement it.
1075  *
1076  * Note: updating task_registered_buffers is protected by io->lock.
1077  */
1078 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1079 		struct ublk_io *io)
1080 {
1081 	if (ublk_need_req_ref(ubq))
1082 		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1083 }
1084 
1085 static inline bool ublk_get_req_ref(struct ublk_io *io)
1086 {
1087 	return refcount_inc_not_zero(&io->ref);
1088 }
1089 
1090 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1091 {
1092 	if (!refcount_dec_and_test(&io->ref))
1093 		return;
1094 
1095 	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1096 	__ublk_complete_rq(req, io, false, NULL);
1097 }
1098 
1099 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1100 {
1101 	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1102 
1103 	io->task_registered_buffers = 0;
1104 	return refcount_sub_and_test(sub_refs, &io->ref);
1105 }
1106 
1107 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1108 {
1109 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1110 }
1111 
1112 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1113 {
1114 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1115 }
1116 
1117 /* Called in slow path only, keep it noinline for trace purpose */
1118 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1119 {
1120 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1121 		return ub;
1122 	return NULL;
1123 }
1124 
1125 /* Called in slow path only, keep it noinline for trace purpose */
1126 static noinline void ublk_put_device(struct ublk_device *ub)
1127 {
1128 	put_device(&ub->cdev_dev);
1129 }
1130 
1131 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1132 		int qid)
1133 {
1134 	return dev->queues[qid];
1135 }
1136 
1137 static inline bool ublk_rq_has_data(const struct request *rq)
1138 {
1139 	return bio_has_data(rq->bio);
1140 }
1141 
1142 static inline struct ublksrv_io_desc *
1143 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1144 {
1145 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
1146 }
1147 
1148 static inline int __ublk_queue_cmd_buf_size(int depth)
1149 {
1150 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1151 }
1152 
1153 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1154 {
1155 	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1156 }
1157 
1158 static int ublk_max_cmd_buf_size(void)
1159 {
1160 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1161 }
1162 
1163 /*
1164  * Should I/O outstanding to the ublk server when it exits be reissued?
1165  * If not, outstanding I/O will get errors.
1166  */
1167 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1168 {
1169 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1170 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1171 }
1172 
1173 /*
1174  * Should I/O issued while there is no ublk server queue? If not, I/O
1175  * issued while there is no ublk server will get errors.
1176  */
1177 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1178 {
1179 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1180 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1181 }
1182 
1183 /*
1184  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1185  * of the device flags for smaller cache footprint - better for fast
1186  * paths.
1187  */
1188 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1189 {
1190 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1191 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1192 }
1193 
1194 /*
1195  * Should ublk devices be stopped (i.e. no recovery possible) when the
1196  * ublk server exits? If not, devices can be used again by a future
1197  * incarnation of a ublk server via the start_recovery/end_recovery
1198  * commands.
1199  */
1200 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1201 {
1202 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1203 }
1204 
1205 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1206 {
1207 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1208 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1209 }
1210 
1211 static void ublk_free_disk(struct gendisk *disk)
1212 {
1213 	struct ublk_device *ub = disk->private_data;
1214 
1215 	clear_bit(UB_STATE_USED, &ub->state);
1216 	ublk_put_device(ub);
1217 }
1218 
1219 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1220 		unsigned int *owner_gid)
1221 {
1222 	kuid_t uid;
1223 	kgid_t gid;
1224 
1225 	current_uid_gid(&uid, &gid);
1226 
1227 	*owner_uid = from_kuid(&init_user_ns, uid);
1228 	*owner_gid = from_kgid(&init_user_ns, gid);
1229 }
1230 
1231 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1232 {
1233 	struct ublk_device *ub = disk->private_data;
1234 
1235 	if (capable(CAP_SYS_ADMIN))
1236 		return 0;
1237 
1238 	/*
1239 	 * If it is one unprivileged device, only owner can open
1240 	 * the disk. Otherwise it could be one trap made by one
1241 	 * evil user who grants this disk's privileges to other
1242 	 * users deliberately.
1243 	 *
1244 	 * This way is reasonable too given anyone can create
1245 	 * unprivileged device, and no need other's grant.
1246 	 */
1247 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1248 		unsigned int curr_uid, curr_gid;
1249 
1250 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1251 
1252 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1253 				ub->dev_info.owner_gid)
1254 			return -EPERM;
1255 	}
1256 
1257 	if (ub->block_open)
1258 		return -ENXIO;
1259 
1260 	return 0;
1261 }
1262 
1263 static const struct block_device_operations ub_fops = {
1264 	.owner =	THIS_MODULE,
1265 	.open =		ublk_open,
1266 	.free_disk =	ublk_free_disk,
1267 	.report_zones =	ublk_report_zones,
1268 };
1269 
1270 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1271 				struct iov_iter *uiter, int dir, size_t *done)
1272 {
1273 	unsigned len;
1274 	void *bv_buf;
1275 	size_t copied;
1276 
1277 	if (*offset >= bv->bv_len) {
1278 		*offset -= bv->bv_len;
1279 		return true;
1280 	}
1281 
1282 	len = bv->bv_len - *offset;
1283 	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1284 	if (dir == ITER_DEST)
1285 		copied = copy_to_iter(bv_buf, len, uiter);
1286 	else
1287 		copied = copy_from_iter(bv_buf, len, uiter);
1288 
1289 	kunmap_local(bv_buf);
1290 
1291 	*done += copied;
1292 	if (copied < len)
1293 		return false;
1294 
1295 	*offset = 0;
1296 	return true;
1297 }
1298 
1299 /*
1300  * Copy data between request pages and io_iter, and 'offset'
1301  * is the start point of linear offset of request.
1302  */
1303 static size_t ublk_copy_user_pages(const struct request *req,
1304 		unsigned offset, struct iov_iter *uiter, int dir)
1305 {
1306 	struct req_iterator iter;
1307 	struct bio_vec bv;
1308 	size_t done = 0;
1309 
1310 	rq_for_each_segment(bv, req, iter) {
1311 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1312 			break;
1313 	}
1314 	return done;
1315 }
1316 
1317 #ifdef CONFIG_BLK_DEV_INTEGRITY
1318 static size_t ublk_copy_user_integrity(const struct request *req,
1319 		unsigned offset, struct iov_iter *uiter, int dir)
1320 {
1321 	size_t done = 0;
1322 	struct bio *bio = req->bio;
1323 	struct bvec_iter iter;
1324 	struct bio_vec iv;
1325 
1326 	if (!blk_integrity_rq(req))
1327 		return 0;
1328 
1329 	bio_for_each_integrity_vec(iv, bio, iter) {
1330 		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1331 			break;
1332 	}
1333 
1334 	return done;
1335 }
1336 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1337 static size_t ublk_copy_user_integrity(const struct request *req,
1338 		unsigned offset, struct iov_iter *uiter, int dir)
1339 {
1340 	return 0;
1341 }
1342 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1343 
1344 static inline bool ublk_need_map_req(const struct request *req)
1345 {
1346 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1347 }
1348 
1349 static inline bool ublk_need_unmap_req(const struct request *req)
1350 {
1351 	return ublk_rq_has_data(req) &&
1352 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1353 }
1354 
1355 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1356 				const struct request *req,
1357 				const struct ublk_io *io)
1358 {
1359 	const unsigned int rq_bytes = blk_rq_bytes(req);
1360 
1361 	if (!ublk_need_map_io(ubq))
1362 		return rq_bytes;
1363 
1364 	/*
1365 	 * no zero copy, we delay copy WRITE request data into ublksrv
1366 	 * context and the big benefit is that pinning pages in current
1367 	 * context is pretty fast, see ublk_pin_user_pages
1368 	 */
1369 	if (ublk_need_map_req(req)) {
1370 		struct iov_iter iter;
1371 		const int dir = ITER_DEST;
1372 
1373 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1374 		return ublk_copy_user_pages(req, 0, &iter, dir);
1375 	}
1376 	return rq_bytes;
1377 }
1378 
1379 static unsigned int ublk_unmap_io(bool need_map,
1380 		const struct request *req,
1381 		const struct ublk_io *io)
1382 {
1383 	const unsigned int rq_bytes = blk_rq_bytes(req);
1384 
1385 	if (!need_map)
1386 		return rq_bytes;
1387 
1388 	if (ublk_need_unmap_req(req)) {
1389 		struct iov_iter iter;
1390 		const int dir = ITER_SOURCE;
1391 
1392 		WARN_ON_ONCE(io->res > rq_bytes);
1393 
1394 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1395 		return ublk_copy_user_pages(req, 0, &iter, dir);
1396 	}
1397 	return rq_bytes;
1398 }
1399 
1400 static inline unsigned int ublk_req_build_flags(struct request *req)
1401 {
1402 	unsigned flags = 0;
1403 
1404 	if (req->cmd_flags & REQ_FAILFAST_DEV)
1405 		flags |= UBLK_IO_F_FAILFAST_DEV;
1406 
1407 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1408 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1409 
1410 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1411 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
1412 
1413 	if (req->cmd_flags & REQ_META)
1414 		flags |= UBLK_IO_F_META;
1415 
1416 	if (req->cmd_flags & REQ_FUA)
1417 		flags |= UBLK_IO_F_FUA;
1418 
1419 	if (req->cmd_flags & REQ_NOUNMAP)
1420 		flags |= UBLK_IO_F_NOUNMAP;
1421 
1422 	if (req->cmd_flags & REQ_SWAP)
1423 		flags |= UBLK_IO_F_SWAP;
1424 
1425 	if (blk_integrity_rq(req))
1426 		flags |= UBLK_IO_F_INTEGRITY;
1427 
1428 	return flags;
1429 }
1430 
1431 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1432 {
1433 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1434 	struct ublk_io *io = &ubq->ios[req->tag];
1435 	u32 ublk_op;
1436 
1437 	switch (req_op(req)) {
1438 	case REQ_OP_READ:
1439 		ublk_op = UBLK_IO_OP_READ;
1440 		break;
1441 	case REQ_OP_WRITE:
1442 		ublk_op = UBLK_IO_OP_WRITE;
1443 		break;
1444 	case REQ_OP_FLUSH:
1445 		ublk_op = UBLK_IO_OP_FLUSH;
1446 		break;
1447 	case REQ_OP_DISCARD:
1448 		ublk_op = UBLK_IO_OP_DISCARD;
1449 		break;
1450 	case REQ_OP_WRITE_ZEROES:
1451 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1452 		break;
1453 	default:
1454 		if (ublk_queue_is_zoned(ubq))
1455 			return ublk_setup_iod_zoned(ubq, req);
1456 		return BLK_STS_IOERR;
1457 	}
1458 
1459 	/* need to translate since kernel may change */
1460 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
1461 	iod->nr_sectors = blk_rq_sectors(req);
1462 	iod->start_sector = blk_rq_pos(req);
1463 	iod->addr = io->buf.addr;
1464 
1465 	return BLK_STS_OK;
1466 }
1467 
1468 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1469 		struct io_uring_cmd *ioucmd)
1470 {
1471 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1472 }
1473 
1474 static void ublk_end_request(struct request *req, blk_status_t error)
1475 {
1476 	local_bh_disable();
1477 	blk_mq_end_request(req, error);
1478 	local_bh_enable();
1479 }
1480 
1481 /* todo: handle partial completion */
1482 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1483 				      bool need_map, struct io_comp_batch *iob)
1484 {
1485 	unsigned int unmapped_bytes;
1486 	blk_status_t res = BLK_STS_OK;
1487 	bool requeue;
1488 
1489 	/* failed read IO if nothing is read */
1490 	if (!io->res && req_op(req) == REQ_OP_READ)
1491 		io->res = -EIO;
1492 
1493 	if (io->res < 0) {
1494 		res = errno_to_blk_status(io->res);
1495 		goto exit;
1496 	}
1497 
1498 	/*
1499 	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them
1500 	 * directly.
1501 	 *
1502 	 * Both the two needn't unmap.
1503 	 */
1504 	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1505 	    req_op(req) != REQ_OP_DRV_IN)
1506 		goto exit;
1507 
1508 	/* for READ request, writing data in iod->addr to rq buffers */
1509 	unmapped_bytes = ublk_unmap_io(need_map, req, io);
1510 
1511 	/*
1512 	 * Extremely impossible since we got data filled in just before
1513 	 *
1514 	 * Re-read simply for this unlikely case.
1515 	 */
1516 	if (unlikely(unmapped_bytes < io->res))
1517 		io->res = unmapped_bytes;
1518 
1519 	/*
1520 	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1521 	 * happens off this path, then that will prevent ublk's blkdev_release()
1522 	 * from being called on current's task work, see fput() implementation.
1523 	 *
1524 	 * Otherwise, ublk server may not provide forward progress in case of
1525 	 * reading the partition table from bdev_open() with disk->open_mutex
1526 	 * held, and causes dead lock as we could already be holding
1527 	 * disk->open_mutex here.
1528 	 *
1529 	 * Preferably we would not be doing IO with a mutex held that is also
1530 	 * used for release, but this work-around will suffice for now.
1531 	 */
1532 	local_bh_disable();
1533 	requeue = blk_update_request(req, BLK_STS_OK, io->res);
1534 	local_bh_enable();
1535 	if (requeue)
1536 		blk_mq_requeue_request(req, true);
1537 	else if (likely(!blk_should_fake_timeout(req->q))) {
1538 		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1539 			return;
1540 		__blk_mq_end_request(req, BLK_STS_OK);
1541 	}
1542 
1543 	return;
1544 exit:
1545 	ublk_end_request(req, res);
1546 }
1547 
1548 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1549 						     struct request *req)
1550 {
1551 	/* read cmd first because req will overwrite it */
1552 	struct io_uring_cmd *cmd = io->cmd;
1553 
1554 	/* mark this cmd owned by ublksrv */
1555 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1556 
1557 	/*
1558 	 * clear ACTIVE since we are done with this sqe/cmd slot
1559 	 * We can only accept io cmd in case of being not active.
1560 	 */
1561 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1562 
1563 	io->req = req;
1564 	return cmd;
1565 }
1566 
1567 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1568 				 int res, unsigned issue_flags)
1569 {
1570 	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1571 
1572 	/* tell ublksrv one io request is coming */
1573 	io_uring_cmd_done(cmd, res, issue_flags);
1574 }
1575 
1576 #define UBLK_REQUEUE_DELAY_MS	3
1577 
1578 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1579 		struct request *rq)
1580 {
1581 	/* We cannot process this rq so just requeue it. */
1582 	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1583 		blk_mq_requeue_request(rq, false);
1584 	else
1585 		ublk_end_request(rq, BLK_STS_IOERR);
1586 }
1587 
1588 static void
1589 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1590 {
1591 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1592 
1593 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1594 }
1595 
1596 enum auto_buf_reg_res {
1597 	AUTO_BUF_REG_FAIL,
1598 	AUTO_BUF_REG_FALLBACK,
1599 	AUTO_BUF_REG_OK,
1600 };
1601 
1602 /*
1603  * Setup io state after auto buffer registration.
1604  *
1605  * Must be called after ublk_auto_buf_register() is done.
1606  * Caller must hold io->lock in batch context.
1607  */
1608 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1609 				   struct request *req, struct ublk_io *io,
1610 				   struct io_uring_cmd *cmd,
1611 				   enum auto_buf_reg_res res)
1612 {
1613 	if (res == AUTO_BUF_REG_OK) {
1614 		io->task_registered_buffers = 1;
1615 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1616 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1617 	}
1618 	ublk_init_req_ref(ubq, io);
1619 	__ublk_prep_compl_io_cmd(io, req);
1620 }
1621 
1622 /* Register request bvec to io_uring for auto buffer registration. */
1623 static enum auto_buf_reg_res
1624 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1625 		       struct ublk_io *io, struct io_uring_cmd *cmd,
1626 		       unsigned int issue_flags)
1627 {
1628 	int ret;
1629 
1630 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1631 				      io->buf.auto_reg.index, issue_flags);
1632 	if (ret) {
1633 		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1634 			ublk_auto_buf_reg_fallback(ubq, req->tag);
1635 			return AUTO_BUF_REG_FALLBACK;
1636 		}
1637 		ublk_end_request(req, BLK_STS_IOERR);
1638 		return AUTO_BUF_REG_FAIL;
1639 	}
1640 
1641 	return AUTO_BUF_REG_OK;
1642 }
1643 
1644 /*
1645  * Dispatch IO to userspace with auto buffer registration.
1646  *
1647  * Only called in non-batch context from task work, io->lock not held.
1648  */
1649 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1650 				   struct request *req, struct ublk_io *io,
1651 				   struct io_uring_cmd *cmd,
1652 				   unsigned int issue_flags)
1653 {
1654 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1655 			issue_flags);
1656 
1657 	if (res != AUTO_BUF_REG_FAIL) {
1658 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1659 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1660 	}
1661 }
1662 
1663 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1664 			  struct ublk_io *io)
1665 {
1666 	unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1667 
1668 	/* partially mapped, update io descriptor */
1669 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1670 		/*
1671 		 * Nothing mapped, retry until we succeed.
1672 		 *
1673 		 * We may never succeed in mapping any bytes here because
1674 		 * of OOM. TODO: reserve one buffer with single page pinned
1675 		 * for providing forward progress guarantee.
1676 		 */
1677 		if (unlikely(!mapped_bytes)) {
1678 			blk_mq_requeue_request(req, false);
1679 			blk_mq_delay_kick_requeue_list(req->q,
1680 					UBLK_REQUEUE_DELAY_MS);
1681 			return false;
1682 		}
1683 
1684 		ublk_get_iod(ubq, req->tag)->nr_sectors =
1685 			mapped_bytes >> 9;
1686 	}
1687 
1688 	return true;
1689 }
1690 
1691 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1692 {
1693 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1694 	int tag = req->tag;
1695 	struct ublk_io *io = &ubq->ios[tag];
1696 
1697 	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1698 			__func__, ubq->q_id, req->tag, io->flags,
1699 			ublk_get_iod(ubq, req->tag)->addr);
1700 
1701 	/*
1702 	 * Task is exiting if either:
1703 	 *
1704 	 * (1) current != io->task.
1705 	 * io_uring_cmd_complete_in_task() tries to run task_work
1706 	 * in a workqueue if cmd's task is PF_EXITING.
1707 	 *
1708 	 * (2) current->flags & PF_EXITING.
1709 	 */
1710 	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1711 		__ublk_abort_rq(ubq, req);
1712 		return;
1713 	}
1714 
1715 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1716 		/*
1717 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1718 		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1719 		 * and notify it.
1720 		 */
1721 		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1722 		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1723 				__func__, ubq->q_id, req->tag, io->flags);
1724 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1725 				     issue_flags);
1726 		return;
1727 	}
1728 
1729 	if (!ublk_start_io(ubq, req, io))
1730 		return;
1731 
1732 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1733 		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1734 	} else {
1735 		ublk_init_req_ref(ubq, io);
1736 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1737 	}
1738 }
1739 
1740 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1741 				       const struct ublk_batch_io_data *data,
1742 				       unsigned short tag)
1743 {
1744 	struct ublk_device *ub = data->ub;
1745 	struct ublk_io *io = &ubq->ios[tag];
1746 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1747 	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1748 	struct io_uring_cmd *cmd = data->cmd;
1749 
1750 	if (!ublk_start_io(ubq, req, io))
1751 		return false;
1752 
1753 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1754 		res = ublk_auto_buf_register(ubq, req, io, cmd,
1755 				data->issue_flags);
1756 
1757 		if (res == AUTO_BUF_REG_FAIL)
1758 			return false;
1759 	}
1760 
1761 	ublk_io_lock(io);
1762 	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1763 	ublk_io_unlock(io);
1764 
1765 	return true;
1766 }
1767 
1768 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1769 				     const struct ublk_batch_io_data *data,
1770 				     unsigned short *tag_buf,
1771 				     unsigned int len)
1772 {
1773 	bool has_unused = false;
1774 	unsigned int i;
1775 
1776 	for (i = 0; i < len; i++) {
1777 		unsigned short tag = tag_buf[i];
1778 
1779 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1780 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1781 			has_unused = true;
1782 		}
1783 	}
1784 
1785 	return has_unused;
1786 }
1787 
1788 /*
1789  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1790  * Returns the new length after filtering.
1791  */
1792 static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1793 					    unsigned int len)
1794 {
1795 	unsigned int i, j;
1796 
1797 	for (i = 0, j = 0; i < len; i++) {
1798 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1799 			if (i != j)
1800 				tag_buf[j] = tag_buf[i];
1801 			j++;
1802 		}
1803 	}
1804 
1805 	return j;
1806 }
1807 
1808 #define MAX_NR_TAG 128
1809 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1810 				 const struct ublk_batch_io_data *data,
1811 				 struct ublk_batch_fetch_cmd *fcmd)
1812 {
1813 	const unsigned int tag_sz = sizeof(unsigned short);
1814 	unsigned short tag_buf[MAX_NR_TAG];
1815 	struct io_br_sel sel;
1816 	size_t len = 0;
1817 	bool needs_filter;
1818 	int ret;
1819 
1820 	WARN_ON_ONCE(data->cmd != fcmd->cmd);
1821 
1822 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1823 					 data->issue_flags);
1824 	if (sel.val < 0)
1825 		return sel.val;
1826 	if (!sel.addr)
1827 		return -ENOBUFS;
1828 
1829 	/* single reader needn't lock and sizeof(kfifo element) is 2 bytes */
1830 	len = min(len, sizeof(tag_buf)) / tag_sz;
1831 	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1832 
1833 	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1834 	/* Filter out unused tags before posting to userspace */
1835 	if (unlikely(needs_filter)) {
1836 		int new_len = ublk_filter_unused_tags(tag_buf, len);
1837 
1838 		/* return actual length if all are failed or requeued */
1839 		if (!new_len) {
1840 			/* release the selected buffer */
1841 			sel.val = 0;
1842 			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1843 						&sel, data->issue_flags));
1844 			return len;
1845 		}
1846 		len = new_len;
1847 	}
1848 
1849 	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1850 	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1851 	if (unlikely(ret < 0)) {
1852 		int i, res;
1853 
1854 		/*
1855 		 * Undo prep state for all IOs since userspace never received them.
1856 		 * This restores IOs to pre-prepared state so they can be cleanly
1857 		 * re-prepared when tags are pulled from FIFO again.
1858 		 */
1859 		for (i = 0; i < len; i++) {
1860 			struct ublk_io *io = &ubq->ios[tag_buf[i]];
1861 			int index = -1;
1862 
1863 			ublk_io_lock(io);
1864 			if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1865 				index = io->buf.auto_reg.index;
1866 			io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1867 			io->flags |= UBLK_IO_FLAG_ACTIVE;
1868 			ublk_io_unlock(io);
1869 
1870 			if (index != -1)
1871 				io_buffer_unregister_bvec(data->cmd, index,
1872 						data->issue_flags);
1873 		}
1874 
1875 		res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1876 			tag_buf, len, &ubq->evts_lock);
1877 
1878 		pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1879 				"tags(%d %zu) ret %d\n", __func__, res, len,
1880 				ret);
1881 	}
1882 	return ret;
1883 }
1884 
1885 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1886 		struct ublk_queue *ubq)
1887 {
1888 	struct ublk_batch_fetch_cmd *fcmd;
1889 
1890 	lockdep_assert_held(&ubq->evts_lock);
1891 
1892 	/*
1893 	 * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd.
1894 	 *
1895 	 * The pair is the smp_mb() in ublk_batch_dispatch().
1896 	 *
1897 	 * If ubq->active_fcmd is observed as non-NULL, the new added tags
1898 	 * can be visisible in ublk_batch_dispatch() with the barrier pairing.
1899 	 */
1900 	smp_mb();
1901 	if (READ_ONCE(ubq->active_fcmd)) {
1902 		fcmd = NULL;
1903 	} else {
1904 		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1905 				struct ublk_batch_fetch_cmd, node);
1906 		WRITE_ONCE(ubq->active_fcmd, fcmd);
1907 	}
1908 	return fcmd;
1909 }
1910 
1911 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1912 {
1913 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1914 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1915 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1916 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
1917 	struct ublk_batch_io_data data = {
1918 		.ub = pdu->ubq->dev,
1919 		.cmd = fcmd->cmd,
1920 		.issue_flags = issue_flags,
1921 	};
1922 
1923 	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
1924 
1925 	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
1926 }
1927 
1928 static void
1929 ublk_batch_dispatch(struct ublk_queue *ubq,
1930 		    const struct ublk_batch_io_data *data,
1931 		    struct ublk_batch_fetch_cmd *fcmd)
1932 {
1933 	struct ublk_batch_fetch_cmd *new_fcmd;
1934 	unsigned tried = 0;
1935 	int ret = 0;
1936 
1937 again:
1938 	while (!ublk_io_evts_empty(ubq)) {
1939 		ret = __ublk_batch_dispatch(ubq, data, fcmd);
1940 		if (ret <= 0)
1941 			break;
1942 	}
1943 
1944 	if (ret < 0) {
1945 		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
1946 		return;
1947 	}
1948 
1949 	__ublk_release_fcmd(ubq);
1950 	/*
1951 	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
1952 	 * checking ubq->evts_fifo.
1953 	 *
1954 	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
1955 	 */
1956 	smp_mb();
1957 	if (likely(ublk_io_evts_empty(ubq)))
1958 		return;
1959 
1960 	spin_lock(&ubq->evts_lock);
1961 	new_fcmd = __ublk_acquire_fcmd(ubq);
1962 	spin_unlock(&ubq->evts_lock);
1963 
1964 	if (!new_fcmd)
1965 		return;
1966 
1967 	/* Avoid lockup by allowing to handle at most 32 batches */
1968 	if (new_fcmd == fcmd && tried++ < 32)
1969 		goto again;
1970 
1971 	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
1972 }
1973 
1974 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1975 {
1976 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1977 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1978 	struct ublk_queue *ubq = pdu->ubq;
1979 
1980 	ublk_dispatch_req(ubq, pdu->req);
1981 }
1982 
1983 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
1984 {
1985 	unsigned short tag = rq->tag;
1986 	struct ublk_batch_fetch_cmd *fcmd = NULL;
1987 
1988 	spin_lock(&ubq->evts_lock);
1989 	kfifo_put(&ubq->evts_fifo, tag);
1990 	if (last)
1991 		fcmd = __ublk_acquire_fcmd(ubq);
1992 	spin_unlock(&ubq->evts_lock);
1993 
1994 	if (fcmd)
1995 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
1996 }
1997 
1998 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1999 {
2000 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2001 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2002 
2003 	pdu->req = rq;
2004 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2005 }
2006 
2007 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2008 {
2009 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2010 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2011 	struct request *rq = pdu->req_list;
2012 	struct request *next;
2013 
2014 	do {
2015 		next = rq->rq_next;
2016 		rq->rq_next = NULL;
2017 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2018 		rq = next;
2019 	} while (rq);
2020 }
2021 
2022 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2023 {
2024 	struct io_uring_cmd *cmd = io->cmd;
2025 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2026 
2027 	pdu->req_list = rq_list_peek(l);
2028 	rq_list_init(l);
2029 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2030 }
2031 
2032 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2033 {
2034 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2035 	pid_t tgid = ubq->dev->ublksrv_tgid;
2036 	struct task_struct *p;
2037 	struct pid *pid;
2038 
2039 	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2040 		return BLK_EH_RESET_TIMER;
2041 
2042 	if (unlikely(!tgid))
2043 		return BLK_EH_RESET_TIMER;
2044 
2045 	rcu_read_lock();
2046 	pid = find_vpid(tgid);
2047 	p = pid_task(pid, PIDTYPE_PID);
2048 	if (p)
2049 		send_sig(SIGKILL, p, 0);
2050 	rcu_read_unlock();
2051 	return BLK_EH_DONE;
2052 }
2053 
2054 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2055 				  bool check_cancel)
2056 {
2057 	blk_status_t res;
2058 
2059 	if (unlikely(READ_ONCE(ubq->fail_io)))
2060 		return BLK_STS_TARGET;
2061 
2062 	/* With recovery feature enabled, force_abort is set in
2063 	 * ublk_stop_dev() before calling del_gendisk(). We have to
2064 	 * abort all requeued and new rqs here to let del_gendisk()
2065 	 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
2066 	 * to avoid UAF on io_uring ctx.
2067 	 *
2068 	 * Note: force_abort is guaranteed to be seen because it is set
2069 	 * before request queue is unqiuesced.
2070 	 */
2071 	if (ublk_nosrv_should_queue_io(ubq) &&
2072 	    unlikely(READ_ONCE(ubq->force_abort)))
2073 		return BLK_STS_IOERR;
2074 
2075 	if (check_cancel && unlikely(ubq->canceling))
2076 		return BLK_STS_IOERR;
2077 
2078 	/* fill iod to slot in io cmd buffer */
2079 	res = ublk_setup_iod(ubq, rq);
2080 	if (unlikely(res != BLK_STS_OK))
2081 		return BLK_STS_IOERR;
2082 
2083 	blk_mq_start_request(rq);
2084 	return BLK_STS_OK;
2085 }
2086 
2087 /*
2088  * Common helper for queue_rq that handles request preparation and
2089  * cancellation checks. Returns status and sets should_queue to indicate
2090  * whether the caller should proceed with queuing the request.
2091  */
2092 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2093 						   struct request *rq,
2094 						   bool *should_queue)
2095 {
2096 	blk_status_t res;
2097 
2098 	res = ublk_prep_req(ubq, rq, false);
2099 	if (res != BLK_STS_OK) {
2100 		*should_queue = false;
2101 		return res;
2102 	}
2103 
2104 	/*
2105 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2106 	 * is dealt with, otherwise this request may not be failed in case
2107 	 * of recovery, and cause hang when deleting disk
2108 	 */
2109 	if (unlikely(ubq->canceling)) {
2110 		*should_queue = false;
2111 		__ublk_abort_rq(ubq, rq);
2112 		return BLK_STS_OK;
2113 	}
2114 
2115 	*should_queue = true;
2116 	return BLK_STS_OK;
2117 }
2118 
2119 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2120 		const struct blk_mq_queue_data *bd)
2121 {
2122 	struct ublk_queue *ubq = hctx->driver_data;
2123 	struct request *rq = bd->rq;
2124 	bool should_queue;
2125 	blk_status_t res;
2126 
2127 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2128 	if (!should_queue)
2129 		return res;
2130 
2131 	ublk_queue_cmd(ubq, rq);
2132 	return BLK_STS_OK;
2133 }
2134 
2135 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2136 		const struct blk_mq_queue_data *bd)
2137 {
2138 	struct ublk_queue *ubq = hctx->driver_data;
2139 	struct request *rq = bd->rq;
2140 	bool should_queue;
2141 	blk_status_t res;
2142 
2143 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2144 	if (!should_queue)
2145 		return res;
2146 
2147 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2148 	return BLK_STS_OK;
2149 }
2150 
2151 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2152 					     const struct ublk_io *io2)
2153 {
2154 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2155 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2156 		(io->task == io2->task);
2157 }
2158 
2159 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2160 {
2161 	struct ublk_queue *ubq = hctx->driver_data;
2162 	struct ublk_batch_fetch_cmd *fcmd;
2163 
2164 	spin_lock(&ubq->evts_lock);
2165 	fcmd = __ublk_acquire_fcmd(ubq);
2166 	spin_unlock(&ubq->evts_lock);
2167 
2168 	if (fcmd)
2169 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2170 }
2171 
2172 static void ublk_queue_rqs(struct rq_list *rqlist)
2173 {
2174 	struct rq_list requeue_list = { };
2175 	struct rq_list submit_list = { };
2176 	struct ublk_io *io = NULL;
2177 	struct request *req;
2178 
2179 	while ((req = rq_list_pop(rqlist))) {
2180 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2181 		struct ublk_io *this_io = &this_q->ios[req->tag];
2182 
2183 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2184 			rq_list_add_tail(&requeue_list, req);
2185 			continue;
2186 		}
2187 
2188 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
2189 				!rq_list_empty(&submit_list))
2190 			ublk_queue_cmd_list(io, &submit_list);
2191 		io = this_io;
2192 		rq_list_add_tail(&submit_list, req);
2193 	}
2194 
2195 	if (!rq_list_empty(&submit_list))
2196 		ublk_queue_cmd_list(io, &submit_list);
2197 	*rqlist = requeue_list;
2198 }
2199 
2200 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2201 {
2202 	unsigned short tags[MAX_NR_TAG];
2203 	struct ublk_batch_fetch_cmd *fcmd;
2204 	struct request *rq;
2205 	unsigned cnt = 0;
2206 
2207 	spin_lock(&ubq->evts_lock);
2208 	rq_list_for_each(l, rq) {
2209 		tags[cnt++] = (unsigned short)rq->tag;
2210 		if (cnt >= MAX_NR_TAG) {
2211 			kfifo_in(&ubq->evts_fifo, tags, cnt);
2212 			cnt = 0;
2213 		}
2214 	}
2215 	if (cnt)
2216 		kfifo_in(&ubq->evts_fifo, tags, cnt);
2217 	fcmd = __ublk_acquire_fcmd(ubq);
2218 	spin_unlock(&ubq->evts_lock);
2219 
2220 	rq_list_init(l);
2221 	if (fcmd)
2222 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2223 }
2224 
2225 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2226 {
2227 	struct rq_list requeue_list = { };
2228 	struct rq_list submit_list = { };
2229 	struct ublk_queue *ubq = NULL;
2230 	struct request *req;
2231 
2232 	while ((req = rq_list_pop(rqlist))) {
2233 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2234 
2235 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2236 			rq_list_add_tail(&requeue_list, req);
2237 			continue;
2238 		}
2239 
2240 		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2241 			ublk_batch_queue_cmd_list(ubq, &submit_list);
2242 		ubq = this_q;
2243 		rq_list_add_tail(&submit_list, req);
2244 	}
2245 
2246 	if (!rq_list_empty(&submit_list))
2247 		ublk_batch_queue_cmd_list(ubq, &submit_list);
2248 	*rqlist = requeue_list;
2249 }
2250 
2251 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2252 		unsigned int hctx_idx)
2253 {
2254 	struct ublk_device *ub = driver_data;
2255 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2256 
2257 	hctx->driver_data = ubq;
2258 	return 0;
2259 }
2260 
2261 static const struct blk_mq_ops ublk_mq_ops = {
2262 	.queue_rq       = ublk_queue_rq,
2263 	.queue_rqs      = ublk_queue_rqs,
2264 	.init_hctx	= ublk_init_hctx,
2265 	.timeout	= ublk_timeout,
2266 };
2267 
2268 static const struct blk_mq_ops ublk_batch_mq_ops = {
2269 	.commit_rqs	= ublk_commit_rqs,
2270 	.queue_rq       = ublk_batch_queue_rq,
2271 	.queue_rqs      = ublk_batch_queue_rqs,
2272 	.init_hctx	= ublk_init_hctx,
2273 	.timeout	= ublk_timeout,
2274 };
2275 
2276 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2277 {
2278 	int i;
2279 
2280 	ubq->nr_io_ready = 0;
2281 
2282 	for (i = 0; i < ubq->q_depth; i++) {
2283 		struct ublk_io *io = &ubq->ios[i];
2284 
2285 		/*
2286 		 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
2287 		 * io->cmd
2288 		 */
2289 		io->flags &= UBLK_IO_FLAG_CANCELED;
2290 		io->cmd = NULL;
2291 		io->buf.addr = 0;
2292 
2293 		/*
2294 		 * old task is PF_EXITING, put it now
2295 		 *
2296 		 * It could be NULL in case of closing one quiesced
2297 		 * device.
2298 		 */
2299 		if (io->task) {
2300 			put_task_struct(io->task);
2301 			io->task = NULL;
2302 		}
2303 
2304 		WARN_ON_ONCE(refcount_read(&io->ref));
2305 		WARN_ON_ONCE(io->task_registered_buffers);
2306 	}
2307 }
2308 
2309 static int ublk_ch_open(struct inode *inode, struct file *filp)
2310 {
2311 	struct ublk_device *ub = container_of(inode->i_cdev,
2312 			struct ublk_device, cdev);
2313 
2314 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2315 		return -EBUSY;
2316 	filp->private_data = ub;
2317 	ub->ublksrv_tgid = current->tgid;
2318 	return 0;
2319 }
2320 
2321 static void ublk_reset_ch_dev(struct ublk_device *ub)
2322 {
2323 	int i;
2324 
2325 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2326 		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2327 
2328 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2329 	ub->mm = NULL;
2330 	ub->nr_queue_ready = 0;
2331 	ub->unprivileged_daemons = false;
2332 	ub->ublksrv_tgid = -1;
2333 }
2334 
2335 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2336 {
2337 	struct gendisk *disk;
2338 
2339 	spin_lock(&ub->lock);
2340 	disk = ub->ub_disk;
2341 	if (disk)
2342 		get_device(disk_to_dev(disk));
2343 	spin_unlock(&ub->lock);
2344 
2345 	return disk;
2346 }
2347 
2348 static void ublk_put_disk(struct gendisk *disk)
2349 {
2350 	if (disk)
2351 		put_device(disk_to_dev(disk));
2352 }
2353 
2354 static void ublk_partition_scan_work(struct work_struct *work)
2355 {
2356 	struct ublk_device *ub =
2357 		container_of(work, struct ublk_device, partition_scan_work);
2358 	/* Hold disk reference to prevent UAF during concurrent teardown */
2359 	struct gendisk *disk = ublk_get_disk(ub);
2360 
2361 	if (!disk)
2362 		return;
2363 
2364 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2365 					     &disk->state)))
2366 		goto out;
2367 
2368 	mutex_lock(&disk->open_mutex);
2369 	bdev_disk_changed(disk, false);
2370 	mutex_unlock(&disk->open_mutex);
2371 out:
2372 	ublk_put_disk(disk);
2373 }
2374 
2375 /*
2376  * Use this function to ensure that ->canceling is consistently set for
2377  * the device and all queues. Do not set these flags directly.
2378  *
2379  * Caller must ensure that:
2380  * - cancel_mutex is held. This ensures that there is no concurrent
2381  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2382  * - there are no concurrent reads of ubq->canceling from the queue_rq
2383  *   path. This can be done by quiescing the queue, or through other
2384  *   means.
2385  */
2386 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2387 	__must_hold(&ub->cancel_mutex)
2388 {
2389 	int i;
2390 
2391 	ub->canceling = canceling;
2392 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2393 		ublk_get_queue(ub, i)->canceling = canceling;
2394 }
2395 
2396 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2397 {
2398 	int i, j;
2399 
2400 	if (!ublk_dev_need_req_ref(ub))
2401 		return false;
2402 
2403 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2404 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2405 
2406 		for (j = 0; j < ubq->q_depth; j++) {
2407 			struct ublk_io *io = &ubq->ios[j];
2408 			unsigned int refs = refcount_read(&io->ref) +
2409 				io->task_registered_buffers;
2410 
2411 			/*
2412 			 * UBLK_REFCOUNT_INIT or zero means no active
2413 			 * reference
2414 			 */
2415 			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2416 				return true;
2417 
2418 			/* reset to zero if the io hasn't active references */
2419 			refcount_set(&io->ref, 0);
2420 			io->task_registered_buffers = 0;
2421 		}
2422 	}
2423 	return false;
2424 }
2425 
2426 static void ublk_ch_release_work_fn(struct work_struct *work)
2427 {
2428 	struct ublk_device *ub =
2429 		container_of(work, struct ublk_device, exit_work.work);
2430 	struct gendisk *disk;
2431 	int i;
2432 
2433 	/*
2434 	 * For zero-copy and auto buffer register modes, I/O references
2435 	 * might not be dropped naturally when the daemon is killed, but
2436 	 * io_uring guarantees that registered bvec kernel buffers are
2437 	 * unregistered finally when freeing io_uring context, then the
2438 	 * active references are dropped.
2439 	 *
2440 	 * Wait until active references are dropped for avoiding use-after-free
2441 	 *
2442 	 * registered buffer may be unregistered in io_ring's release hander,
2443 	 * so have to wait by scheduling work function for avoiding the two
2444 	 * file release dependency.
2445 	 */
2446 	if (ublk_check_and_reset_active_ref(ub)) {
2447 		schedule_delayed_work(&ub->exit_work, 1);
2448 		return;
2449 	}
2450 
2451 	/*
2452 	 * disk isn't attached yet, either device isn't live, or it has
2453 	 * been removed already, so we needn't to do anything
2454 	 */
2455 	disk = ublk_get_disk(ub);
2456 	if (!disk)
2457 		goto out;
2458 
2459 	/*
2460 	 * All uring_cmd are done now, so abort any request outstanding to
2461 	 * the ublk server
2462 	 *
2463 	 * This can be done in lockless way because ublk server has been
2464 	 * gone
2465 	 *
2466 	 * More importantly, we have to provide forward progress guarantee
2467 	 * without holding ub->mutex, otherwise control task grabbing
2468 	 * ub->mutex triggers deadlock
2469 	 *
2470 	 * All requests may be inflight, so ->canceling may not be set, set
2471 	 * it now.
2472 	 */
2473 	mutex_lock(&ub->cancel_mutex);
2474 	ublk_set_canceling(ub, true);
2475 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2476 		ublk_abort_queue(ub, ublk_get_queue(ub, i));
2477 	mutex_unlock(&ub->cancel_mutex);
2478 	blk_mq_kick_requeue_list(disk->queue);
2479 
2480 	/*
2481 	 * All infligh requests have been completed or requeued and any new
2482 	 * request will be failed or requeued via `->canceling` now, so it is
2483 	 * fine to grab ub->mutex now.
2484 	 */
2485 	mutex_lock(&ub->mutex);
2486 
2487 	/* double check after grabbing lock */
2488 	if (!ub->ub_disk)
2489 		goto unlock;
2490 
2491 	/*
2492 	 * Transition the device to the nosrv state. What exactly this
2493 	 * means depends on the recovery flags
2494 	 */
2495 	if (ublk_nosrv_should_stop_dev(ub)) {
2496 		/*
2497 		 * Allow any pending/future I/O to pass through quickly
2498 		 * with an error. This is needed because del_gendisk
2499 		 * waits for all pending I/O to complete
2500 		 */
2501 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2502 			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2503 
2504 		ublk_stop_dev_unlocked(ub);
2505 	} else {
2506 		if (ublk_nosrv_dev_should_queue_io(ub)) {
2507 			/* ->canceling is set and all requests are aborted */
2508 			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2509 		} else {
2510 			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2511 			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2512 				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2513 		}
2514 	}
2515 unlock:
2516 	mutex_unlock(&ub->mutex);
2517 	ublk_put_disk(disk);
2518 
2519 	/* all uring_cmd has been done now, reset device & ubq */
2520 	ublk_reset_ch_dev(ub);
2521 out:
2522 	clear_bit(UB_STATE_OPEN, &ub->state);
2523 
2524 	/* put the reference grabbed in ublk_ch_release() */
2525 	ublk_put_device(ub);
2526 }
2527 
2528 static int ublk_ch_release(struct inode *inode, struct file *filp)
2529 {
2530 	struct ublk_device *ub = filp->private_data;
2531 
2532 	/*
2533 	 * Grab ublk device reference, so it won't be gone until we are
2534 	 * really released from work function.
2535 	 */
2536 	ublk_get_device(ub);
2537 
2538 	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2539 	schedule_delayed_work(&ub->exit_work, 0);
2540 	return 0;
2541 }
2542 
2543 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2544 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2545 {
2546 	struct ublk_device *ub = filp->private_data;
2547 	size_t sz = vma->vm_end - vma->vm_start;
2548 	unsigned max_sz = ublk_max_cmd_buf_size();
2549 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2550 	int q_id, ret = 0;
2551 
2552 	spin_lock(&ub->lock);
2553 	if (!ub->mm)
2554 		ub->mm = current->mm;
2555 	if (current->mm != ub->mm)
2556 		ret = -EINVAL;
2557 	spin_unlock(&ub->lock);
2558 
2559 	if (ret)
2560 		return ret;
2561 
2562 	if (vma->vm_flags & VM_WRITE)
2563 		return -EPERM;
2564 
2565 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2566 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2567 		return -EINVAL;
2568 
2569 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2570 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2571 			__func__, q_id, current->pid, vma->vm_start,
2572 			phys_off, (unsigned long)sz);
2573 
2574 	if (sz != ublk_queue_cmd_buf_size(ub))
2575 		return -EINVAL;
2576 
2577 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2578 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2579 }
2580 
2581 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2582 		struct request *req)
2583 {
2584 	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2585 			io->flags & UBLK_IO_FLAG_ACTIVE);
2586 
2587 	if (ublk_nosrv_should_reissue_outstanding(ub))
2588 		blk_mq_requeue_request(req, false);
2589 	else {
2590 		io->res = -EIO;
2591 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2592 	}
2593 }
2594 
2595 /*
2596  * Request tag may just be filled to event kfifo, not get chance to
2597  * dispatch, abort these requests too
2598  */
2599 static void ublk_abort_batch_queue(struct ublk_device *ub,
2600 				   struct ublk_queue *ubq)
2601 {
2602 	unsigned short tag;
2603 
2604 	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2605 		struct request *req = blk_mq_tag_to_rq(
2606 				ub->tag_set.tags[ubq->q_id], tag);
2607 
2608 		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2609 			__ublk_fail_req(ub, &ubq->ios[tag], req);
2610 	}
2611 }
2612 
2613 /*
2614  * Called from ublk char device release handler, when any uring_cmd is
2615  * done, meantime request queue is "quiesced" since all inflight requests
2616  * can't be completed because ublk server is dead.
2617  *
2618  * So no one can hold our request IO reference any more, simply ignore the
2619  * reference, and complete the request immediately
2620  */
2621 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2622 {
2623 	int i;
2624 
2625 	for (i = 0; i < ubq->q_depth; i++) {
2626 		struct ublk_io *io = &ubq->ios[i];
2627 
2628 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2629 			__ublk_fail_req(ub, io, io->req);
2630 	}
2631 
2632 	if (ublk_support_batch_io(ubq))
2633 		ublk_abort_batch_queue(ub, ubq);
2634 }
2635 
2636 static void ublk_start_cancel(struct ublk_device *ub)
2637 {
2638 	struct gendisk *disk = ublk_get_disk(ub);
2639 
2640 	/* Our disk has been dead */
2641 	if (!disk)
2642 		return;
2643 
2644 	mutex_lock(&ub->cancel_mutex);
2645 	if (ub->canceling)
2646 		goto out;
2647 	/*
2648 	 * Now we are serialized with ublk_queue_rq()
2649 	 *
2650 	 * Make sure that ubq->canceling is set when queue is frozen,
2651 	 * because ublk_queue_rq() has to rely on this flag for avoiding to
2652 	 * touch completed uring_cmd
2653 	 */
2654 	blk_mq_quiesce_queue(disk->queue);
2655 	ublk_set_canceling(ub, true);
2656 	blk_mq_unquiesce_queue(disk->queue);
2657 out:
2658 	mutex_unlock(&ub->cancel_mutex);
2659 	ublk_put_disk(disk);
2660 }
2661 
2662 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2663 		unsigned int issue_flags)
2664 {
2665 	struct ublk_io *io = &ubq->ios[tag];
2666 	struct ublk_device *ub = ubq->dev;
2667 	struct request *req;
2668 	bool done;
2669 
2670 	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2671 		return;
2672 
2673 	/*
2674 	 * Don't try to cancel this command if the request is started for
2675 	 * avoiding race between io_uring_cmd_done() and
2676 	 * io_uring_cmd_complete_in_task().
2677 	 *
2678 	 * Either the started request will be aborted via __ublk_abort_rq(),
2679 	 * then this uring_cmd is canceled next time, or it will be done in
2680 	 * task work function ublk_dispatch_req() because io_uring guarantees
2681 	 * that ublk_dispatch_req() is always called
2682 	 */
2683 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2684 	if (req && blk_mq_request_started(req) && req->tag == tag)
2685 		return;
2686 
2687 	spin_lock(&ubq->cancel_lock);
2688 	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2689 	if (!done)
2690 		io->flags |= UBLK_IO_FLAG_CANCELED;
2691 	spin_unlock(&ubq->cancel_lock);
2692 
2693 	if (!done)
2694 		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
2695 }
2696 
2697 /*
2698  * Cancel a batch fetch command if it hasn't been claimed by another path.
2699  *
2700  * An fcmd can only be cancelled if:
2701  * 1. It's not the active_fcmd (which is currently being processed)
2702  * 2. It's still on the list (!list_empty check) - once removed from the list,
2703  *    the fcmd is considered claimed and will be freed by whoever removed it
2704  *
2705  * Use list_del_init() so subsequent list_empty() checks work correctly.
2706  */
2707 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2708 				  struct ublk_batch_fetch_cmd *fcmd,
2709 				  unsigned int issue_flags)
2710 {
2711 	bool done;
2712 
2713 	spin_lock(&ubq->evts_lock);
2714 	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2715 	if (done)
2716 		list_del_init(&fcmd->node);
2717 	spin_unlock(&ubq->evts_lock);
2718 
2719 	if (done) {
2720 		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2721 		ublk_batch_free_fcmd(fcmd);
2722 	}
2723 }
2724 
2725 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2726 {
2727 	struct ublk_batch_fetch_cmd *fcmd;
2728 	LIST_HEAD(fcmd_list);
2729 
2730 	spin_lock(&ubq->evts_lock);
2731 	ubq->force_abort = true;
2732 	list_splice_init(&ubq->fcmd_head, &fcmd_list);
2733 	fcmd = READ_ONCE(ubq->active_fcmd);
2734 	if (fcmd)
2735 		list_move(&fcmd->node, &ubq->fcmd_head);
2736 	spin_unlock(&ubq->evts_lock);
2737 
2738 	while (!list_empty(&fcmd_list)) {
2739 		fcmd = list_first_entry(&fcmd_list,
2740 				struct ublk_batch_fetch_cmd, node);
2741 		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2742 	}
2743 }
2744 
2745 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2746 				 unsigned int issue_flags)
2747 {
2748 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2749 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2750 	struct ublk_queue *ubq = pdu->ubq;
2751 
2752 	ublk_start_cancel(ubq->dev);
2753 
2754 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2755 }
2756 
2757 /*
2758  * The ublk char device won't be closed when calling cancel fn, so both
2759  * ublk device and queue are guaranteed to be live
2760  *
2761  * Two-stage cancel:
2762  *
2763  * - make every active uring_cmd done in ->cancel_fn()
2764  *
2765  * - aborting inflight ublk IO requests in ublk char device release handler,
2766  *   which depends on 1st stage because device can only be closed iff all
2767  *   uring_cmd are done
2768  *
2769  * Do _not_ try to acquire ub->mutex before all inflight requests are
2770  * aborted, otherwise deadlock may be caused.
2771  */
2772 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2773 		unsigned int issue_flags)
2774 {
2775 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2776 	struct ublk_queue *ubq = pdu->ubq;
2777 	struct task_struct *task;
2778 	struct ublk_io *io;
2779 
2780 	if (WARN_ON_ONCE(!ubq))
2781 		return;
2782 
2783 	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2784 		return;
2785 
2786 	task = io_uring_cmd_get_task(cmd);
2787 	io = &ubq->ios[pdu->tag];
2788 	if (WARN_ON_ONCE(task && task != io->task))
2789 		return;
2790 
2791 	ublk_start_cancel(ubq->dev);
2792 
2793 	WARN_ON_ONCE(io->cmd != cmd);
2794 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2795 }
2796 
2797 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2798 {
2799 	return ubq->nr_io_ready == ubq->q_depth;
2800 }
2801 
2802 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2803 {
2804 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2805 }
2806 
2807 static void ublk_cancel_queue(struct ublk_queue *ubq)
2808 {
2809 	int i;
2810 
2811 	if (ublk_support_batch_io(ubq)) {
2812 		ublk_batch_cancel_queue(ubq);
2813 		return;
2814 	}
2815 
2816 	for (i = 0; i < ubq->q_depth; i++)
2817 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2818 }
2819 
2820 /* Cancel all pending commands, must be called after del_gendisk() returns */
2821 static void ublk_cancel_dev(struct ublk_device *ub)
2822 {
2823 	int i;
2824 
2825 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2826 		ublk_cancel_queue(ublk_get_queue(ub, i));
2827 }
2828 
2829 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2830 {
2831 	bool *idle = data;
2832 
2833 	if (blk_mq_request_started(rq)) {
2834 		*idle = false;
2835 		return false;
2836 	}
2837 	return true;
2838 }
2839 
2840 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2841 {
2842 	bool idle;
2843 
2844 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2845 	while (true) {
2846 		idle = true;
2847 		blk_mq_tagset_busy_iter(&ub->tag_set,
2848 				ublk_check_inflight_rq, &idle);
2849 		if (idle)
2850 			break;
2851 		msleep(UBLK_REQUEUE_DELAY_MS);
2852 	}
2853 }
2854 
2855 static void ublk_force_abort_dev(struct ublk_device *ub)
2856 {
2857 	int i;
2858 
2859 	pr_devel("%s: force abort ub: dev_id %d state %s\n",
2860 			__func__, ub->dev_info.dev_id,
2861 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
2862 			"LIVE" : "QUIESCED");
2863 	blk_mq_quiesce_queue(ub->ub_disk->queue);
2864 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2865 		ublk_wait_tagset_rqs_idle(ub);
2866 
2867 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2868 		ublk_get_queue(ub, i)->force_abort = true;
2869 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2870 	/* We may have requeued some rqs in ublk_quiesce_queue() */
2871 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2872 }
2873 
2874 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2875 {
2876 	struct gendisk *disk;
2877 
2878 	/* Sync with ublk_abort_queue() by holding the lock */
2879 	spin_lock(&ub->lock);
2880 	disk = ub->ub_disk;
2881 	ub->dev_info.state = UBLK_S_DEV_DEAD;
2882 	ub->dev_info.ublksrv_pid = -1;
2883 	ub->ub_disk = NULL;
2884 	spin_unlock(&ub->lock);
2885 
2886 	return disk;
2887 }
2888 
2889 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2890 	__must_hold(&ub->mutex)
2891 {
2892 	struct gendisk *disk;
2893 
2894 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2895 		return;
2896 
2897 	if (ublk_nosrv_dev_should_queue_io(ub))
2898 		ublk_force_abort_dev(ub);
2899 	del_gendisk(ub->ub_disk);
2900 	disk = ublk_detach_disk(ub);
2901 	put_disk(disk);
2902 }
2903 
2904 static void ublk_stop_dev(struct ublk_device *ub)
2905 {
2906 	mutex_lock(&ub->mutex);
2907 	ublk_stop_dev_unlocked(ub);
2908 	mutex_unlock(&ub->mutex);
2909 	cancel_work_sync(&ub->partition_scan_work);
2910 	ublk_cancel_dev(ub);
2911 }
2912 
2913 /* reset per-queue io flags */
2914 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
2915 {
2916 	int j;
2917 
2918 	/* UBLK_IO_FLAG_CANCELED can be cleared now */
2919 	spin_lock(&ubq->cancel_lock);
2920 	for (j = 0; j < ubq->q_depth; j++)
2921 		ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2922 	ubq->canceling = false;
2923 	spin_unlock(&ubq->cancel_lock);
2924 	ubq->fail_io = false;
2925 }
2926 
2927 /* device can only be started after all IOs are ready */
2928 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
2929 	__must_hold(&ub->mutex)
2930 {
2931 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2932 
2933 	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2934 		ub->unprivileged_daemons = true;
2935 
2936 	ubq->nr_io_ready++;
2937 
2938 	/* Check if this specific queue is now fully ready */
2939 	if (ublk_queue_ready(ubq)) {
2940 		ub->nr_queue_ready++;
2941 
2942 		/*
2943 		 * Reset queue flags as soon as this queue is ready.
2944 		 * This clears the canceling flag, allowing batch FETCH commands
2945 		 * to succeed during recovery without waiting for all queues.
2946 		 */
2947 		ublk_queue_reset_io_flags(ubq);
2948 	}
2949 
2950 	/* Check if all queues are ready */
2951 	if (ublk_dev_ready(ub)) {
2952 		/*
2953 		 * All queues ready - clear device-level canceling flag
2954 		 * and complete the recovery/initialization.
2955 		 */
2956 		mutex_lock(&ub->cancel_mutex);
2957 		ub->canceling = false;
2958 		mutex_unlock(&ub->cancel_mutex);
2959 		complete_all(&ub->completion);
2960 	}
2961 }
2962 
2963 static inline int ublk_check_cmd_op(u32 cmd_op)
2964 {
2965 	u32 ioc_type = _IOC_TYPE(cmd_op);
2966 
2967 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2968 		return -EOPNOTSUPP;
2969 
2970 	if (ioc_type != 'u' && ioc_type != 0)
2971 		return -EOPNOTSUPP;
2972 
2973 	return 0;
2974 }
2975 
2976 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2977 {
2978 	struct ublk_auto_buf_reg buf;
2979 
2980 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2981 
2982 	if (buf.reserved0 || buf.reserved1)
2983 		return -EINVAL;
2984 
2985 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2986 		return -EINVAL;
2987 	io->buf.auto_reg = buf;
2988 	return 0;
2989 }
2990 
2991 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
2992 				    struct io_uring_cmd *cmd,
2993 				    u16 *buf_idx)
2994 {
2995 	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
2996 		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
2997 
2998 		/*
2999 		 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3000 		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
3001 		 * `io_ring_ctx`.
3002 		 *
3003 		 * If this uring_cmd's io_ring_ctx isn't same with the
3004 		 * one for registering the buffer, it is ublk server's
3005 		 * responsibility for unregistering the buffer, otherwise
3006 		 * this ublk request gets stuck.
3007 		 */
3008 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3009 			*buf_idx = io->buf.auto_reg.index;
3010 	}
3011 }
3012 
3013 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3014 				    struct io_uring_cmd *cmd,
3015 				    u16 *buf_idx)
3016 {
3017 	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3018 	return ublk_set_auto_buf_reg(io, cmd);
3019 }
3020 
3021 /* Once we return, `io->req` can't be used any more */
3022 static inline struct request *
3023 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3024 {
3025 	struct request *req = io->req;
3026 
3027 	io->cmd = cmd;
3028 	io->flags |= UBLK_IO_FLAG_ACTIVE;
3029 	/* now this cmd slot is owned by ublk driver */
3030 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3031 
3032 	return req;
3033 }
3034 
3035 static inline int
3036 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3037 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3038 		   u16 *buf_idx)
3039 {
3040 	if (ublk_dev_support_auto_buf_reg(ub))
3041 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3042 
3043 	io->buf.addr = buf_addr;
3044 	return 0;
3045 }
3046 
3047 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3048 				    unsigned int issue_flags,
3049 				    struct ublk_queue *ubq, unsigned int tag)
3050 {
3051 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3052 
3053 	/*
3054 	 * Safe to refer to @ubq since ublk_queue won't be died until its
3055 	 * commands are completed
3056 	 */
3057 	pdu->ubq = ubq;
3058 	pdu->tag = tag;
3059 	io_uring_cmd_mark_cancelable(cmd, issue_flags);
3060 }
3061 
3062 static void ublk_io_release(void *priv)
3063 {
3064 	struct request *rq = priv;
3065 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3066 	struct ublk_io *io = &ubq->ios[rq->tag];
3067 
3068 	/*
3069 	 * task_registered_buffers may be 0 if buffers were registered off task
3070 	 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3071 	 */
3072 	if (current == io->task && io->task_registered_buffers)
3073 		io->task_registered_buffers--;
3074 	else
3075 		ublk_put_req_ref(io, rq);
3076 }
3077 
3078 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3079 				struct ublk_device *ub,
3080 				u16 q_id, u16 tag,
3081 				struct ublk_io *io,
3082 				unsigned int index, unsigned int issue_flags)
3083 {
3084 	struct request *req;
3085 	int ret;
3086 
3087 	if (!ublk_dev_support_zero_copy(ub))
3088 		return -EINVAL;
3089 
3090 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3091 	if (!req)
3092 		return -EINVAL;
3093 
3094 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3095 				      issue_flags);
3096 	if (ret) {
3097 		ublk_put_req_ref(io, req);
3098 		return ret;
3099 	}
3100 
3101 	return 0;
3102 }
3103 
3104 static int
3105 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3106 			    struct ublk_device *ub,
3107 			    u16 q_id, u16 tag, struct ublk_io *io,
3108 			    unsigned index, unsigned issue_flags)
3109 {
3110 	unsigned new_registered_buffers;
3111 	struct request *req = io->req;
3112 	int ret;
3113 
3114 	/*
3115 	 * Ensure there are still references for ublk_sub_req_ref() to release.
3116 	 * If not, fall back on the thread-safe buffer registration.
3117 	 */
3118 	new_registered_buffers = io->task_registered_buffers + 1;
3119 	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3120 		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3121 					    issue_flags);
3122 
3123 	if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3124 		return -EINVAL;
3125 
3126 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3127 				      issue_flags);
3128 	if (ret)
3129 		return ret;
3130 
3131 	io->task_registered_buffers = new_registered_buffers;
3132 	return 0;
3133 }
3134 
3135 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3136 				  const struct ublk_device *ub,
3137 				  unsigned int index, unsigned int issue_flags)
3138 {
3139 	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3140 		return -EINVAL;
3141 
3142 	return io_buffer_unregister_bvec(cmd, index, issue_flags);
3143 }
3144 
3145 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3146 {
3147 	if (ublk_dev_need_map_io(ub)) {
3148 		/*
3149 		 * FETCH_RQ has to provide IO buffer if NEED GET
3150 		 * DATA is not enabled
3151 		 */
3152 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3153 			return -EINVAL;
3154 	} else if (buf_addr) {
3155 		/* User copy requires addr to be unset */
3156 		return -EINVAL;
3157 	}
3158 	return 0;
3159 }
3160 
3161 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3162 			struct ublk_io *io, u16 q_id)
3163 {
3164 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3165 	if (ublk_dev_ready(ub))
3166 		return -EBUSY;
3167 
3168 	/* allow each command to be FETCHed at most once */
3169 	if (io->flags & UBLK_IO_FLAG_ACTIVE)
3170 		return -EINVAL;
3171 
3172 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3173 
3174 	ublk_fill_io_cmd(io, cmd);
3175 
3176 	if (ublk_dev_support_batch_io(ub))
3177 		WRITE_ONCE(io->task, NULL);
3178 	else
3179 		WRITE_ONCE(io->task, get_task_struct(current));
3180 
3181 	return 0;
3182 }
3183 
3184 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3185 		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
3186 {
3187 	int ret;
3188 
3189 	/*
3190 	 * When handling FETCH command for setting up ublk uring queue,
3191 	 * ub->mutex is the innermost lock, and we won't block for handling
3192 	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3193 	 */
3194 	mutex_lock(&ub->mutex);
3195 	ret = __ublk_fetch(cmd, ub, io, q_id);
3196 	if (!ret)
3197 		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3198 	if (!ret)
3199 		ublk_mark_io_ready(ub, q_id);
3200 	mutex_unlock(&ub->mutex);
3201 	return ret;
3202 }
3203 
3204 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3205 				       struct ublk_io *io, __u64 buf_addr)
3206 {
3207 	struct request *req = io->req;
3208 
3209 	if (ublk_dev_need_map_io(ub)) {
3210 		/*
3211 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3212 		 * NEED GET DATA is not enabled or it is Read IO.
3213 		 */
3214 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3215 					req_op(req) == REQ_OP_READ))
3216 			return -EINVAL;
3217 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3218 		/*
3219 		 * User copy requires addr to be unset when command is
3220 		 * not zone append
3221 		 */
3222 		return -EINVAL;
3223 	}
3224 
3225 	return 0;
3226 }
3227 
3228 static bool ublk_need_complete_req(const struct ublk_device *ub,
3229 				   struct ublk_io *io)
3230 {
3231 	if (ublk_dev_need_req_ref(ub))
3232 		return ublk_sub_req_ref(io);
3233 	return true;
3234 }
3235 
3236 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3237 			  struct request *req)
3238 {
3239 	/*
3240 	 * We have handled UBLK_IO_NEED_GET_DATA command,
3241 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3242 	 * do the copy work.
3243 	 */
3244 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3245 	/* update iod->addr because ublksrv may have passed a new io buffer */
3246 	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3247 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3248 			__func__, ubq->q_id, req->tag, io->flags,
3249 			ublk_get_iod(ubq, req->tag)->addr);
3250 
3251 	return ublk_start_io(ubq, req, io);
3252 }
3253 
3254 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3255 		unsigned int issue_flags)
3256 {
3257 	/* May point to userspace-mapped memory */
3258 	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
3259 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3260 	struct ublk_device *ub = cmd->file->private_data;
3261 	struct ublk_queue *ubq;
3262 	struct ublk_io *io = NULL;
3263 	u32 cmd_op = cmd->cmd_op;
3264 	u16 q_id = READ_ONCE(ub_src->q_id);
3265 	u16 tag = READ_ONCE(ub_src->tag);
3266 	s32 result = READ_ONCE(ub_src->result);
3267 	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3268 	struct request *req;
3269 	int ret;
3270 	bool compl;
3271 
3272 	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3273 
3274 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3275 			__func__, cmd->cmd_op, q_id, tag, result);
3276 
3277 	ret = ublk_check_cmd_op(cmd_op);
3278 	if (ret)
3279 		goto out;
3280 
3281 	/*
3282 	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3283 	 * so no need to validate the q_id, tag, or task
3284 	 */
3285 	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3286 		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3287 
3288 	ret = -EINVAL;
3289 	if (q_id >= ub->dev_info.nr_hw_queues)
3290 		goto out;
3291 
3292 	ubq = ublk_get_queue(ub, q_id);
3293 
3294 	if (tag >= ub->dev_info.queue_depth)
3295 		goto out;
3296 
3297 	io = &ubq->ios[tag];
3298 	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3299 	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3300 		ret = ublk_check_fetch_buf(ub, addr);
3301 		if (ret)
3302 			goto out;
3303 		ret = ublk_fetch(cmd, ub, io, addr, q_id);
3304 		if (ret)
3305 			goto out;
3306 
3307 		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3308 		return -EIOCBQUEUED;
3309 	}
3310 
3311 	if (READ_ONCE(io->task) != current) {
3312 		/*
3313 		 * ublk_register_io_buf() accesses only the io's refcount,
3314 		 * so can be handled on any task
3315 		 */
3316 		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3317 			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3318 						    addr, issue_flags);
3319 
3320 		goto out;
3321 	}
3322 
3323 	/* there is pending io cmd, something must be wrong */
3324 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3325 		ret = -EBUSY;
3326 		goto out;
3327 	}
3328 
3329 	/*
3330 	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3331 	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
3332 	 */
3333 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3334 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3335 		goto out;
3336 
3337 	switch (_IOC_NR(cmd_op)) {
3338 	case UBLK_IO_REGISTER_IO_BUF:
3339 		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3340 						   issue_flags);
3341 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
3342 		ret = ublk_check_commit_and_fetch(ub, io, addr);
3343 		if (ret)
3344 			goto out;
3345 		io->res = result;
3346 		req = ublk_fill_io_cmd(io, cmd);
3347 		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3348 		if (buf_idx != UBLK_INVALID_BUF_IDX)
3349 			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3350 		compl = ublk_need_complete_req(ub, io);
3351 
3352 		if (req_op(req) == REQ_OP_ZONE_APPEND)
3353 			req->__sector = addr;
3354 		if (compl)
3355 			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3356 
3357 		if (ret)
3358 			goto out;
3359 		break;
3360 	case UBLK_IO_NEED_GET_DATA:
3361 		/*
3362 		 * ublk_get_data() may fail and fallback to requeue, so keep
3363 		 * uring_cmd active first and prepare for handling new requeued
3364 		 * request
3365 		 */
3366 		req = ublk_fill_io_cmd(io, cmd);
3367 		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3368 		WARN_ON_ONCE(ret);
3369 		if (likely(ublk_get_data(ubq, io, req))) {
3370 			__ublk_prep_compl_io_cmd(io, req);
3371 			return UBLK_IO_RES_OK;
3372 		}
3373 		break;
3374 	default:
3375 		goto out;
3376 	}
3377 	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3378 	return -EIOCBQUEUED;
3379 
3380  out:
3381 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3382 			__func__, cmd_op, tag, ret, io ? io->flags : 0);
3383 	return ret;
3384 }
3385 
3386 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3387 		u16 q_id, u16 tag, struct ublk_io *io)
3388 {
3389 	struct request *req;
3390 
3391 	/*
3392 	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3393 	 * which would overwrite it with io->cmd
3394 	 */
3395 	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3396 	if (!req)
3397 		return NULL;
3398 
3399 	if (!ublk_get_req_ref(io))
3400 		return NULL;
3401 
3402 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3403 		goto fail_put;
3404 
3405 	if (!ublk_rq_has_data(req))
3406 		goto fail_put;
3407 
3408 	return req;
3409 fail_put:
3410 	ublk_put_req_ref(io, req);
3411 	return NULL;
3412 }
3413 
3414 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3415 {
3416 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3417 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3418 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3419 
3420 	if (ret != -EIOCBQUEUED)
3421 		io_uring_cmd_done(cmd, ret, issue_flags);
3422 }
3423 
3424 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3425 {
3426 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3427 		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3428 		return 0;
3429 	}
3430 
3431 	/* well-implemented server won't run into unlocked */
3432 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3433 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3434 		return -EIOCBQUEUED;
3435 	}
3436 
3437 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
3438 }
3439 
3440 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3441 					const struct ublk_elem_header *elem)
3442 {
3443 	const void *buf = elem;
3444 
3445 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3446 		return *(const __u64 *)(buf + sizeof(*elem));
3447 	return 0;
3448 }
3449 
3450 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3451 					const struct ublk_elem_header *elem)
3452 {
3453 	const void *buf = elem;
3454 
3455 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3456 		return *(const __u64 *)(buf + sizeof(*elem) +
3457 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3458 	return -1;
3459 }
3460 
3461 static struct ublk_auto_buf_reg
3462 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3463 			const struct ublk_elem_header *elem)
3464 {
3465 	struct ublk_auto_buf_reg reg = {
3466 		.index = elem->buf_index,
3467 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3468 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3469 	};
3470 
3471 	return reg;
3472 }
3473 
3474 /*
3475  * 48 can hold any type of buffer element(8, 16 and 24 bytes) because
3476  * it is the least common multiple(LCM) of 8, 16 and 24
3477  */
3478 #define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
3479 struct ublk_batch_io_iter {
3480 	void __user *uaddr;
3481 	unsigned done, total;
3482 	unsigned char elem_bytes;
3483 	/* copy to this buffer from user space */
3484 	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3485 };
3486 
3487 static inline int
3488 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3489 		    struct ublk_batch_io_iter *iter,
3490 		    const struct ublk_batch_io_data *data,
3491 		    unsigned bytes,
3492 		    int (*cb)(struct ublk_queue *q,
3493 			    const struct ublk_batch_io_data *data,
3494 			    const struct ublk_elem_header *elem))
3495 {
3496 	unsigned int i;
3497 	int ret = 0;
3498 
3499 	for (i = 0; i < bytes; i += iter->elem_bytes) {
3500 		const struct ublk_elem_header *elem =
3501 			(const struct ublk_elem_header *)&iter->buf[i];
3502 
3503 		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3504 			ret = -EINVAL;
3505 			break;
3506 		}
3507 
3508 		ret = cb(ubq, data, elem);
3509 		if (unlikely(ret))
3510 			break;
3511 	}
3512 
3513 	iter->done += i;
3514 	return ret;
3515 }
3516 
3517 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3518 			     const struct ublk_batch_io_data *data,
3519 			     int (*cb)(struct ublk_queue *q,
3520 				     const struct ublk_batch_io_data *data,
3521 				     const struct ublk_elem_header *elem))
3522 {
3523 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3524 	int ret = 0;
3525 
3526 	while (iter->done < iter->total) {
3527 		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3528 
3529 		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3530 			pr_warn("ublk%d: read batch cmd buffer failed\n",
3531 					data->ub->dev_info.dev_id);
3532 			return -EFAULT;
3533 		}
3534 
3535 		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3536 		if (ret)
3537 			return ret;
3538 	}
3539 	return 0;
3540 }
3541 
3542 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3543 				const struct ublk_batch_io_data *data,
3544 				const struct ublk_elem_header *elem)
3545 {
3546 	struct ublk_io *io = &ubq->ios[elem->tag];
3547 
3548 	/*
3549 	 * If queue was ready before this decrement, it won't be anymore,
3550 	 * so we need to decrement the queue ready count and restore the
3551 	 * canceling flag to prevent new requests from being queued.
3552 	 */
3553 	if (ublk_queue_ready(ubq)) {
3554 		data->ub->nr_queue_ready--;
3555 		spin_lock(&ubq->cancel_lock);
3556 		ubq->canceling = true;
3557 		spin_unlock(&ubq->cancel_lock);
3558 	}
3559 	ubq->nr_io_ready--;
3560 
3561 	ublk_io_lock(io);
3562 	io->flags = 0;
3563 	ublk_io_unlock(io);
3564 	return 0;
3565 }
3566 
3567 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3568 				       const struct ublk_batch_io_data *data)
3569 {
3570 	int ret;
3571 
3572 	/* Re-process only what we've already processed, starting from beginning */
3573 	iter->total = iter->done;
3574 	iter->done = 0;
3575 
3576 	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3577 	WARN_ON_ONCE(ret);
3578 }
3579 
3580 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3581 			      const struct ublk_batch_io_data *data,
3582 			      const struct ublk_elem_header *elem)
3583 {
3584 	struct ublk_io *io = &ubq->ios[elem->tag];
3585 	const struct ublk_batch_io *uc = &data->header;
3586 	union ublk_io_buf buf = { 0 };
3587 	int ret;
3588 
3589 	if (ublk_dev_support_auto_buf_reg(data->ub))
3590 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3591 	else if (ublk_dev_need_map_io(data->ub)) {
3592 		buf.addr = ublk_batch_buf_addr(uc, elem);
3593 
3594 		ret = ublk_check_fetch_buf(data->ub, buf.addr);
3595 		if (ret)
3596 			return ret;
3597 	}
3598 
3599 	ublk_io_lock(io);
3600 	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3601 	if (!ret)
3602 		io->buf = buf;
3603 	ublk_io_unlock(io);
3604 
3605 	if (!ret)
3606 		ublk_mark_io_ready(data->ub, ubq->q_id);
3607 
3608 	return ret;
3609 }
3610 
3611 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3612 {
3613 	const struct ublk_batch_io *uc = &data->header;
3614 	struct io_uring_cmd *cmd = data->cmd;
3615 	struct ublk_batch_io_iter iter = {
3616 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3617 		.total = uc->nr_elem * uc->elem_bytes,
3618 		.elem_bytes = uc->elem_bytes,
3619 	};
3620 	int ret;
3621 
3622 	mutex_lock(&data->ub->mutex);
3623 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3624 
3625 	if (ret && iter.done)
3626 		ublk_batch_revert_prep_cmd(&iter, data);
3627 	mutex_unlock(&data->ub->mutex);
3628 	return ret;
3629 }
3630 
3631 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3632 				      struct ublk_io *io,
3633 				      union ublk_io_buf *buf)
3634 {
3635 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3636 		return -EBUSY;
3637 
3638 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3639 	if (ublk_need_map_io(ubq) && !buf->addr)
3640 		return -EINVAL;
3641 	return 0;
3642 }
3643 
3644 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3645 				const struct ublk_batch_io_data *data,
3646 				const struct ublk_elem_header *elem)
3647 {
3648 	struct ublk_io *io = &ubq->ios[elem->tag];
3649 	const struct ublk_batch_io *uc = &data->header;
3650 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3651 	union ublk_io_buf buf = { 0 };
3652 	struct request *req = NULL;
3653 	bool auto_reg = false;
3654 	bool compl = false;
3655 	int ret;
3656 
3657 	if (ublk_dev_support_auto_buf_reg(data->ub)) {
3658 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3659 		auto_reg = true;
3660 	} else if (ublk_dev_need_map_io(data->ub))
3661 		buf.addr = ublk_batch_buf_addr(uc, elem);
3662 
3663 	ublk_io_lock(io);
3664 	ret = ublk_batch_commit_io_check(ubq, io, &buf);
3665 	if (!ret) {
3666 		io->res = elem->result;
3667 		io->buf = buf;
3668 		req = ublk_fill_io_cmd(io, data->cmd);
3669 
3670 		if (auto_reg)
3671 			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3672 		compl = ublk_need_complete_req(data->ub, io);
3673 	}
3674 	ublk_io_unlock(io);
3675 
3676 	if (unlikely(ret)) {
3677 		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3678 			__func__, data->ub->dev_info.dev_id, ubq->q_id,
3679 			elem->tag, ret);
3680 		return ret;
3681 	}
3682 
3683 	if (buf_idx != UBLK_INVALID_BUF_IDX)
3684 		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3685 	if (req_op(req) == REQ_OP_ZONE_APPEND)
3686 		req->__sector = ublk_batch_zone_lba(uc, elem);
3687 	if (compl)
3688 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3689 	return 0;
3690 }
3691 
3692 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3693 {
3694 	const struct ublk_batch_io *uc = &data->header;
3695 	struct io_uring_cmd *cmd = data->cmd;
3696 	struct ublk_batch_io_iter iter = {
3697 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3698 		.total = uc->nr_elem * uc->elem_bytes,
3699 		.elem_bytes = uc->elem_bytes,
3700 	};
3701 	DEFINE_IO_COMP_BATCH(iob);
3702 	int ret;
3703 
3704 	data->iob = &iob;
3705 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3706 
3707 	if (iob.complete)
3708 		iob.complete(&iob);
3709 
3710 	return iter.done == 0 ? ret : iter.done;
3711 }
3712 
3713 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3714 {
3715 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
3716 
3717 	if (uc->flags & ~UBLK_BATCH_F_ALL)
3718 		return -EINVAL;
3719 
3720 	/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
3721 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3722 			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3723 		return -EINVAL;
3724 
3725 	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3726 		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3727 	if (uc->elem_bytes != elem_bytes)
3728 		return -EINVAL;
3729 	return 0;
3730 }
3731 
3732 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3733 {
3734 	const struct ublk_batch_io *uc = &data->header;
3735 
3736 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3737 		return -EINVAL;
3738 
3739 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
3740 		return -E2BIG;
3741 
3742 	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3743 			!ublk_dev_is_zoned(data->ub))
3744 		return -EINVAL;
3745 
3746 	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3747 			!ublk_dev_need_map_io(data->ub))
3748 		return -EINVAL;
3749 
3750 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3751 			!ublk_dev_support_auto_buf_reg(data->ub))
3752 		return -EINVAL;
3753 
3754 	return ublk_check_batch_cmd_flags(uc);
3755 }
3756 
3757 static int ublk_batch_attach(struct ublk_queue *ubq,
3758 			     struct ublk_batch_io_data *data,
3759 			     struct ublk_batch_fetch_cmd *fcmd)
3760 {
3761 	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3762 	bool free = false;
3763 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3764 
3765 	spin_lock(&ubq->evts_lock);
3766 	if (unlikely(ubq->force_abort || ubq->canceling)) {
3767 		free = true;
3768 	} else {
3769 		list_add_tail(&fcmd->node, &ubq->fcmd_head);
3770 		new_fcmd = __ublk_acquire_fcmd(ubq);
3771 	}
3772 	spin_unlock(&ubq->evts_lock);
3773 
3774 	if (unlikely(free)) {
3775 		ublk_batch_free_fcmd(fcmd);
3776 		return -ENODEV;
3777 	}
3778 
3779 	pdu->ubq = ubq;
3780 	pdu->fcmd = fcmd;
3781 	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3782 
3783 	if (!new_fcmd)
3784 		goto out;
3785 
3786 	/*
3787 	 * If the two fetch commands are originated from same io_ring_ctx,
3788 	 * run batch dispatch directly. Otherwise, schedule task work for
3789 	 * doing it.
3790 	 */
3791 	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3792 			io_uring_cmd_ctx_handle(fcmd->cmd)) {
3793 		data->cmd = new_fcmd->cmd;
3794 		ublk_batch_dispatch(ubq, data, new_fcmd);
3795 	} else {
3796 		io_uring_cmd_complete_in_task(new_fcmd->cmd,
3797 				ublk_batch_tw_cb);
3798 	}
3799 out:
3800 	return -EIOCBQUEUED;
3801 }
3802 
3803 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3804 {
3805 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3806 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3807 
3808 	if (!fcmd)
3809 		return -ENOMEM;
3810 
3811 	return ublk_batch_attach(ubq, data, fcmd);
3812 }
3813 
3814 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3815 {
3816 	const struct ublk_batch_io *uc = &data->header;
3817 
3818 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3819 		return -EINVAL;
3820 
3821 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3822 		return -EINVAL;
3823 
3824 	if (uc->elem_bytes != sizeof(__u16))
3825 		return -EINVAL;
3826 
3827 	if (uc->flags != 0)
3828 		return -EINVAL;
3829 
3830 	return 0;
3831 }
3832 
3833 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3834 				     unsigned int issue_flags)
3835 {
3836 	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe);
3837 	struct ublk_device *ub = cmd->file->private_data;
3838 	unsigned tag = READ_ONCE(ub_cmd->tag);
3839 	unsigned q_id = READ_ONCE(ub_cmd->q_id);
3840 	unsigned index = READ_ONCE(ub_cmd->addr);
3841 	struct ublk_queue *ubq;
3842 	struct ublk_io *io;
3843 
3844 	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3845 		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3846 
3847 	if (q_id >= ub->dev_info.nr_hw_queues)
3848 		return -EINVAL;
3849 
3850 	if (tag >= ub->dev_info.queue_depth)
3851 		return -EINVAL;
3852 
3853 	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3854 		return -EOPNOTSUPP;
3855 
3856 	ubq = ublk_get_queue(ub, q_id);
3857 	io = &ubq->ios[tag];
3858 	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3859 			issue_flags);
3860 }
3861 
3862 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3863 				       unsigned int issue_flags)
3864 {
3865 	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe);
3866 	struct ublk_device *ub = cmd->file->private_data;
3867 	struct ublk_batch_io_data data = {
3868 		.ub  = ub,
3869 		.cmd = cmd,
3870 		.header = (struct ublk_batch_io) {
3871 			.q_id = READ_ONCE(uc->q_id),
3872 			.flags = READ_ONCE(uc->flags),
3873 			.nr_elem = READ_ONCE(uc->nr_elem),
3874 			.elem_bytes = READ_ONCE(uc->elem_bytes),
3875 		},
3876 		.issue_flags = issue_flags,
3877 	};
3878 	u32 cmd_op = cmd->cmd_op;
3879 	int ret = -EINVAL;
3880 
3881 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3882 		ublk_batch_cancel_fn(cmd, issue_flags);
3883 		return 0;
3884 	}
3885 
3886 	switch (cmd_op) {
3887 	case UBLK_U_IO_PREP_IO_CMDS:
3888 		ret = ublk_check_batch_cmd(&data);
3889 		if (ret)
3890 			goto out;
3891 		ret = ublk_handle_batch_prep_cmd(&data);
3892 		break;
3893 	case UBLK_U_IO_COMMIT_IO_CMDS:
3894 		ret = ublk_check_batch_cmd(&data);
3895 		if (ret)
3896 			goto out;
3897 		ret = ublk_handle_batch_commit_cmd(&data);
3898 		break;
3899 	case UBLK_U_IO_FETCH_IO_CMDS:
3900 		ret = ublk_validate_batch_fetch_cmd(&data);
3901 		if (ret)
3902 			goto out;
3903 		ret = ublk_handle_batch_fetch_cmd(&data);
3904 		break;
3905 	default:
3906 		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
3907 		break;
3908 	}
3909 out:
3910 	return ret;
3911 }
3912 
3913 static inline bool ublk_check_ubuf_dir(const struct request *req,
3914 		int ubuf_dir)
3915 {
3916 	/* copy ubuf to request pages */
3917 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
3918 	    ubuf_dir == ITER_SOURCE)
3919 		return true;
3920 
3921 	/* copy request pages to ubuf */
3922 	if ((req_op(req) == REQ_OP_WRITE ||
3923 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
3924 	    ubuf_dir == ITER_DEST)
3925 		return true;
3926 
3927 	return false;
3928 }
3929 
3930 static ssize_t
3931 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
3932 {
3933 	struct ublk_device *ub = iocb->ki_filp->private_data;
3934 	struct ublk_queue *ubq;
3935 	struct request *req;
3936 	struct ublk_io *io;
3937 	unsigned data_len;
3938 	bool is_integrity;
3939 	bool on_daemon;
3940 	size_t buf_off;
3941 	u16 tag, q_id;
3942 	ssize_t ret;
3943 
3944 	if (!user_backed_iter(iter))
3945 		return -EACCES;
3946 
3947 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
3948 		return -EACCES;
3949 
3950 	tag = ublk_pos_to_tag(iocb->ki_pos);
3951 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
3952 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
3953 	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
3954 
3955 	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
3956 		return -EINVAL;
3957 
3958 	if (q_id >= ub->dev_info.nr_hw_queues)
3959 		return -EINVAL;
3960 
3961 	ubq = ublk_get_queue(ub, q_id);
3962 	if (!ublk_dev_support_user_copy(ub))
3963 		return -EACCES;
3964 
3965 	if (tag >= ub->dev_info.queue_depth)
3966 		return -EINVAL;
3967 
3968 	io = &ubq->ios[tag];
3969 	on_daemon = current == READ_ONCE(io->task);
3970 	if (on_daemon) {
3971 		/* On daemon, io can't be completed concurrently, so skip ref */
3972 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3973 			return -EINVAL;
3974 
3975 		req = io->req;
3976 		if (!ublk_rq_has_data(req))
3977 			return -EINVAL;
3978 	} else {
3979 		req = __ublk_check_and_get_req(ub, q_id, tag, io);
3980 		if (!req)
3981 			return -EINVAL;
3982 	}
3983 
3984 	if (is_integrity) {
3985 		struct blk_integrity *bi = &req->q->limits.integrity;
3986 
3987 		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
3988 	} else {
3989 		data_len = blk_rq_bytes(req);
3990 	}
3991 	if (buf_off > data_len) {
3992 		ret = -EINVAL;
3993 		goto out;
3994 	}
3995 
3996 	if (!ublk_check_ubuf_dir(req, dir)) {
3997 		ret = -EACCES;
3998 		goto out;
3999 	}
4000 
4001 	if (is_integrity)
4002 		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4003 	else
4004 		ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4005 
4006 out:
4007 	if (!on_daemon)
4008 		ublk_put_req_ref(io, req);
4009 	return ret;
4010 }
4011 
4012 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4013 {
4014 	return ublk_user_copy(iocb, to, ITER_DEST);
4015 }
4016 
4017 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4018 {
4019 	return ublk_user_copy(iocb, from, ITER_SOURCE);
4020 }
4021 
4022 static const struct file_operations ublk_ch_fops = {
4023 	.owner = THIS_MODULE,
4024 	.open = ublk_ch_open,
4025 	.release = ublk_ch_release,
4026 	.read_iter = ublk_ch_read_iter,
4027 	.write_iter = ublk_ch_write_iter,
4028 	.uring_cmd = ublk_ch_uring_cmd,
4029 	.mmap = ublk_ch_mmap,
4030 };
4031 
4032 static const struct file_operations ublk_ch_batch_io_fops = {
4033 	.owner = THIS_MODULE,
4034 	.open = ublk_ch_open,
4035 	.release = ublk_ch_release,
4036 	.read_iter = ublk_ch_read_iter,
4037 	.write_iter = ublk_ch_write_iter,
4038 	.uring_cmd = ublk_ch_batch_io_uring_cmd,
4039 	.mmap = ublk_ch_mmap,
4040 };
4041 
4042 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4043 {
4044 	int size, i;
4045 
4046 	size = ublk_queue_cmd_buf_size(ub);
4047 
4048 	for (i = 0; i < ubq->q_depth; i++) {
4049 		struct ublk_io *io = &ubq->ios[i];
4050 		if (io->task)
4051 			put_task_struct(io->task);
4052 		WARN_ON_ONCE(refcount_read(&io->ref));
4053 		WARN_ON_ONCE(io->task_registered_buffers);
4054 	}
4055 
4056 	if (ubq->io_cmd_buf)
4057 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4058 
4059 	if (ublk_dev_support_batch_io(ub))
4060 		ublk_io_evts_deinit(ubq);
4061 
4062 	kvfree(ubq);
4063 }
4064 
4065 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4066 {
4067 	struct ublk_queue *ubq = ub->queues[q_id];
4068 
4069 	if (!ubq)
4070 		return;
4071 
4072 	__ublk_deinit_queue(ub, ubq);
4073 	ub->queues[q_id] = NULL;
4074 }
4075 
4076 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4077 {
4078 	unsigned int cpu;
4079 
4080 	/* Find first CPU mapped to this queue */
4081 	for_each_possible_cpu(cpu) {
4082 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4083 			return cpu_to_node(cpu);
4084 	}
4085 
4086 	return NUMA_NO_NODE;
4087 }
4088 
4089 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4090 {
4091 	int depth = ub->dev_info.queue_depth;
4092 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4093 	struct ublk_queue *ubq;
4094 	struct page *page;
4095 	int numa_node;
4096 	int size, i, ret;
4097 
4098 	/* Determine NUMA node based on queue's CPU affinity */
4099 	numa_node = ublk_get_queue_numa_node(ub, q_id);
4100 
4101 	/* Allocate queue structure on local NUMA node */
4102 	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4103 			    numa_node);
4104 	if (!ubq)
4105 		return -ENOMEM;
4106 
4107 	spin_lock_init(&ubq->cancel_lock);
4108 	ubq->flags = ub->dev_info.flags;
4109 	ubq->q_id = q_id;
4110 	ubq->q_depth = depth;
4111 	size = ublk_queue_cmd_buf_size(ub);
4112 
4113 	/* Allocate I/O command buffer on local NUMA node */
4114 	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4115 	if (!page) {
4116 		kvfree(ubq);
4117 		return -ENOMEM;
4118 	}
4119 	ubq->io_cmd_buf = page_address(page);
4120 
4121 	for (i = 0; i < ubq->q_depth; i++)
4122 		spin_lock_init(&ubq->ios[i].lock);
4123 
4124 	if (ublk_dev_support_batch_io(ub)) {
4125 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4126 		if (ret)
4127 			goto fail;
4128 		INIT_LIST_HEAD(&ubq->fcmd_head);
4129 	}
4130 	ub->queues[q_id] = ubq;
4131 	ubq->dev = ub;
4132 
4133 	return 0;
4134 fail:
4135 	__ublk_deinit_queue(ub, ubq);
4136 	return ret;
4137 }
4138 
4139 static void ublk_deinit_queues(struct ublk_device *ub)
4140 {
4141 	int i;
4142 
4143 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4144 		ublk_deinit_queue(ub, i);
4145 }
4146 
4147 static int ublk_init_queues(struct ublk_device *ub)
4148 {
4149 	int i, ret;
4150 
4151 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4152 		ret = ublk_init_queue(ub, i);
4153 		if (ret)
4154 			goto fail;
4155 	}
4156 
4157 	init_completion(&ub->completion);
4158 	return 0;
4159 
4160  fail:
4161 	ublk_deinit_queues(ub);
4162 	return ret;
4163 }
4164 
4165 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4166 {
4167 	int i = idx;
4168 	int err;
4169 
4170 	spin_lock(&ublk_idr_lock);
4171 	/* allocate id, if @id >= 0, we're requesting that specific id */
4172 	if (i >= 0) {
4173 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4174 		if (err == -ENOSPC)
4175 			err = -EEXIST;
4176 	} else {
4177 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4178 				GFP_NOWAIT);
4179 	}
4180 	spin_unlock(&ublk_idr_lock);
4181 
4182 	if (err >= 0)
4183 		ub->ub_number = err;
4184 
4185 	return err;
4186 }
4187 
4188 static void ublk_free_dev_number(struct ublk_device *ub)
4189 {
4190 	spin_lock(&ublk_idr_lock);
4191 	idr_remove(&ublk_index_idr, ub->ub_number);
4192 	wake_up_all(&ublk_idr_wq);
4193 	spin_unlock(&ublk_idr_lock);
4194 }
4195 
4196 static void ublk_cdev_rel(struct device *dev)
4197 {
4198 	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4199 
4200 	blk_mq_free_tag_set(&ub->tag_set);
4201 	ublk_deinit_queues(ub);
4202 	ublk_free_dev_number(ub);
4203 	mutex_destroy(&ub->mutex);
4204 	mutex_destroy(&ub->cancel_mutex);
4205 	kfree(ub);
4206 }
4207 
4208 static int ublk_add_chdev(struct ublk_device *ub)
4209 {
4210 	struct device *dev = &ub->cdev_dev;
4211 	int minor = ub->ub_number;
4212 	int ret;
4213 
4214 	dev->parent = ublk_misc.this_device;
4215 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4216 	dev->class = &ublk_chr_class;
4217 	dev->release = ublk_cdev_rel;
4218 	device_initialize(dev);
4219 
4220 	ret = dev_set_name(dev, "ublkc%d", minor);
4221 	if (ret)
4222 		goto fail;
4223 
4224 	if (ublk_dev_support_batch_io(ub))
4225 		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4226 	else
4227 		cdev_init(&ub->cdev, &ublk_ch_fops);
4228 	ret = cdev_device_add(&ub->cdev, dev);
4229 	if (ret)
4230 		goto fail;
4231 
4232 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4233 		unprivileged_ublks_added++;
4234 	return 0;
4235  fail:
4236 	put_device(dev);
4237 	return ret;
4238 }
4239 
4240 /* align max io buffer size with PAGE_SIZE */
4241 static void ublk_align_max_io_size(struct ublk_device *ub)
4242 {
4243 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4244 
4245 	ub->dev_info.max_io_buf_bytes =
4246 		round_down(max_io_bytes, PAGE_SIZE);
4247 }
4248 
4249 static int ublk_add_tag_set(struct ublk_device *ub)
4250 {
4251 	if (ublk_dev_support_batch_io(ub))
4252 		ub->tag_set.ops = &ublk_batch_mq_ops;
4253 	else
4254 		ub->tag_set.ops = &ublk_mq_ops;
4255 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4256 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4257 	ub->tag_set.numa_node = NUMA_NO_NODE;
4258 	ub->tag_set.driver_data = ub;
4259 	return blk_mq_alloc_tag_set(&ub->tag_set);
4260 }
4261 
4262 static void ublk_remove(struct ublk_device *ub)
4263 {
4264 	bool unprivileged;
4265 
4266 	ublk_stop_dev(ub);
4267 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
4268 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4269 	ublk_put_device(ub);
4270 
4271 	if (unprivileged)
4272 		unprivileged_ublks_added--;
4273 }
4274 
4275 static struct ublk_device *ublk_get_device_from_id(int idx)
4276 {
4277 	struct ublk_device *ub = NULL;
4278 
4279 	if (idx < 0)
4280 		return NULL;
4281 
4282 	spin_lock(&ublk_idr_lock);
4283 	ub = idr_find(&ublk_index_idr, idx);
4284 	if (ub)
4285 		ub = ublk_get_device(ub);
4286 	spin_unlock(&ublk_idr_lock);
4287 
4288 	return ub;
4289 }
4290 
4291 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4292 {
4293 	rcu_read_lock();
4294 	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4295 	rcu_read_unlock();
4296 
4297 	return ub->ublksrv_tgid == ublksrv_pid;
4298 }
4299 
4300 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4301 		const struct ublksrv_ctrl_cmd *header)
4302 {
4303 	const struct ublk_param_basic *p = &ub->params.basic;
4304 	int ublksrv_pid = (int)header->data[0];
4305 	struct queue_limits lim = {
4306 		.logical_block_size	= 1 << p->logical_bs_shift,
4307 		.physical_block_size	= 1 << p->physical_bs_shift,
4308 		.io_min			= 1 << p->io_min_shift,
4309 		.io_opt			= 1 << p->io_opt_shift,
4310 		.max_hw_sectors		= p->max_sectors,
4311 		.chunk_sectors		= p->chunk_sectors,
4312 		.virt_boundary_mask	= p->virt_boundary_mask,
4313 		.max_segments		= USHRT_MAX,
4314 		.max_segment_size	= UINT_MAX,
4315 		.dma_alignment		= 3,
4316 	};
4317 	struct gendisk *disk;
4318 	int ret = -EINVAL;
4319 
4320 	if (ublksrv_pid <= 0)
4321 		return -EINVAL;
4322 	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4323 		return -EINVAL;
4324 
4325 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4326 		const struct ublk_param_discard *pd = &ub->params.discard;
4327 
4328 		lim.discard_alignment = pd->discard_alignment;
4329 		lim.discard_granularity = pd->discard_granularity;
4330 		lim.max_hw_discard_sectors = pd->max_discard_sectors;
4331 		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4332 		lim.max_discard_segments = pd->max_discard_segments;
4333 	}
4334 
4335 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4336 		const struct ublk_param_zoned *p = &ub->params.zoned;
4337 
4338 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4339 			return -EOPNOTSUPP;
4340 
4341 		lim.features |= BLK_FEAT_ZONED;
4342 		lim.max_active_zones = p->max_active_zones;
4343 		lim.max_open_zones =  p->max_open_zones;
4344 		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4345 	}
4346 
4347 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4348 		lim.features |= BLK_FEAT_WRITE_CACHE;
4349 		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4350 			lim.features |= BLK_FEAT_FUA;
4351 	}
4352 
4353 	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4354 		lim.features |= BLK_FEAT_ROTATIONAL;
4355 
4356 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4357 		lim.dma_alignment = ub->params.dma.alignment;
4358 
4359 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4360 		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4361 		lim.max_segment_size = ub->params.seg.max_segment_size;
4362 		lim.max_segments = ub->params.seg.max_segments;
4363 	}
4364 
4365 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4366 		const struct ublk_param_integrity *p = &ub->params.integrity;
4367 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4368 
4369 		lim.max_integrity_segments =
4370 			p->max_integrity_segments ?: USHRT_MAX;
4371 		lim.integrity = (struct blk_integrity) {
4372 			.flags = ublk_integrity_flags(p->flags),
4373 			.csum_type = ublk_integrity_csum_type(p->csum_type),
4374 			.metadata_size = p->metadata_size,
4375 			.pi_offset = p->pi_offset,
4376 			.interval_exp = p->interval_exp,
4377 			.tag_size = p->tag_size,
4378 			.pi_tuple_size = pi_tuple_size,
4379 		};
4380 	}
4381 
4382 	if (wait_for_completion_interruptible(&ub->completion) != 0)
4383 		return -EINTR;
4384 
4385 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4386 		return -EINVAL;
4387 
4388 	mutex_lock(&ub->mutex);
4389 	/* device may become not ready in case of F_BATCH */
4390 	if (!ublk_dev_ready(ub)) {
4391 		ret = -EINVAL;
4392 		goto out_unlock;
4393 	}
4394 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4395 	    test_bit(UB_STATE_USED, &ub->state)) {
4396 		ret = -EEXIST;
4397 		goto out_unlock;
4398 	}
4399 
4400 	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4401 	if (IS_ERR(disk)) {
4402 		ret = PTR_ERR(disk);
4403 		goto out_unlock;
4404 	}
4405 	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4406 	disk->fops = &ub_fops;
4407 	disk->private_data = ub;
4408 
4409 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4410 	ub->ub_disk = disk;
4411 
4412 	ublk_apply_params(ub);
4413 
4414 	/*
4415 	 * Suppress partition scan to avoid potential IO hang.
4416 	 *
4417 	 * If ublk server error occurs during partition scan, the IO may
4418 	 * wait while holding ub->mutex, which can deadlock with other
4419 	 * operations that need the mutex. Defer partition scan to async
4420 	 * work.
4421 	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4422 	 * permanently.
4423 	 */
4424 	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4425 
4426 	ublk_get_device(ub);
4427 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4428 
4429 	if (ublk_dev_is_zoned(ub)) {
4430 		ret = ublk_revalidate_disk_zones(ub);
4431 		if (ret)
4432 			goto out_put_cdev;
4433 	}
4434 
4435 	ret = add_disk(disk);
4436 	if (ret)
4437 		goto out_put_cdev;
4438 
4439 	set_bit(UB_STATE_USED, &ub->state);
4440 
4441 	/* Skip partition scan if disabled by user */
4442 	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4443 		clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4444 	} else {
4445 		/* Schedule async partition scan for trusted daemons */
4446 		if (!ub->unprivileged_daemons)
4447 			schedule_work(&ub->partition_scan_work);
4448 	}
4449 
4450 out_put_cdev:
4451 	if (ret) {
4452 		ublk_detach_disk(ub);
4453 		ublk_put_device(ub);
4454 	}
4455 	if (ret)
4456 		put_disk(disk);
4457 out_unlock:
4458 	mutex_unlock(&ub->mutex);
4459 	return ret;
4460 }
4461 
4462 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4463 		const struct ublksrv_ctrl_cmd *header)
4464 {
4465 	void __user *argp = (void __user *)(unsigned long)header->addr;
4466 	cpumask_var_t cpumask;
4467 	unsigned long queue;
4468 	unsigned int retlen;
4469 	unsigned int i;
4470 	int ret;
4471 
4472 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4473 		return -EINVAL;
4474 	if (header->len & (sizeof(unsigned long)-1))
4475 		return -EINVAL;
4476 	if (!header->addr)
4477 		return -EINVAL;
4478 
4479 	queue = header->data[0];
4480 	if (queue >= ub->dev_info.nr_hw_queues)
4481 		return -EINVAL;
4482 
4483 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4484 		return -ENOMEM;
4485 
4486 	for_each_possible_cpu(i) {
4487 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4488 			cpumask_set_cpu(i, cpumask);
4489 	}
4490 
4491 	ret = -EFAULT;
4492 	retlen = min_t(unsigned short, header->len, cpumask_size());
4493 	if (copy_to_user(argp, cpumask, retlen))
4494 		goto out_free_cpumask;
4495 	if (retlen != header->len &&
4496 	    clear_user(argp + retlen, header->len - retlen))
4497 		goto out_free_cpumask;
4498 
4499 	ret = 0;
4500 out_free_cpumask:
4501 	free_cpumask_var(cpumask);
4502 	return ret;
4503 }
4504 
4505 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4506 {
4507 	pr_devel("%s: dev id %d flags %llx\n", __func__,
4508 			info->dev_id, info->flags);
4509 	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4510 			info->nr_hw_queues, info->queue_depth);
4511 }
4512 
4513 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
4514 {
4515 	void __user *argp = (void __user *)(unsigned long)header->addr;
4516 	struct ublksrv_ctrl_dev_info info;
4517 	struct ublk_device *ub;
4518 	int ret = -EINVAL;
4519 
4520 	if (header->len < sizeof(info) || !header->addr)
4521 		return -EINVAL;
4522 	if (header->queue_id != (u16)-1) {
4523 		pr_warn("%s: queue_id is wrong %x\n",
4524 			__func__, header->queue_id);
4525 		return -EINVAL;
4526 	}
4527 
4528 	if (copy_from_user(&info, argp, sizeof(info)))
4529 		return -EFAULT;
4530 
4531 	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
4532 	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
4533 		return -EINVAL;
4534 
4535 	if (capable(CAP_SYS_ADMIN))
4536 		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
4537 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
4538 		return -EPERM;
4539 
4540 	/* forbid nonsense combinations of recovery flags */
4541 	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
4542 	case 0:
4543 	case UBLK_F_USER_RECOVERY:
4544 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
4545 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
4546 		break;
4547 	default:
4548 		pr_warn("%s: invalid recovery flags %llx\n", __func__,
4549 			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
4550 		return -EINVAL;
4551 	}
4552 
4553 	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
4554 		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
4555 		return -EINVAL;
4556 	}
4557 
4558 	/*
4559 	 * unprivileged device can't be trusted, but RECOVERY and
4560 	 * RECOVERY_REISSUE still may hang error handling, so can't
4561 	 * support recovery features for unprivileged ublk now
4562 	 *
4563 	 * TODO: provide forward progress for RECOVERY handler, so that
4564 	 * unprivileged device can benefit from it
4565 	 */
4566 	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
4567 		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
4568 				UBLK_F_USER_RECOVERY);
4569 
4570 		/*
4571 		 * For USER_COPY, we depends on userspace to fill request
4572 		 * buffer by pwrite() to ublk char device, which can't be
4573 		 * used for unprivileged device
4574 		 *
4575 		 * Same with zero copy or auto buffer register.
4576 		 */
4577 		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4578 					UBLK_F_AUTO_BUF_REG))
4579 			return -EINVAL;
4580 	}
4581 
4582 	/* User copy is required to access integrity buffer */
4583 	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
4584 		return -EINVAL;
4585 
4586 	/* the created device is always owned by current user */
4587 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
4588 
4589 	if (header->dev_id != info.dev_id) {
4590 		pr_warn("%s: dev id not match %u %u\n",
4591 			__func__, header->dev_id, info.dev_id);
4592 		return -EINVAL;
4593 	}
4594 
4595 	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
4596 		pr_warn("%s: dev id is too large. Max supported is %d\n",
4597 			__func__, UBLK_MAX_UBLKS - 1);
4598 		return -EINVAL;
4599 	}
4600 
4601 	ublk_dump_dev_info(&info);
4602 
4603 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4604 	if (ret)
4605 		return ret;
4606 
4607 	ret = -EACCES;
4608 	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
4609 	    unprivileged_ublks_added >= unprivileged_ublks_max)
4610 		goto out_unlock;
4611 
4612 	ret = -ENOMEM;
4613 	ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL);
4614 	if (!ub)
4615 		goto out_unlock;
4616 	mutex_init(&ub->mutex);
4617 	spin_lock_init(&ub->lock);
4618 	mutex_init(&ub->cancel_mutex);
4619 	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
4620 
4621 	ret = ublk_alloc_dev_number(ub, header->dev_id);
4622 	if (ret < 0)
4623 		goto out_free_ub;
4624 
4625 	memcpy(&ub->dev_info, &info, sizeof(info));
4626 
4627 	/* update device id */
4628 	ub->dev_info.dev_id = ub->ub_number;
4629 
4630 	/*
4631 	 * 64bit flags will be copied back to userspace as feature
4632 	 * negotiation result, so have to clear flags which driver
4633 	 * doesn't support yet, then userspace can get correct flags
4634 	 * (features) to handle.
4635 	 */
4636 	ub->dev_info.flags &= UBLK_F_ALL;
4637 
4638 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
4639 		UBLK_F_URING_CMD_COMP_IN_TASK |
4640 		UBLK_F_PER_IO_DAEMON |
4641 		UBLK_F_BUF_REG_OFF_DAEMON |
4642 		UBLK_F_SAFE_STOP_DEV;
4643 
4644 	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
4645 	if (ublk_dev_support_batch_io(ub))
4646 		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
4647 
4648 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
4649 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4650 				UBLK_F_AUTO_BUF_REG))
4651 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4652 
4653 	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
4654 	if (ublk_dev_support_batch_io(ub))
4655 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4656 
4657 	/*
4658 	 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
4659 	 * returning write_append_lba, which is only allowed in case of
4660 	 * user copy or zero copy
4661 	 */
4662 	if (ublk_dev_is_zoned(ub) &&
4663 	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
4664 	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
4665 		ret = -EINVAL;
4666 		goto out_free_dev_number;
4667 	}
4668 
4669 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
4670 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
4671 	ublk_align_max_io_size(ub);
4672 
4673 	ret = ublk_add_tag_set(ub);
4674 	if (ret)
4675 		goto out_free_dev_number;
4676 
4677 	ret = ublk_init_queues(ub);
4678 	if (ret)
4679 		goto out_free_tag_set;
4680 
4681 	ret = -EFAULT;
4682 	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
4683 		goto out_deinit_queues;
4684 
4685 	/*
4686 	 * Add the char dev so that ublksrv daemon can be setup.
4687 	 * ublk_add_chdev() will cleanup everything if it fails.
4688 	 */
4689 	ret = ublk_add_chdev(ub);
4690 	goto out_unlock;
4691 
4692 out_deinit_queues:
4693 	ublk_deinit_queues(ub);
4694 out_free_tag_set:
4695 	blk_mq_free_tag_set(&ub->tag_set);
4696 out_free_dev_number:
4697 	ublk_free_dev_number(ub);
4698 out_free_ub:
4699 	mutex_destroy(&ub->mutex);
4700 	mutex_destroy(&ub->cancel_mutex);
4701 	kfree(ub);
4702 out_unlock:
4703 	mutex_unlock(&ublk_ctl_mutex);
4704 	return ret;
4705 }
4706 
4707 static inline bool ublk_idr_freed(int id)
4708 {
4709 	void *ptr;
4710 
4711 	spin_lock(&ublk_idr_lock);
4712 	ptr = idr_find(&ublk_index_idr, id);
4713 	spin_unlock(&ublk_idr_lock);
4714 
4715 	return ptr == NULL;
4716 }
4717 
4718 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4719 {
4720 	struct ublk_device *ub = *p_ub;
4721 	int idx = ub->ub_number;
4722 	int ret;
4723 
4724 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4725 	if (ret)
4726 		return ret;
4727 
4728 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4729 		ublk_remove(ub);
4730 		set_bit(UB_STATE_DELETED, &ub->state);
4731 	}
4732 
4733 	/* Mark the reference as consumed */
4734 	*p_ub = NULL;
4735 	ublk_put_device(ub);
4736 	mutex_unlock(&ublk_ctl_mutex);
4737 
4738 	/*
4739 	 * Wait until the idr is removed, then it can be reused after
4740 	 * DEL_DEV command is returned.
4741 	 *
4742 	 * If we returns because of user interrupt, future delete command
4743 	 * may come:
4744 	 *
4745 	 * - the device number isn't freed, this device won't or needn't
4746 	 *   be deleted again, since UB_STATE_DELETED is set, and device
4747 	 *   will be released after the last reference is dropped
4748 	 *
4749 	 * - the device number is freed already, we will not find this
4750 	 *   device via ublk_get_device_from_id()
4751 	 */
4752 	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4753 		return -EINTR;
4754 	return 0;
4755 }
4756 
4757 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4758 				      const struct ublksrv_ctrl_cmd *header)
4759 {
4760 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4761 			__func__, cmd_op, header->dev_id, header->queue_id,
4762 			header->data[0], header->addr, header->len);
4763 }
4764 
4765 static void ublk_ctrl_stop_dev(struct ublk_device *ub)
4766 {
4767 	ublk_stop_dev(ub);
4768 }
4769 
4770 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4771 {
4772 	struct gendisk *disk;
4773 	int ret = 0;
4774 
4775 	disk = ublk_get_disk(ub);
4776 	if (!disk)
4777 		return -ENODEV;
4778 
4779 	mutex_lock(&disk->open_mutex);
4780 	if (disk_openers(disk) > 0) {
4781 		ret = -EBUSY;
4782 		goto unlock;
4783 	}
4784 	ub->block_open = true;
4785 	/* release open_mutex as del_gendisk() will reacquire it */
4786 	mutex_unlock(&disk->open_mutex);
4787 
4788 	ublk_ctrl_stop_dev(ub);
4789 	goto out;
4790 
4791 unlock:
4792 	mutex_unlock(&disk->open_mutex);
4793 out:
4794 	ublk_put_disk(disk);
4795 	return ret;
4796 }
4797 
4798 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4799 		const struct ublksrv_ctrl_cmd *header)
4800 {
4801 	struct task_struct *p;
4802 	struct pid *pid;
4803 	struct ublksrv_ctrl_dev_info dev_info;
4804 	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4805 	void __user *argp = (void __user *)(unsigned long)header->addr;
4806 
4807 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4808 		return -EINVAL;
4809 
4810 	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4811 	dev_info.ublksrv_pid = -1;
4812 
4813 	if (init_ublksrv_tgid > 0) {
4814 		rcu_read_lock();
4815 		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4816 		p = pid_task(pid, PIDTYPE_TGID);
4817 		if (p) {
4818 			int vnr = task_tgid_vnr(p);
4819 
4820 			if (vnr)
4821 				dev_info.ublksrv_pid = vnr;
4822 		}
4823 		rcu_read_unlock();
4824 	}
4825 
4826 	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4827 		return -EFAULT;
4828 
4829 	return 0;
4830 }
4831 
4832 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4833 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4834 {
4835 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4836 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4837 
4838 	if (ub->ub_disk) {
4839 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4840 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4841 	} else {
4842 		ub->params.devt.disk_major = 0;
4843 		ub->params.devt.disk_minor = 0;
4844 	}
4845 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4846 }
4847 
4848 static int ublk_ctrl_get_params(struct ublk_device *ub,
4849 		const struct ublksrv_ctrl_cmd *header)
4850 {
4851 	void __user *argp = (void __user *)(unsigned long)header->addr;
4852 	struct ublk_params_header ph;
4853 	int ret;
4854 
4855 	if (header->len <= sizeof(ph) || !header->addr)
4856 		return -EINVAL;
4857 
4858 	if (copy_from_user(&ph, argp, sizeof(ph)))
4859 		return -EFAULT;
4860 
4861 	if (ph.len > header->len || !ph.len)
4862 		return -EINVAL;
4863 
4864 	if (ph.len > sizeof(struct ublk_params))
4865 		ph.len = sizeof(struct ublk_params);
4866 
4867 	mutex_lock(&ub->mutex);
4868 	ublk_ctrl_fill_params_devt(ub);
4869 	if (copy_to_user(argp, &ub->params, ph.len))
4870 		ret = -EFAULT;
4871 	else
4872 		ret = 0;
4873 	mutex_unlock(&ub->mutex);
4874 
4875 	return ret;
4876 }
4877 
4878 static int ublk_ctrl_set_params(struct ublk_device *ub,
4879 		const struct ublksrv_ctrl_cmd *header)
4880 {
4881 	void __user *argp = (void __user *)(unsigned long)header->addr;
4882 	struct ublk_params_header ph;
4883 	int ret = -EFAULT;
4884 
4885 	if (header->len <= sizeof(ph) || !header->addr)
4886 		return -EINVAL;
4887 
4888 	if (copy_from_user(&ph, argp, sizeof(ph)))
4889 		return -EFAULT;
4890 
4891 	if (ph.len > header->len || !ph.len || !ph.types)
4892 		return -EINVAL;
4893 
4894 	if (ph.len > sizeof(struct ublk_params))
4895 		ph.len = sizeof(struct ublk_params);
4896 
4897 	mutex_lock(&ub->mutex);
4898 	if (test_bit(UB_STATE_USED, &ub->state)) {
4899 		/*
4900 		 * Parameters can only be changed when device hasn't
4901 		 * been started yet
4902 		 */
4903 		ret = -EACCES;
4904 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
4905 		ret = -EFAULT;
4906 	} else {
4907 		/* clear all we don't support yet */
4908 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
4909 		ret = ublk_validate_params(ub);
4910 		if (ret)
4911 			ub->params.types = 0;
4912 	}
4913 	mutex_unlock(&ub->mutex);
4914 
4915 	return ret;
4916 }
4917 
4918 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
4919 {
4920 	int ret = -EINVAL;
4921 
4922 	mutex_lock(&ub->mutex);
4923 	if (ublk_nosrv_should_stop_dev(ub))
4924 		goto out_unlock;
4925 	/*
4926 	 * START_RECOVERY is only allowd after:
4927 	 *
4928 	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
4929 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
4930 	 *     released.
4931 	 *
4932 	 * and one of the following holds
4933 	 *
4934 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
4935 	 *     (a)has quiesced request queue
4936 	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
4937 	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
4938 	 *     (d)has completed/camceled all ioucmds owned by ther dying process
4939 	 *
4940 	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
4941 	 *     quiesced, but all I/O is being immediately errored
4942 	 */
4943 	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
4944 		ret = -EBUSY;
4945 		goto out_unlock;
4946 	}
4947 	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
4948 	init_completion(&ub->completion);
4949 	ret = 0;
4950  out_unlock:
4951 	mutex_unlock(&ub->mutex);
4952 	return ret;
4953 }
4954 
4955 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
4956 		const struct ublksrv_ctrl_cmd *header)
4957 {
4958 	int ublksrv_pid = (int)header->data[0];
4959 	int ret = -EINVAL;
4960 
4961 	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
4962 		 header->dev_id);
4963 
4964 	if (wait_for_completion_interruptible(&ub->completion))
4965 		return -EINTR;
4966 
4967 	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
4968 		 header->dev_id);
4969 
4970 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4971 		return -EINVAL;
4972 
4973 	mutex_lock(&ub->mutex);
4974 	if (ublk_nosrv_should_stop_dev(ub))
4975 		goto out_unlock;
4976 
4977 	if (!ublk_dev_in_recoverable_state(ub)) {
4978 		ret = -EBUSY;
4979 		goto out_unlock;
4980 	}
4981 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4982 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4983 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
4984 			__func__, ublksrv_pid, header->dev_id);
4985 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
4986 	ret = 0;
4987  out_unlock:
4988 	mutex_unlock(&ub->mutex);
4989 	return ret;
4990 }
4991 
4992 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
4993 {
4994 	void __user *argp = (void __user *)(unsigned long)header->addr;
4995 	u64 features = UBLK_F_ALL;
4996 
4997 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
4998 		return -EINVAL;
4999 
5000 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5001 		return -EFAULT;
5002 
5003 	return 0;
5004 }
5005 
5006 static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5007 {
5008 	struct ublk_param_basic *p = &ub->params.basic;
5009 	u64 new_size = header->data[0];
5010 
5011 	mutex_lock(&ub->mutex);
5012 	p->dev_sectors = new_size;
5013 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5014 	mutex_unlock(&ub->mutex);
5015 }
5016 
5017 struct count_busy {
5018 	const struct ublk_queue *ubq;
5019 	unsigned int nr_busy;
5020 };
5021 
5022 static bool ublk_count_busy_req(struct request *rq, void *data)
5023 {
5024 	struct count_busy *idle = data;
5025 
5026 	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5027 		idle->nr_busy += 1;
5028 	return true;
5029 }
5030 
5031 /* uring_cmd is guaranteed to be active if the associated request is idle */
5032 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5033 {
5034 	struct count_busy data = {
5035 		.ubq = ubq,
5036 	};
5037 
5038 	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5039 	return data.nr_busy < ubq->q_depth;
5040 }
5041 
5042 /* Wait until each hw queue has at least one idle IO */
5043 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5044 				 unsigned int timeout_ms)
5045 {
5046 	unsigned int elapsed = 0;
5047 	int ret;
5048 
5049 	/*
5050 	 * For UBLK_F_BATCH_IO ublk server can get notified with existing
5051 	 * or new fetch command, so needn't wait any more
5052 	 */
5053 	if (ublk_dev_support_batch_io(ub))
5054 		return 0;
5055 
5056 	while (elapsed < timeout_ms && !signal_pending(current)) {
5057 		unsigned int queues_cancelable = 0;
5058 		int i;
5059 
5060 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5061 			struct ublk_queue *ubq = ublk_get_queue(ub, i);
5062 
5063 			queues_cancelable += !!ubq_has_idle_io(ubq);
5064 		}
5065 
5066 		/*
5067 		 * Each queue needs at least one active command for
5068 		 * notifying ublk server
5069 		 */
5070 		if (queues_cancelable == ub->dev_info.nr_hw_queues)
5071 			break;
5072 
5073 		msleep(UBLK_REQUEUE_DELAY_MS);
5074 		elapsed += UBLK_REQUEUE_DELAY_MS;
5075 	}
5076 
5077 	if (signal_pending(current))
5078 		ret = -EINTR;
5079 	else if (elapsed >= timeout_ms)
5080 		ret = -EBUSY;
5081 	else
5082 		ret = 0;
5083 
5084 	return ret;
5085 }
5086 
5087 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5088 				 const struct ublksrv_ctrl_cmd *header)
5089 {
5090 	/* zero means wait forever */
5091 	u64 timeout_ms = header->data[0];
5092 	struct gendisk *disk;
5093 	int ret = -ENODEV;
5094 
5095 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5096 		return -EOPNOTSUPP;
5097 
5098 	mutex_lock(&ub->mutex);
5099 	disk = ublk_get_disk(ub);
5100 	if (!disk)
5101 		goto unlock;
5102 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5103 		goto put_disk;
5104 
5105 	ret = 0;
5106 	/* already in expected state */
5107 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5108 		goto put_disk;
5109 
5110 	/* Mark the device as canceling */
5111 	mutex_lock(&ub->cancel_mutex);
5112 	blk_mq_quiesce_queue(disk->queue);
5113 	ublk_set_canceling(ub, true);
5114 	blk_mq_unquiesce_queue(disk->queue);
5115 	mutex_unlock(&ub->cancel_mutex);
5116 
5117 	if (!timeout_ms)
5118 		timeout_ms = UINT_MAX;
5119 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5120 
5121 put_disk:
5122 	ublk_put_disk(disk);
5123 unlock:
5124 	mutex_unlock(&ub->mutex);
5125 
5126 	/* Cancel pending uring_cmd */
5127 	if (!ret)
5128 		ublk_cancel_dev(ub);
5129 	return ret;
5130 }
5131 
5132 /*
5133  * All control commands are sent via /dev/ublk-control, so we have to check
5134  * the destination device's permission
5135  */
5136 static int ublk_char_dev_permission(struct ublk_device *ub,
5137 		const char *dev_path, int mask)
5138 {
5139 	int err;
5140 	struct path path;
5141 	struct kstat stat;
5142 
5143 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5144 	if (err)
5145 		return err;
5146 
5147 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5148 	if (err)
5149 		goto exit;
5150 
5151 	err = -EPERM;
5152 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5153 		goto exit;
5154 
5155 	err = inode_permission(&nop_mnt_idmap,
5156 			d_backing_inode(path.dentry), mask);
5157 exit:
5158 	path_put(&path);
5159 	return err;
5160 }
5161 
5162 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5163 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5164 {
5165 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5166 	void __user *argp = (void __user *)(unsigned long)header->addr;
5167 	char *dev_path = NULL;
5168 	int ret = 0;
5169 	int mask;
5170 
5171 	if (!unprivileged) {
5172 		if (!capable(CAP_SYS_ADMIN))
5173 			return -EPERM;
5174 		/*
5175 		 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
5176 		 * char_dev_path in payload too, since userspace may not
5177 		 * know if the specified device is created as unprivileged
5178 		 * mode.
5179 		 */
5180 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5181 			return 0;
5182 	}
5183 
5184 	/*
5185 	 * User has to provide the char device path for unprivileged ublk
5186 	 *
5187 	 * header->addr always points to the dev path buffer, and
5188 	 * header->dev_path_len records length of dev path buffer.
5189 	 */
5190 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5191 		return -EINVAL;
5192 
5193 	if (header->len < header->dev_path_len)
5194 		return -EINVAL;
5195 
5196 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5197 	if (IS_ERR(dev_path))
5198 		return PTR_ERR(dev_path);
5199 
5200 	ret = -EINVAL;
5201 	switch (_IOC_NR(cmd_op)) {
5202 	case UBLK_CMD_GET_DEV_INFO:
5203 	case UBLK_CMD_GET_DEV_INFO2:
5204 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5205 	case UBLK_CMD_GET_PARAMS:
5206 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5207 		mask = MAY_READ;
5208 		break;
5209 	case UBLK_CMD_START_DEV:
5210 	case UBLK_CMD_STOP_DEV:
5211 	case UBLK_CMD_ADD_DEV:
5212 	case UBLK_CMD_DEL_DEV:
5213 	case UBLK_CMD_SET_PARAMS:
5214 	case UBLK_CMD_START_USER_RECOVERY:
5215 	case UBLK_CMD_END_USER_RECOVERY:
5216 	case UBLK_CMD_UPDATE_SIZE:
5217 	case UBLK_CMD_QUIESCE_DEV:
5218 	case UBLK_CMD_TRY_STOP_DEV:
5219 		mask = MAY_READ | MAY_WRITE;
5220 		break;
5221 	default:
5222 		goto exit;
5223 	}
5224 
5225 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5226 	if (!ret) {
5227 		header->len -= header->dev_path_len;
5228 		header->addr += header->dev_path_len;
5229 	}
5230 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5231 			__func__, ub->ub_number, cmd_op,
5232 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5233 			dev_path, ret);
5234 exit:
5235 	kfree(dev_path);
5236 	return ret;
5237 }
5238 
5239 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5240 {
5241 	switch (_IOC_NR(cmd_op)) {
5242 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5243 	case UBLK_CMD_GET_DEV_INFO:
5244 	case UBLK_CMD_GET_DEV_INFO2:
5245 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5246 		return false;
5247 	default:
5248 		return true;
5249 	}
5250 }
5251 
5252 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5253 		unsigned int issue_flags)
5254 {
5255 	/* May point to userspace-mapped memory */
5256 	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
5257 	struct ublksrv_ctrl_cmd header;
5258 	struct ublk_device *ub = NULL;
5259 	u32 cmd_op = cmd->cmd_op;
5260 	int ret = -EINVAL;
5261 
5262 	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5263 	    issue_flags & IO_URING_F_NONBLOCK)
5264 		return -EAGAIN;
5265 
5266 	if (!(issue_flags & IO_URING_F_SQE128))
5267 		return -EINVAL;
5268 
5269 	header.dev_id = READ_ONCE(ub_src->dev_id);
5270 	header.queue_id = READ_ONCE(ub_src->queue_id);
5271 	header.len = READ_ONCE(ub_src->len);
5272 	header.addr = READ_ONCE(ub_src->addr);
5273 	header.data[0] = READ_ONCE(ub_src->data[0]);
5274 	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5275 	ublk_ctrl_cmd_dump(cmd_op, &header);
5276 
5277 	ret = ublk_check_cmd_op(cmd_op);
5278 	if (ret)
5279 		goto out;
5280 
5281 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5282 		ret = ublk_ctrl_get_features(&header);
5283 		goto out;
5284 	}
5285 
5286 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5287 		ret = -ENODEV;
5288 		ub = ublk_get_device_from_id(header.dev_id);
5289 		if (!ub)
5290 			goto out;
5291 
5292 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5293 		if (ret)
5294 			goto put_dev;
5295 	}
5296 
5297 	switch (_IOC_NR(cmd_op)) {
5298 	case UBLK_CMD_START_DEV:
5299 		ret = ublk_ctrl_start_dev(ub, &header);
5300 		break;
5301 	case UBLK_CMD_STOP_DEV:
5302 		ublk_ctrl_stop_dev(ub);
5303 		ret = 0;
5304 		break;
5305 	case UBLK_CMD_GET_DEV_INFO:
5306 	case UBLK_CMD_GET_DEV_INFO2:
5307 		ret = ublk_ctrl_get_dev_info(ub, &header);
5308 		break;
5309 	case UBLK_CMD_ADD_DEV:
5310 		ret = ublk_ctrl_add_dev(&header);
5311 		break;
5312 	case UBLK_CMD_DEL_DEV:
5313 		ret = ublk_ctrl_del_dev(&ub, true);
5314 		break;
5315 	case UBLK_CMD_DEL_DEV_ASYNC:
5316 		ret = ublk_ctrl_del_dev(&ub, false);
5317 		break;
5318 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5319 		ret = ublk_ctrl_get_queue_affinity(ub, &header);
5320 		break;
5321 	case UBLK_CMD_GET_PARAMS:
5322 		ret = ublk_ctrl_get_params(ub, &header);
5323 		break;
5324 	case UBLK_CMD_SET_PARAMS:
5325 		ret = ublk_ctrl_set_params(ub, &header);
5326 		break;
5327 	case UBLK_CMD_START_USER_RECOVERY:
5328 		ret = ublk_ctrl_start_recovery(ub);
5329 		break;
5330 	case UBLK_CMD_END_USER_RECOVERY:
5331 		ret = ublk_ctrl_end_recovery(ub, &header);
5332 		break;
5333 	case UBLK_CMD_UPDATE_SIZE:
5334 		ublk_ctrl_set_size(ub, &header);
5335 		ret = 0;
5336 		break;
5337 	case UBLK_CMD_QUIESCE_DEV:
5338 		ret = ublk_ctrl_quiesce_dev(ub, &header);
5339 		break;
5340 	case UBLK_CMD_TRY_STOP_DEV:
5341 		ret = ublk_ctrl_try_stop_dev(ub);
5342 		break;
5343 	default:
5344 		ret = -EOPNOTSUPP;
5345 		break;
5346 	}
5347 
5348  put_dev:
5349 	if (ub)
5350 		ublk_put_device(ub);
5351  out:
5352 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5353 			__func__, ret, cmd_op, header.dev_id, header.queue_id);
5354 	return ret;
5355 }
5356 
5357 static const struct file_operations ublk_ctl_fops = {
5358 	.open		= nonseekable_open,
5359 	.uring_cmd      = ublk_ctrl_uring_cmd,
5360 	.owner		= THIS_MODULE,
5361 	.llseek		= noop_llseek,
5362 };
5363 
5364 static struct miscdevice ublk_misc = {
5365 	.minor		= MISC_DYNAMIC_MINOR,
5366 	.name		= "ublk-control",
5367 	.fops		= &ublk_ctl_fops,
5368 };
5369 
5370 static int __init ublk_init(void)
5371 {
5372 	int ret;
5373 
5374 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5375 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5376 	/*
5377 	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5378 	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5379 	 */
5380 	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5381 		     UBLKSRV_IO_INTEGRITY_FLAG);
5382 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5383 
5384 	init_waitqueue_head(&ublk_idr_wq);
5385 
5386 	ret = misc_register(&ublk_misc);
5387 	if (ret)
5388 		return ret;
5389 
5390 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5391 	if (ret)
5392 		goto unregister_mis;
5393 
5394 	ret = class_register(&ublk_chr_class);
5395 	if (ret)
5396 		goto free_chrdev_region;
5397 
5398 	return 0;
5399 
5400 free_chrdev_region:
5401 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5402 unregister_mis:
5403 	misc_deregister(&ublk_misc);
5404 	return ret;
5405 }
5406 
5407 static void __exit ublk_exit(void)
5408 {
5409 	struct ublk_device *ub;
5410 	int id;
5411 
5412 	idr_for_each_entry(&ublk_index_idr, ub, id)
5413 		ublk_remove(ub);
5414 
5415 	class_unregister(&ublk_chr_class);
5416 	misc_deregister(&ublk_misc);
5417 
5418 	idr_destroy(&ublk_index_idr);
5419 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5420 }
5421 
5422 module_init(ublk_init);
5423 module_exit(ublk_exit);
5424 
5425 static int ublk_set_max_unprivileged_ublks(const char *buf,
5426 					   const struct kernel_param *kp)
5427 {
5428 	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5429 }
5430 
5431 static int ublk_get_max_unprivileged_ublks(char *buf,
5432 					   const struct kernel_param *kp)
5433 {
5434 	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5435 }
5436 
5437 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5438 	.set = ublk_set_max_unprivileged_ublks,
5439 	.get = ublk_get_max_unprivileged_ublks,
5440 };
5441 
5442 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5443 		&unprivileged_ublks_max, 0644);
5444 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");
5445 
5446 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5447 MODULE_DESCRIPTION("Userspace block device");
5448 MODULE_LICENSE("GPL");
5449