xref: /linux/drivers/block/ublk_drv.c (revision bd7b7ce96db4487bb77692a85ee4489fd2c395df)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - block device which IO is handled from userspace
4  *
5  * Take full use of io_uring passthrough command for communicating with
6  * ublk userspace daemon(ublksrvd) for handling basic IO request.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <uapi/linux/fs.h>
50 #include <uapi/linux/ublk_cmd.h>
51 
/* One entry per possible minor number; also used as UBLK_MAX_UBLKS below */
#define UBLK_MINORS		(1U << MINORBITS)

/* All-ones u16 value; presumably marks "no valid buffer index" (confirm at users) */
#define UBLK_INVALID_BUF_IDX 	((u16)-1)

/*
 * private ioctl command mirror: the _IOC_NR() part of the corresponding
 * ioctl-encoded UBLK_U_CMD_* control commands
 */
#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)

/* Same mirroring for the ioctl-encoded UBLK_U_IO_* buffer (un)register commands */
#define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
64 
/*
 * All UBLK_F_* have to be included into UBLK_F_ALL.
 *
 * UBLK_F_INTEGRITY is only part of the mask when CONFIG_BLK_DEV_INTEGRITY
 * is enabled.
 */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
		| UBLK_F_URING_CMD_COMP_IN_TASK \
		| UBLK_F_NEED_GET_DATA \
		| UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED \
		| UBLK_F_USER_RECOVERY_FAIL_IO \
		| UBLK_F_UPDATE_SIZE \
		| UBLK_F_AUTO_BUF_REG \
		| UBLK_F_QUIESCE \
		| UBLK_F_PER_IO_DAEMON \
		| UBLK_F_BUF_REG_OFF_DAEMON \
		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
		| UBLK_F_SAFE_STOP_DEV \
		| UBLK_F_BATCH_IO \
		| UBLK_F_NO_AUTO_PART_SCAN)

/* Mask of every user-recovery related UBLK_F_* flag */
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_USER_RECOVERY_FAIL_IO)

/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL                                \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
	 UBLK_PARAM_TYPE_INTEGRITY)

/* Mask of the UBLK_BATCH_F_* flags listed below */
#define UBLK_BATCH_F_ALL  \
	(UBLK_BATCH_F_HAS_ZONE_LBA | \
	 UBLK_BATCH_F_HAS_BUF_ADDR | \
	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
101 
/*
 * ublk batch fetch uring_cmd
 *
 * Allocated by ublk_batch_alloc_fcmd() and freed by ublk_batch_free_fcmd().
 */
struct ublk_batch_fetch_cmd {
	/* list linkage; unlinked with list_del_init() in ublk_batch_deinit_fetch_buf() */
	struct list_head node;
	/* the uring_cmd this fetch command wraps */
	struct io_uring_cmd *cmd;
	/* snapshot of cmd->sqe->buf_index taken at allocation time */
	unsigned short buf_group;
};
108 
/* Driver-private data carried with a ublk uring_cmd */
struct ublk_uring_cmd_pdu {
	/*
	 * Store requests in same batch temporarily for queuing them to
	 * daemon context.
	 *
	 * It should have been stored to request payload, but we do want
	 * to avoid extra pre-allocation, and uring_cmd payload is always
	 * free for us
	 */
	union {
		struct request *req;
		struct request *req_list;
	};

	/*
	 * The following two (ubq and the tag/fcmd union) are valid for the
	 * whole lifetime of this cmd, and are set up in the ublk uring_cmd
	 * handler
	 */
	struct ublk_queue *ubq;

	union {
		u16 tag;
		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
	};
};
134 
/*
 * Bundle of arguments passed around by the batch I/O code paths (for
 * example ublk_batch_dispatch() and ublk_batch_deinit_fetch_buf()).
 */
struct ublk_batch_io_data {
	struct ublk_device *ub;
	struct io_uring_cmd *cmd;
	/* NOTE(review): presumably a copy of the batch header from the SQE - confirm */
	struct ublk_batch_io header;
	/* forwarded to io_uring_cmd_done() when a fetch command is completed */
	unsigned int issue_flags;
	struct io_comp_batch *iob;
};
142 
/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by the ublk driver, and is
 * waiting for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command will be completed, and is owned by
 * the ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
 * for cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command needs to get the
 * data buffer address from ublksrv.
 *
 * Then, bio data could be copied into this data buffer for a WRITE request
 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 */
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08

/*
 * request buffer is registered automatically, so we have to unregister it
 * before completing this request.
 *
 * io_uring will unregister buffer automatically for us during exiting.
 */
#define UBLK_IO_FLAG_AUTO_BUF_REG 	0x10

/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED	0x80000000

/*
 * Initialize refcount to a large number to include any registered buffers.
 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
 * any buffers registered on the io daemon task.
 */
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)

/* used for UBLK_F_BATCH_IO only; all-ones unsigned short sentinel tag */
#define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
192 
/*
 * Per-io buffer description: either a plain 64-bit address (copied into
 * iod->addr by ublk_setup_iod_zoned()) or auto buffer registration data.
 */
union ublk_io_buf {
	__u64	addr;
	struct ublk_auto_buf_reg auto_reg;
};
197 
/* Per-tag io slot of a ublk queue (element of ublk_queue::ios[]) */
struct ublk_io {
	union ublk_io_buf buf;
	/* bitmask of UBLK_IO_FLAG_* */
	unsigned int flags;
	/* NOTE(review): presumably the result committed by the ublk server - confirm */
	int res;

	union {
		/* valid if UBLK_IO_FLAG_ACTIVE is set */
		struct io_uring_cmd *cmd;
		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
		struct request *req;
	};

	/*
	 * The io daemon task; ublk_io_release() compares it with current to
	 * pick which counter to update. NULL in batch io mode.
	 */
	struct task_struct *task;

	/*
	 * The number of uses of this I/O by the ublk server
	 * if user copy or zero copy are enabled:
	 * - UBLK_REFCOUNT_INIT from dispatch to the server
	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
	 * - 1 for each io_uring registered buffer not registered on task
	 * The I/O can only be completed once all references are dropped.
	 * User copy and buffer registration operations are only permitted
	 * if the reference count is nonzero.
	 */
	refcount_t ref;
	/* Count of buffers registered on task and not yet unregistered */
	unsigned task_registered_buffers;

	void *buf_ctx_handle;
	/* taken via ublk_io_lock(); protects task_registered_buffers updates */
	spinlock_t lock;
} ____cacheline_aligned_in_smp;
230 
/* Per hardware-queue state of a ublk device */
struct ublk_queue {
	int q_id;
	/* number of elements in ios[] */
	int q_depth;

	/* tested against UBLK_F_* by the ublk_support_*() helpers */
	unsigned long flags;
	/* io descriptor array, indexed by tag (see ublk_get_iod()) */
	struct ublksrv_io_desc *io_cmd_buf;

	bool force_abort;
	bool canceling;
	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
	spinlock_t		cancel_lock;
	struct ublk_device *dev;
	u32 nr_io_ready;

	/*
	 * For supporting UBLK_F_BATCH_IO only.
	 *
	 * Inflight ublk request tags are saved in this fifo
	 *
	 * There are multiple writers from ublk_queue_rq() or ublk_queue_rqs(),
	 * so a lock is required for storing request tags to the fifo
	 *
	 * Make sure there is just one reader fetching requests from the task
	 * work function to the ublk server, so there is no need to grab the
	 * lock on the reader side.
	 *
	 * Batch I/O State Management:
	 *
	 * The batch I/O system uses implicit state management based on the
	 * combination of three key variables below.
	 *
	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
	 *   No fetch commands available, events queue in evts_fifo
	 *
	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
	 *   Fetch commands available but none processing events
	 *
	 * - ACTIVE: active_fcmd
	 *   One fetch command actively processing events from evts_fifo
	 *
	 * Key Invariants:
	 * - At most one active_fcmd at any time (single reader)
	 * - active_fcmd is always from fcmd_head list when non-NULL
	 * - evts_fifo can be read locklessly by the single active reader
	 * - All state transitions require evts_lock protection
	 * - Multiple writers to evts_fifo require lock protection
	 */
	struct {
		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
		spinlock_t evts_lock;

		/* List of fetch commands available to process events */
		struct list_head fcmd_head;

		/* Currently active fetch command (NULL = none active) */
		struct ublk_batch_fetch_cmd  *active_fcmd;
	}____cacheline_aligned_in_smp;

	struct ublk_io ios[] __counted_by(q_depth);
};
291 
/* One ublk device: the block device plus its control/char device state */
struct ublk_device {
	struct gendisk		*ub_disk;

	/* dev_info.flags is tested against UBLK_F_* by the ublk_dev_*() helpers */
	struct ublksrv_ctrl_dev_info	dev_info;

	struct blk_mq_tag_set	tag_set;

	struct cdev		cdev;
	struct device		cdev_dev;

/* NOTE(review): presumably bit numbers within ->state - confirm at users */
#define UB_STATE_OPEN		0
#define UB_STATE_USED		1
#define UB_STATE_DELETED	2
	unsigned long		state;
	int			ub_number;

	struct mutex		mutex;

	spinlock_t		lock;
	struct mm_struct	*mm;

	/* checked by ublk_validate_params() and applied by ublk_apply_params() */
	struct ublk_params	params;

	struct completion	completion;
	u32			nr_queue_ready;
	bool 			unprivileged_daemons;
	struct mutex cancel_mutex;
	bool canceling;
	pid_t 	ublksrv_tgid;
	struct delayed_work	exit_work;
	struct work_struct	partition_scan_work;

	bool			block_open; /* protected by open_mutex */

	/* flexible array of per-queue state pointers */
	struct ublk_queue       *queues[];
};
328 
/*
 * header of ublk_params: the two leading __u32 words, a length and a
 * bitmask of types (cf. the UBLK_PARAM_TYPE_* checks on params.types)
 */
struct ublk_params_header {
	__u32	len;
	__u32	types;
};
334 
/* Forward declarations of static helpers that are referenced before their definitions */
static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
		u16 q_id, u16 tag, struct ublk_io *io);
static inline unsigned int ublk_req_build_flags(struct request *req);
static void ublk_batch_dispatch(struct ublk_queue *ubq,
				const struct ublk_batch_io_data *data,
				struct ublk_batch_fetch_cmd *fcmd);
344 
345 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
346 {
347 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
348 }
349 
350 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
351 {
352 	return ubq->flags & UBLK_F_BATCH_IO;
353 }
354 
/* Acquire the per-io spinlock (io->lock); pair with ublk_io_unlock(). */
static inline void ublk_io_lock(struct ublk_io *io)
{
	spin_lock(&io->lock);
}
359 
/* Release the per-io spinlock taken by ublk_io_lock(). */
static inline void ublk_io_unlock(struct ublk_io *io)
{
	spin_unlock(&io->lock);
}
364 
365 /* Initialize the event queue */
366 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
367 				    int numa_node)
368 {
369 	spin_lock_init(&q->evts_lock);
370 	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
371 }
372 
373 /* Check if event queue is empty */
374 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
375 {
376 	return kfifo_is_empty(&q->evts_fifo);
377 }
378 
379 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
380 {
381 	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
382 	kfifo_free(&q->evts_fifo);
383 }
384 
385 static inline struct ublksrv_io_desc *
386 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
387 {
388 	return &ubq->io_cmd_buf[tag];
389 }
390 
391 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
392 {
393 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
394 }
395 
396 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
397 {
398 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
399 }
400 
401 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
402 {
403 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
404 }
405 
406 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
407 {
408 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
409 }
410 
411 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
412 {
413 	return ubq->flags & UBLK_F_USER_COPY;
414 }
415 
416 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
417 {
418 	return ub->dev_info.flags & UBLK_F_USER_COPY;
419 }
420 
421 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
422 {
423 	return ub->dev_info.flags & UBLK_F_ZONED;
424 }
425 
426 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
427 {
428 	return ubq->flags & UBLK_F_ZONED;
429 }
430 
431 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
432 {
433 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
434 }
435 
436 #ifdef CONFIG_BLK_DEV_ZONED
437 
/*
 * Side-band description of a REQ_OP_DRV_IN request: ublk_report_zones()
 * fills it in and ublk_setup_iod_zoned() copies it into the io descriptor.
 */
struct ublk_zoned_report_desc {
	/* start sector of the report */
	__u64 sector;
	/* UBLK_IO_OP_* code; only UBLK_IO_OP_REPORT_ZONES is accepted */
	__u32 operation;
	/* number of zones asked for in this request */
	__u32 nr_zones;
};

/* Maps a request pointer (cast to unsigned long) to its report descriptor */
static DEFINE_XARRAY(ublk_zoned_report_descs);
445 
446 static int ublk_zoned_insert_report_desc(const struct request *req,
447 		struct ublk_zoned_report_desc *desc)
448 {
449 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
450 			    desc, GFP_KERNEL);
451 }
452 
453 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
454 		const struct request *req)
455 {
456 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
457 }
458 
459 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
460 		const struct request *req)
461 {
462 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
463 }
464 
465 static int ublk_get_nr_zones(const struct ublk_device *ub)
466 {
467 	const struct ublk_param_basic *p = &ub->params.basic;
468 
469 	/* Zone size is a power of 2 */
470 	return p->dev_sectors >> ilog2(p->chunk_sectors);
471 }
472 
473 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
474 {
475 	return blk_revalidate_disk_zones(ub->ub_disk);
476 }
477 
478 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
479 {
480 	const struct ublk_param_zoned *p = &ub->params.zoned;
481 	int nr_zones;
482 
483 	if (!ublk_dev_is_zoned(ub))
484 		return -EINVAL;
485 
486 	if (!p->max_zone_append_sectors)
487 		return -EINVAL;
488 
489 	nr_zones = ublk_get_nr_zones(ub);
490 
491 	if (p->max_active_zones > nr_zones)
492 		return -EINVAL;
493 
494 	if (p->max_open_zones > nr_zones)
495 		return -EINVAL;
496 
497 	return 0;
498 }
499 
500 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
501 {
502 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
503 }
504 
505 /* Based on virtblk_alloc_report_buffer */
506 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
507 				      unsigned int nr_zones, size_t *buflen)
508 {
509 	struct request_queue *q = ublk->ub_disk->queue;
510 	size_t bufsize;
511 	void *buf;
512 
513 	nr_zones = min_t(unsigned int, nr_zones,
514 			 ublk->ub_disk->nr_zones);
515 
516 	bufsize = nr_zones * sizeof(struct blk_zone);
517 	bufsize =
518 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
519 
520 	while (bufsize >= sizeof(struct blk_zone)) {
521 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
522 		if (buf) {
523 			*buflen = bufsize;
524 			return buf;
525 		}
526 		bufsize >>= 1;
527 	}
528 
529 	*buflen = 0;
530 	return NULL;
531 }
532 
533 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
534 		      unsigned int nr_zones, struct blk_report_zones_args *args)
535 {
536 	struct ublk_device *ub = disk->private_data;
537 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
538 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
539 	unsigned int done_zones = 0;
540 	unsigned int max_zones_per_request;
541 	int ret;
542 	struct blk_zone *buffer;
543 	size_t buffer_length;
544 
545 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
546 			 nr_zones);
547 
548 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
549 	if (!buffer)
550 		return -ENOMEM;
551 
552 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
553 
554 	while (done_zones < nr_zones) {
555 		unsigned int remaining_zones = nr_zones - done_zones;
556 		unsigned int zones_in_request =
557 			min_t(unsigned int, remaining_zones, max_zones_per_request);
558 		struct request *req;
559 		struct ublk_zoned_report_desc desc;
560 		blk_status_t status;
561 
562 		memset(buffer, 0, buffer_length);
563 
564 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
565 		if (IS_ERR(req)) {
566 			ret = PTR_ERR(req);
567 			goto out;
568 		}
569 
570 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
571 		desc.sector = sector;
572 		desc.nr_zones = zones_in_request;
573 		ret = ublk_zoned_insert_report_desc(req, &desc);
574 		if (ret)
575 			goto free_req;
576 
577 		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
578 		if (ret)
579 			goto erase_desc;
580 
581 		status = blk_execute_rq(req, 0);
582 		ret = blk_status_to_errno(status);
583 erase_desc:
584 		ublk_zoned_erase_report_desc(req);
585 free_req:
586 		blk_mq_free_request(req);
587 		if (ret)
588 			goto out;
589 
590 		for (unsigned int i = 0; i < zones_in_request; i++) {
591 			struct blk_zone *zone = buffer + i;
592 
593 			/* A zero length zone means no more zones in this response */
594 			if (!zone->len)
595 				break;
596 
597 			ret = disk_report_zone(disk, zone, i, args);
598 			if (ret)
599 				goto out;
600 
601 			done_zones++;
602 			sector += zone_size_sectors;
603 
604 		}
605 	}
606 
607 	ret = done_zones;
608 
609 out:
610 	kvfree(buffer);
611 	return ret;
612 }
613 
614 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
615 					 struct request *req)
616 {
617 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
618 	struct ublk_io *io = &ubq->ios[req->tag];
619 	struct ublk_zoned_report_desc *desc;
620 	u32 ublk_op;
621 
622 	switch (req_op(req)) {
623 	case REQ_OP_ZONE_OPEN:
624 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
625 		break;
626 	case REQ_OP_ZONE_CLOSE:
627 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
628 		break;
629 	case REQ_OP_ZONE_FINISH:
630 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
631 		break;
632 	case REQ_OP_ZONE_RESET:
633 		ublk_op = UBLK_IO_OP_ZONE_RESET;
634 		break;
635 	case REQ_OP_ZONE_APPEND:
636 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
637 		break;
638 	case REQ_OP_ZONE_RESET_ALL:
639 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
640 		break;
641 	case REQ_OP_DRV_IN:
642 		desc = ublk_zoned_get_report_desc(req);
643 		if (!desc)
644 			return BLK_STS_IOERR;
645 		ublk_op = desc->operation;
646 		switch (ublk_op) {
647 		case UBLK_IO_OP_REPORT_ZONES:
648 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
649 			iod->nr_zones = desc->nr_zones;
650 			iod->start_sector = desc->sector;
651 			return BLK_STS_OK;
652 		default:
653 			return BLK_STS_IOERR;
654 		}
655 	case REQ_OP_DRV_OUT:
656 		/* We do not support drv_out */
657 		return BLK_STS_NOTSUPP;
658 	default:
659 		return BLK_STS_IOERR;
660 	}
661 
662 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
663 	iod->nr_sectors = blk_rq_sectors(req);
664 	iod->start_sector = blk_rq_pos(req);
665 	iod->addr = io->buf.addr;
666 
667 	return BLK_STS_OK;
668 }
669 
670 #else
671 
/* Stubs used when CONFIG_BLK_DEV_ZONED is not enabled */

/* No report_zones implementation without zoned support */
#define ublk_report_zones (NULL)

/* Zoned parameters can never be valid here */
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

/* Nothing to apply */
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
}

/* Nothing to revalidate; always reports success */
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}

/* Every zoned operation is rejected as unsupported */
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	return BLK_STS_NOTSUPP;
}
693 
694 #endif
695 
/* Forward declaration; referenced before its definition */
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
				      bool need_map, struct io_comp_batch *iob);

/* NOTE(review): presumably the base dev_t of the ublk char devices - confirm */
static dev_t ublk_chr_devt;
/* Device class named "ublk-char" */
static const struct class ublk_chr_class = {
	.name = "ublk-char",
};

/* NOTE(review): presumably device-index allocation, guarded by ublk_idr_lock - confirm */
static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

/* Protects unprivileged_ublks_added (among whatever else the control path guards) */
static DEFINE_MUTEX(ublk_ctl_mutex);
709 
710 static struct ublk_batch_fetch_cmd *
711 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
712 {
713 	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
714 
715 	if (fcmd) {
716 		fcmd->cmd = cmd;
717 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
718 	}
719 	return fcmd;
720 }
721 
/* Free a fetch command obtained from ublk_batch_alloc_fcmd(). */
static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
{
	kfree(fcmd);
}
726 
/*
 * Clear ubq->active_fcmd. WRITE_ONCE() is used for the store; the caller in
 * ublk_batch_deinit_fetch_buf() holds ubq->evts_lock around it.
 */
static void __ublk_release_fcmd(struct ublk_queue *ubq)
{
	WRITE_ONCE(ubq->active_fcmd, NULL);
}
731 
732 /*
733  * Nothing can move on, so clear ->active_fcmd, and the caller should stop
734  * dispatching
735  */
736 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
737 					const struct ublk_batch_io_data *data,
738 					struct ublk_batch_fetch_cmd *fcmd,
739 					int res)
740 {
741 	spin_lock(&ubq->evts_lock);
742 	list_del_init(&fcmd->node);
743 	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
744 	__ublk_release_fcmd(ubq);
745 	spin_unlock(&ubq->evts_lock);
746 
747 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
748 	ublk_batch_free_fcmd(fcmd);
749 }
750 
751 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
752 				     struct io_br_sel *sel,
753 				     unsigned int issue_flags)
754 {
755 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
756 		return -ENOBUFS;
757 	return 0;
758 }
759 
760 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
761 				       void __user *buf, const u16 *tag_buf,
762 				       unsigned int len)
763 {
764 	if (copy_to_user(buf, tag_buf, len))
765 		return -EFAULT;
766 	return len;
767 }
768 
/* Upper bound on ublk devices: one per available minor */
#define UBLK_MAX_UBLKS UBLK_MINORS

/*
 * Max number of unprivileged ublk devices allowed to be added
 *
 * It can be extended to a per-user limit in future or even controlled
 * by cgroup.
 */
static unsigned int unprivileged_ublks_max = 64;
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */

/* Misc device object; its initializer is not part of this section */
static struct miscdevice ublk_misc;
781 
782 static inline unsigned ublk_pos_to_hwq(loff_t pos)
783 {
784 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
785 		UBLK_QID_BITS_MASK;
786 }
787 
788 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
789 {
790 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
791 }
792 
793 static inline unsigned ublk_pos_to_tag(loff_t pos)
794 {
795 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
796 		UBLK_TAG_BITS_MASK;
797 }
798 
799 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
800 {
801 	const struct ublk_param_basic *p = &ub->params.basic;
802 
803 	if (p->attrs & UBLK_ATTR_READ_ONLY)
804 		set_disk_ro(ub->ub_disk, true);
805 
806 	set_capacity(ub->ub_disk, p->dev_sectors);
807 }
808 
809 static int ublk_integrity_flags(u32 flags)
810 {
811 	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
812 
813 	if (flags & LBMD_PI_CAP_INTEGRITY) {
814 		flags &= ~LBMD_PI_CAP_INTEGRITY;
815 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
816 	}
817 	if (flags & LBMD_PI_CAP_REFTAG) {
818 		flags &= ~LBMD_PI_CAP_REFTAG;
819 		ret_flags |= BLK_INTEGRITY_REF_TAG;
820 	}
821 	return flags ? -EINVAL : ret_flags;
822 }
823 
824 static int ublk_integrity_pi_tuple_size(u8 csum_type)
825 {
826 	switch (csum_type) {
827 	case LBMD_PI_CSUM_NONE:
828 		return 0;
829 	case LBMD_PI_CSUM_IP:
830 	case LBMD_PI_CSUM_CRC16_T10DIF:
831 		return 8;
832 	case LBMD_PI_CSUM_CRC64_NVME:
833 		return 16;
834 	default:
835 		return -EINVAL;
836 	}
837 }
838 
839 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
840 {
841 	switch (csum_type) {
842 	case LBMD_PI_CSUM_NONE:
843 		return BLK_INTEGRITY_CSUM_NONE;
844 	case LBMD_PI_CSUM_IP:
845 		return BLK_INTEGRITY_CSUM_IP;
846 	case LBMD_PI_CSUM_CRC16_T10DIF:
847 		return BLK_INTEGRITY_CSUM_CRC;
848 	case LBMD_PI_CSUM_CRC64_NVME:
849 		return BLK_INTEGRITY_CSUM_CRC64;
850 	default:
851 		WARN_ON_ONCE(1);
852 		return BLK_INTEGRITY_CSUM_NONE;
853 	}
854 }
855 
856 static int ublk_validate_params(const struct ublk_device *ub)
857 {
858 	/* basic param is the only one which must be set */
859 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
860 		const struct ublk_param_basic *p = &ub->params.basic;
861 
862 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
863 			return -EINVAL;
864 
865 		if (p->logical_bs_shift > p->physical_bs_shift)
866 			return -EINVAL;
867 
868 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
869 			return -EINVAL;
870 
871 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
872 			return -EINVAL;
873 	} else
874 		return -EINVAL;
875 
876 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
877 		const struct ublk_param_discard *p = &ub->params.discard;
878 
879 		/* So far, only support single segment discard */
880 		if (p->max_discard_sectors && p->max_discard_segments != 1)
881 			return -EINVAL;
882 
883 		if (!p->discard_granularity)
884 			return -EINVAL;
885 	}
886 
887 	/* dev_t is read-only */
888 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
889 		return -EINVAL;
890 
891 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
892 		return ublk_dev_param_zoned_validate(ub);
893 	else if (ublk_dev_is_zoned(ub))
894 		return -EINVAL;
895 
896 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
897 		const struct ublk_param_dma_align *p = &ub->params.dma;
898 
899 		if (p->alignment >= PAGE_SIZE)
900 			return -EINVAL;
901 
902 		if (!is_power_of_2(p->alignment + 1))
903 			return -EINVAL;
904 	}
905 
906 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
907 		const struct ublk_param_segment *p = &ub->params.seg;
908 
909 		if (!is_power_of_2(p->seg_boundary_mask + 1))
910 			return -EINVAL;
911 
912 		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
913 			return -EINVAL;
914 		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
915 			return -EINVAL;
916 	}
917 
918 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
919 		const struct ublk_param_integrity *p = &ub->params.integrity;
920 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
921 		int flags = ublk_integrity_flags(p->flags);
922 
923 		if (!ublk_dev_support_integrity(ub))
924 			return -EINVAL;
925 		if (flags < 0)
926 			return flags;
927 		if (pi_tuple_size < 0)
928 			return pi_tuple_size;
929 		if (!p->metadata_size)
930 			return -EINVAL;
931 		if (p->csum_type == LBMD_PI_CSUM_NONE &&
932 		    p->flags & LBMD_PI_CAP_REFTAG)
933 			return -EINVAL;
934 		if (p->pi_offset + pi_tuple_size > p->metadata_size)
935 			return -EINVAL;
936 		if (p->interval_exp < SECTOR_SHIFT ||
937 		    p->interval_exp > ub->params.basic.logical_bs_shift)
938 			return -EINVAL;
939 	}
940 
941 	return 0;
942 }
943 
944 static void ublk_apply_params(struct ublk_device *ub)
945 {
946 	ublk_dev_param_basic_apply(ub);
947 
948 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
949 		ublk_dev_param_zoned_apply(ub);
950 }
951 
952 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
953 {
954 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
955 		!ublk_support_auto_buf_reg(ubq);
956 }
957 
958 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
959 {
960 	return !ublk_dev_support_user_copy(ub) &&
961 	       !ublk_dev_support_zero_copy(ub) &&
962 	       !ublk_dev_support_auto_buf_reg(ub);
963 }
964 
965 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
966 {
967 	/*
968 	 * read()/write() is involved in user copy, so request reference
969 	 * has to be grabbed
970 	 *
971 	 * for zero copy, request buffer need to be registered to io_uring
972 	 * buffer table, so reference is needed
973 	 *
974 	 * For auto buffer register, ublk server still may issue
975 	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
976 	 * so reference is required too.
977 	 */
978 	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
979 		ublk_support_auto_buf_reg(ubq);
980 }
981 
982 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
983 {
984 	return ublk_dev_support_user_copy(ub) ||
985 	       ublk_dev_support_zero_copy(ub) ||
986 	       ublk_dev_support_auto_buf_reg(ub);
987 }
988 
989 /*
990  * ublk IO Reference Counting Design
991  * ==================================
992  *
993  * For user-copy and zero-copy modes, ublk uses a split reference model with
994  * two counters that together track IO lifetime:
995  *
996  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
997  *   - io->task_registered_buffers: count of buffers registered on the IO task
998  *
999  * Key Invariant:
1000  * --------------
1001  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1002  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1003  * when no active references exist. After IO completion, both counters become
1004  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1005  * task_registered_buffers are 0.
1006  *
1007  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1008  * exit to determine if all references have been released.
1009  *
1010  * Why Split Counters:
1011  * -------------------
1012  * Buffers registered on the IO daemon task can use the lightweight
1013  * task_registered_buffers counter (simple increment/decrement) instead of
1014  * atomic refcount operations. The ublk_io_release() callback checks if
1015  * current == io->task to decide which counter to update.
1016  *
1017  * This optimization only applies before IO completion. At completion,
1018  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1019  * After that, all subsequent buffer unregistrations must use the atomic ref
1020  * since they may be releasing the last reference.
1021  *
1022  * Reference Lifecycle:
1023  * --------------------
1024  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1025  *
1026  * 2. During IO processing:
1027  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1028  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1029  *    - Buffer unregister callback (ublk_io_release):
1030  *      * If on-task: task_registered_buffers--
1031  *      * If off-task: ref-- via ublk_put_req_ref()
1032  *
1033  * 3. ublk_sub_req_ref() at IO completion:
1034  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1035  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1036  *    - This effectively collapses task_registered_buffers into the atomic ref,
1037  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1038  *      buffers that were already counted
1039  *
1040  * Example (zero-copy, register on-task, unregister off-task):
1041  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1042  *   - Register buffer on-task: task_registered_buffers = 1
1043  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1044  *   - Completion via ublk_sub_req_ref():
1045  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1046  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1047  *
1048  * Example (auto buffer registration):
1049  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1050  *
1051  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1052  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1053  *   - Completion via ublk_sub_req_ref():
1054  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1055  *
1056  * Example (zero-copy, ublk server killed):
1057  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1058  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1059  *
1060  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1061  *   - Register buffer on-task: task_registered_buffers = 1
1062  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1063  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1064  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
 *   - Sum equals UBLK_REFCOUNT_INIT, so both counters are zeroed by
1066  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1067  *     and abort pending requests
1068  *
1069  * Batch IO Special Case:
1070  * ----------------------
1071  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1072  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1073  * task_registered_buffers counter still tracks registered buffers for the
1074  * invariant check, even though the callback doesn't decrement it.
1075  *
1076  * Note: updating task_registered_buffers is protected by io->lock.
1077  */
1078 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1079 		struct ublk_io *io)
1080 {
1081 	if (ublk_need_req_ref(ubq))
1082 		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1083 }
1084 
1085 static inline bool ublk_get_req_ref(struct ublk_io *io)
1086 {
1087 	return refcount_inc_not_zero(&io->ref);
1088 }
1089 
1090 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1091 {
1092 	if (!refcount_dec_and_test(&io->ref))
1093 		return;
1094 
1095 	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1096 	__ublk_complete_rq(req, io, false, NULL);
1097 }
1098 
1099 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1100 {
1101 	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1102 
1103 	io->task_registered_buffers = 0;
1104 	return refcount_sub_and_test(sub_refs, &io->ref);
1105 }
1106 
1107 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1108 {
1109 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1110 }
1111 
1112 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1113 {
1114 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1115 }
1116 
1117 /* Called in slow path only, keep it noinline for trace purpose */
1118 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1119 {
1120 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1121 		return ub;
1122 	return NULL;
1123 }
1124 
1125 /* Called in slow path only, keep it noinline for trace purpose */
1126 static noinline void ublk_put_device(struct ublk_device *ub)
1127 {
1128 	put_device(&ub->cdev_dev);
1129 }
1130 
1131 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1132 		int qid)
1133 {
1134 	return dev->queues[qid];
1135 }
1136 
1137 static inline bool ublk_rq_has_data(const struct request *rq)
1138 {
1139 	return bio_has_data(rq->bio);
1140 }
1141 
1142 static inline struct ublksrv_io_desc *
1143 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1144 {
1145 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
1146 }
1147 
1148 static inline int __ublk_queue_cmd_buf_size(int depth)
1149 {
1150 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1151 }
1152 
1153 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1154 {
1155 	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1156 }
1157 
1158 static int ublk_max_cmd_buf_size(void)
1159 {
1160 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1161 }
1162 
1163 /*
1164  * Should I/O outstanding to the ublk server when it exits be reissued?
1165  * If not, outstanding I/O will get errors.
1166  */
1167 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1168 {
1169 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1170 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1171 }
1172 
1173 /*
1174  * Should I/O issued while there is no ublk server queue? If not, I/O
1175  * issued while there is no ublk server will get errors.
1176  */
1177 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1178 {
1179 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1180 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1181 }
1182 
1183 /*
1184  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1185  * of the device flags for smaller cache footprint - better for fast
1186  * paths.
1187  */
1188 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1189 {
1190 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1191 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1192 }
1193 
1194 /*
1195  * Should ublk devices be stopped (i.e. no recovery possible) when the
1196  * ublk server exits? If not, devices can be used again by a future
1197  * incarnation of a ublk server via the start_recovery/end_recovery
1198  * commands.
1199  */
1200 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1201 {
1202 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1203 }
1204 
1205 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1206 {
1207 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1208 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1209 }
1210 
1211 static void ublk_free_disk(struct gendisk *disk)
1212 {
1213 	struct ublk_device *ub = disk->private_data;
1214 
1215 	clear_bit(UB_STATE_USED, &ub->state);
1216 	ublk_put_device(ub);
1217 }
1218 
1219 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1220 		unsigned int *owner_gid)
1221 {
1222 	kuid_t uid;
1223 	kgid_t gid;
1224 
1225 	current_uid_gid(&uid, &gid);
1226 
1227 	*owner_uid = from_kuid(&init_user_ns, uid);
1228 	*owner_gid = from_kgid(&init_user_ns, gid);
1229 }
1230 
1231 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1232 {
1233 	struct ublk_device *ub = disk->private_data;
1234 
1235 	if (capable(CAP_SYS_ADMIN))
1236 		return 0;
1237 
1238 	/*
1239 	 * If it is one unprivileged device, only owner can open
1240 	 * the disk. Otherwise it could be one trap made by one
1241 	 * evil user who grants this disk's privileges to other
1242 	 * users deliberately.
1243 	 *
1244 	 * This way is reasonable too given anyone can create
1245 	 * unprivileged device, and no need other's grant.
1246 	 */
1247 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1248 		unsigned int curr_uid, curr_gid;
1249 
1250 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1251 
1252 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1253 				ub->dev_info.owner_gid)
1254 			return -EPERM;
1255 	}
1256 
1257 	if (ub->block_open)
1258 		return -ENXIO;
1259 
1260 	return 0;
1261 }
1262 
1263 static const struct block_device_operations ub_fops = {
1264 	.owner =	THIS_MODULE,
1265 	.open =		ublk_open,
1266 	.free_disk =	ublk_free_disk,
1267 	.report_zones =	ublk_report_zones,
1268 };
1269 
1270 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1271 				struct iov_iter *uiter, int dir, size_t *done)
1272 {
1273 	unsigned len;
1274 	void *bv_buf;
1275 	size_t copied;
1276 
1277 	if (*offset >= bv->bv_len) {
1278 		*offset -= bv->bv_len;
1279 		return true;
1280 	}
1281 
1282 	len = bv->bv_len - *offset;
1283 	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1284 	if (dir == ITER_DEST)
1285 		copied = copy_to_iter(bv_buf, len, uiter);
1286 	else
1287 		copied = copy_from_iter(bv_buf, len, uiter);
1288 
1289 	kunmap_local(bv_buf);
1290 
1291 	*done += copied;
1292 	if (copied < len)
1293 		return false;
1294 
1295 	*offset = 0;
1296 	return true;
1297 }
1298 
1299 /*
1300  * Copy data between request pages and io_iter, and 'offset'
1301  * is the start point of linear offset of request.
1302  */
1303 static size_t ublk_copy_user_pages(const struct request *req,
1304 		unsigned offset, struct iov_iter *uiter, int dir)
1305 {
1306 	struct req_iterator iter;
1307 	struct bio_vec bv;
1308 	size_t done = 0;
1309 
1310 	rq_for_each_segment(bv, req, iter) {
1311 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1312 			break;
1313 	}
1314 	return done;
1315 }
1316 
1317 #ifdef CONFIG_BLK_DEV_INTEGRITY
1318 static size_t ublk_copy_user_integrity(const struct request *req,
1319 		unsigned offset, struct iov_iter *uiter, int dir)
1320 {
1321 	size_t done = 0;
1322 	struct bio *bio = req->bio;
1323 	struct bvec_iter iter;
1324 	struct bio_vec iv;
1325 
1326 	if (!blk_integrity_rq(req))
1327 		return 0;
1328 
1329 	bio_for_each_integrity_vec(iv, bio, iter) {
1330 		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1331 			break;
1332 	}
1333 
1334 	return done;
1335 }
1336 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1337 static size_t ublk_copy_user_integrity(const struct request *req,
1338 		unsigned offset, struct iov_iter *uiter, int dir)
1339 {
1340 	return 0;
1341 }
1342 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1343 
1344 static inline bool ublk_need_map_req(const struct request *req)
1345 {
1346 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1347 }
1348 
1349 static inline bool ublk_need_unmap_req(const struct request *req)
1350 {
1351 	return ublk_rq_has_data(req) &&
1352 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1353 }
1354 
1355 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1356 				const struct request *req,
1357 				const struct ublk_io *io)
1358 {
1359 	const unsigned int rq_bytes = blk_rq_bytes(req);
1360 
1361 	if (!ublk_need_map_io(ubq))
1362 		return rq_bytes;
1363 
1364 	/*
1365 	 * no zero copy, we delay copy WRITE request data into ublksrv
1366 	 * context and the big benefit is that pinning pages in current
1367 	 * context is pretty fast, see ublk_pin_user_pages
1368 	 */
1369 	if (ublk_need_map_req(req)) {
1370 		struct iov_iter iter;
1371 		const int dir = ITER_DEST;
1372 
1373 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1374 		return ublk_copy_user_pages(req, 0, &iter, dir);
1375 	}
1376 	return rq_bytes;
1377 }
1378 
1379 static unsigned int ublk_unmap_io(bool need_map,
1380 		const struct request *req,
1381 		const struct ublk_io *io)
1382 {
1383 	const unsigned int rq_bytes = blk_rq_bytes(req);
1384 
1385 	if (!need_map)
1386 		return rq_bytes;
1387 
1388 	if (ublk_need_unmap_req(req)) {
1389 		struct iov_iter iter;
1390 		const int dir = ITER_SOURCE;
1391 
1392 		WARN_ON_ONCE(io->res > rq_bytes);
1393 
1394 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1395 		return ublk_copy_user_pages(req, 0, &iter, dir);
1396 	}
1397 	return rq_bytes;
1398 }
1399 
1400 static inline unsigned int ublk_req_build_flags(struct request *req)
1401 {
1402 	unsigned flags = 0;
1403 
1404 	if (req->cmd_flags & REQ_FAILFAST_DEV)
1405 		flags |= UBLK_IO_F_FAILFAST_DEV;
1406 
1407 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1408 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1409 
1410 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1411 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
1412 
1413 	if (req->cmd_flags & REQ_META)
1414 		flags |= UBLK_IO_F_META;
1415 
1416 	if (req->cmd_flags & REQ_FUA)
1417 		flags |= UBLK_IO_F_FUA;
1418 
1419 	if (req->cmd_flags & REQ_NOUNMAP)
1420 		flags |= UBLK_IO_F_NOUNMAP;
1421 
1422 	if (req->cmd_flags & REQ_SWAP)
1423 		flags |= UBLK_IO_F_SWAP;
1424 
1425 	if (blk_integrity_rq(req))
1426 		flags |= UBLK_IO_F_INTEGRITY;
1427 
1428 	return flags;
1429 }
1430 
1431 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1432 {
1433 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1434 	struct ublk_io *io = &ubq->ios[req->tag];
1435 	u32 ublk_op;
1436 
1437 	switch (req_op(req)) {
1438 	case REQ_OP_READ:
1439 		ublk_op = UBLK_IO_OP_READ;
1440 		break;
1441 	case REQ_OP_WRITE:
1442 		ublk_op = UBLK_IO_OP_WRITE;
1443 		break;
1444 	case REQ_OP_FLUSH:
1445 		ublk_op = UBLK_IO_OP_FLUSH;
1446 		break;
1447 	case REQ_OP_DISCARD:
1448 		ublk_op = UBLK_IO_OP_DISCARD;
1449 		break;
1450 	case REQ_OP_WRITE_ZEROES:
1451 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1452 		break;
1453 	default:
1454 		if (ublk_queue_is_zoned(ubq))
1455 			return ublk_setup_iod_zoned(ubq, req);
1456 		return BLK_STS_IOERR;
1457 	}
1458 
1459 	/* need to translate since kernel may change */
1460 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
1461 	iod->nr_sectors = blk_rq_sectors(req);
1462 	iod->start_sector = blk_rq_pos(req);
1463 	iod->addr = io->buf.addr;
1464 
1465 	return BLK_STS_OK;
1466 }
1467 
1468 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1469 		struct io_uring_cmd *ioucmd)
1470 {
1471 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1472 }
1473 
/*
 * End @req with status @error while softirqs are disabled.
 *
 * blk_mq_end_request() is bracketed by local_bh_disable() and
 * local_bh_enable(); the comment in __ublk_complete_rq() explains why
 * completion has to run with softirqs disabled.
 */
static void ublk_end_request(struct request *req, blk_status_t error)
{
	local_bh_disable();
	blk_mq_end_request(req, error);
	local_bh_enable();
}
1480 
/*
 * Complete @req according to the result the ublk server stored in
 * io->res.
 *
 * @req:      request to complete
 * @io:       io slot holding the server result
 * @need_map: whether data has to be copied back from the server buffer
 *            (passed on to ublk_unmap_io())
 * @iob:      completion batch handed to blk_mq_add_to_batch(); callers
 *            in this file may pass NULL
 *
 * A negative io->res fails the request with the matching block status.
 * Operations other than READ, WRITE and DRV_IN are ended right away.
 * Otherwise io->res bytes are completed and the request is requeued if
 * blk_update_request() reports that it is not finished yet.
 *
 * todo: handle partial completion
 */
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
				      bool need_map, struct io_comp_batch *iob)
{
	unsigned int unmapped_bytes;
	blk_status_t res = BLK_STS_OK;
	bool requeue;

	/* A READ for which nothing was read is treated as failed */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		res = errno_to_blk_status(io->res);
		goto exit;
	}

	/*
	 * FLUSH, DISCARD and WRITE_ZEROES usually do not report a byte
	 * count, so end them directly.
	 *
	 * None of them needs unmapping either.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
	    req_op(req) != REQ_OP_DRV_IN)
		goto exit;

	/* For a READ request, copy the data at iod->addr into the rq buffers */
	unmapped_bytes = ublk_unmap_io(need_map, req, io);

	/*
	 * Very unlikely, since the data was filled in just before.
	 *
	 * If it does happen, complete only what was copied back; the rest
	 * of the request is simply issued again.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	/*
	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
	 * happens off this path, then that will prevent ublk's blkdev_release()
	 * from being called on current's task work, see fput() implementation.
	 *
	 * Otherwise, the ublk server may not provide forward progress in case
	 * of reading the partition table from bdev_open() with
	 * disk->open_mutex held, which causes a deadlock as we could already
	 * be holding disk->open_mutex here.
	 *
	 * Preferably we would not be doing IO with a mutex held that is also
	 * used for release, but this work-around will suffice for now.
	 */
	local_bh_disable();
	requeue = blk_update_request(req, BLK_STS_OK, io->res);
	local_bh_enable();
	if (requeue)
		blk_mq_requeue_request(req, true);
	else if (likely(!blk_should_fake_timeout(req->q))) {
		/* Either defer the final completion to the batch or end it now */
		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
			return;
		__blk_mq_end_request(req, BLK_STS_OK);
	}

	return;
exit:
	ublk_end_request(req, res);
}
1547 
1548 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1549 						     struct request *req)
1550 {
1551 	/* read cmd first because req will overwrite it */
1552 	struct io_uring_cmd *cmd = io->cmd;
1553 
1554 	/* mark this cmd owned by ublksrv */
1555 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1556 
1557 	/*
1558 	 * clear ACTIVE since we are done with this sqe/cmd slot
1559 	 * We can only accept io cmd in case of being not active.
1560 	 */
1561 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1562 
1563 	io->req = req;
1564 	return cmd;
1565 }
1566 
/*
 * Prepare the io slot for @req and complete its uring command with @res,
 * which tells ublksrv that one io request is coming.
 */
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
				 int res, unsigned issue_flags)
{
	io_uring_cmd_done(__ublk_prep_compl_io_cmd(io, req), res,
			  issue_flags);
}
1575 
/*
 * Delay, in milliseconds, passed to blk_mq_delay_kick_requeue_list() when
 * ublk_start_io() requeues a request because nothing could be mapped.
 */
#define UBLK_REQUEUE_DELAY_MS	3
1577 
1578 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1579 		struct request *rq)
1580 {
1581 	/* We cannot process this rq so just requeue it. */
1582 	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1583 		blk_mq_requeue_request(rq, false);
1584 	else
1585 		ublk_end_request(rq, BLK_STS_IOERR);
1586 }
1587 
1588 static void
1589 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1590 {
1591 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1592 
1593 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1594 }
1595 
/* Outcome of ublk_auto_buf_register(). */
enum auto_buf_reg_res {
	/* registration failed and the request was ended with an error */
	AUTO_BUF_REG_FAIL,
	/* registration failed; UBLK_IO_F_NEED_REG_BUF was set in the iod */
	AUTO_BUF_REG_FALLBACK,
	/* the request buffer was registered */
	AUTO_BUF_REG_OK,
};
1601 
1602 /*
1603  * Setup io state after auto buffer registration.
1604  *
1605  * Must be called after ublk_auto_buf_register() is done.
1606  * Caller must hold io->lock in batch context.
1607  */
1608 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1609 				   struct request *req, struct ublk_io *io,
1610 				   struct io_uring_cmd *cmd,
1611 				   enum auto_buf_reg_res res)
1612 {
1613 	if (res == AUTO_BUF_REG_OK) {
1614 		io->task_registered_buffers = 1;
1615 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1616 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1617 	}
1618 	ublk_init_req_ref(ubq, io);
1619 	__ublk_prep_compl_io_cmd(io, req);
1620 }
1621 
1622 /* Register request bvec to io_uring for auto buffer registration. */
1623 static enum auto_buf_reg_res
1624 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1625 		       struct ublk_io *io, struct io_uring_cmd *cmd,
1626 		       unsigned int issue_flags)
1627 {
1628 	int ret;
1629 
1630 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1631 				      io->buf.auto_reg.index, issue_flags);
1632 	if (ret) {
1633 		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1634 			ublk_auto_buf_reg_fallback(ubq, req->tag);
1635 			return AUTO_BUF_REG_FALLBACK;
1636 		}
1637 		ublk_end_request(req, BLK_STS_IOERR);
1638 		return AUTO_BUF_REG_FAIL;
1639 	}
1640 
1641 	return AUTO_BUF_REG_OK;
1642 }
1643 
1644 /*
1645  * Dispatch IO to userspace with auto buffer registration.
1646  *
1647  * Only called in non-batch context from task work, io->lock not held.
1648  */
1649 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1650 				   struct request *req, struct ublk_io *io,
1651 				   struct io_uring_cmd *cmd,
1652 				   unsigned int issue_flags)
1653 {
1654 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1655 			issue_flags);
1656 
1657 	if (res != AUTO_BUF_REG_FAIL) {
1658 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1659 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1660 	}
1661 }
1662 
1663 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1664 			  struct ublk_io *io)
1665 {
1666 	unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1667 
1668 	/* partially mapped, update io descriptor */
1669 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1670 		/*
1671 		 * Nothing mapped, retry until we succeed.
1672 		 *
1673 		 * We may never succeed in mapping any bytes here because
1674 		 * of OOM. TODO: reserve one buffer with single page pinned
1675 		 * for providing forward progress guarantee.
1676 		 */
1677 		if (unlikely(!mapped_bytes)) {
1678 			blk_mq_requeue_request(req, false);
1679 			blk_mq_delay_kick_requeue_list(req->q,
1680 					UBLK_REQUEUE_DELAY_MS);
1681 			return false;
1682 		}
1683 
1684 		ublk_get_iod(ubq, req->tag)->nr_sectors =
1685 			mapped_bytes >> 9;
1686 	}
1687 
1688 	return true;
1689 }
1690 
1691 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1692 {
1693 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1694 	int tag = req->tag;
1695 	struct ublk_io *io = &ubq->ios[tag];
1696 
1697 	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1698 			__func__, ubq->q_id, req->tag, io->flags,
1699 			ublk_get_iod(ubq, req->tag)->addr);
1700 
1701 	/*
1702 	 * Task is exiting if either:
1703 	 *
1704 	 * (1) current != io->task.
1705 	 * io_uring_cmd_complete_in_task() tries to run task_work
1706 	 * in a workqueue if cmd's task is PF_EXITING.
1707 	 *
1708 	 * (2) current->flags & PF_EXITING.
1709 	 */
1710 	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1711 		__ublk_abort_rq(ubq, req);
1712 		return;
1713 	}
1714 
1715 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1716 		/*
1717 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1718 		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1719 		 * and notify it.
1720 		 */
1721 		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1722 		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1723 				__func__, ubq->q_id, req->tag, io->flags);
1724 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1725 				     issue_flags);
1726 		return;
1727 	}
1728 
1729 	if (!ublk_start_io(ubq, req, io))
1730 		return;
1731 
1732 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1733 		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1734 	} else {
1735 		ublk_init_req_ref(ubq, io);
1736 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1737 	}
1738 }
1739 
1740 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1741 				       const struct ublk_batch_io_data *data,
1742 				       unsigned short tag)
1743 {
1744 	struct ublk_device *ub = data->ub;
1745 	struct ublk_io *io = &ubq->ios[tag];
1746 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1747 	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1748 	struct io_uring_cmd *cmd = data->cmd;
1749 
1750 	if (!ublk_start_io(ubq, req, io))
1751 		return false;
1752 
1753 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1754 		res = ublk_auto_buf_register(ubq, req, io, cmd,
1755 				data->issue_flags);
1756 
1757 		if (res == AUTO_BUF_REG_FAIL)
1758 			return false;
1759 	}
1760 
1761 	ublk_io_lock(io);
1762 	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1763 	ublk_io_unlock(io);
1764 
1765 	return true;
1766 }
1767 
1768 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1769 				     const struct ublk_batch_io_data *data,
1770 				     unsigned short *tag_buf,
1771 				     unsigned int len)
1772 {
1773 	bool has_unused = false;
1774 	unsigned int i;
1775 
1776 	for (i = 0; i < len; i++) {
1777 		unsigned short tag = tag_buf[i];
1778 
1779 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1780 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1781 			has_unused = true;
1782 		}
1783 	}
1784 
1785 	return has_unused;
1786 }
1787 
1788 /*
1789  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1790  * Returns the new length after filtering.
1791  */
1792 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1793 					    unsigned int len)
1794 {
1795 	unsigned int i, j;
1796 
1797 	for (i = 0, j = 0; i < len; i++) {
1798 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1799 			if (i != j)
1800 				tag_buf[j] = tag_buf[i];
1801 			j++;
1802 		}
1803 	}
1804 
1805 	return j;
1806 }
1807 
1808 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1809 		const struct ublk_batch_io_data *data,
1810 		unsigned short *tag_buf, size_t len, int ret)
1811 {
1812 	int i, res;
1813 
1814 	/*
1815 	 * Undo prep state for all IOs since userspace never received them.
1816 	 * This restores IOs to pre-prepared state so they can be cleanly
1817 	 * re-prepared when tags are pulled from FIFO again.
1818 	 */
1819 	for (i = 0; i < len; i++) {
1820 		struct ublk_io *io = &ubq->ios[tag_buf[i]];
1821 		int index = -1;
1822 
1823 		ublk_io_lock(io);
1824 		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1825 			index = io->buf.auto_reg.index;
1826 		io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1827 		io->flags |= UBLK_IO_FLAG_ACTIVE;
1828 		ublk_io_unlock(io);
1829 
1830 		if (index != -1)
1831 			io_buffer_unregister_bvec(data->cmd, index,
1832 					data->issue_flags);
1833 	}
1834 
1835 	res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1836 		tag_buf, len, &ubq->evts_lock);
1837 
1838 	pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1839 			"tags(%d %zu) ret %d\n", __func__, res, len,
1840 			ret);
1841 }
1842 
/* Upper bound on the number of tags handled by one __ublk_batch_dispatch() */
#define MAX_NR_TAG 128
/*
 * Dispatch one batch of tags to userspace through fetch command @fcmd.
 *
 * A buffer is selected from fcmd->buf_group, up to MAX_NR_TAG tags are
 * pulled from the event FIFO and prepared, the surviving tags are copied
 * into the selected buffer and a CQE is posted.
 *
 * Returns a negative value when buffer selection failed (-ENOBUFS when
 * no buffer address was returned) or when posting the CQE failed; in the
 * latter case the batch is rolled back by ublk_batch_dispatch_fail().
 * When every pulled tag failed or was requeued, the number of pulled
 * tags is returned. Otherwise the return value of
 * ublk_batch_fetch_post_cqe() is passed through.
 */
static int __ublk_batch_dispatch(struct ublk_queue *ubq,
				 const struct ublk_batch_io_data *data,
				 struct ublk_batch_fetch_cmd *fcmd)
{
	const unsigned int tag_sz = sizeof(unsigned short);
	unsigned short tag_buf[MAX_NR_TAG];
	struct io_br_sel sel;
	size_t len = 0;
	bool needs_filter;
	int ret;

	WARN_ON_ONCE(data->cmd != fcmd->cmd);

	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
					 data->issue_flags);
	if (sel.val < 0)
		return sel.val;
	if (!sel.addr)
		return -ENOBUFS;

	/*
	 * A single reader needs no lock, and one kfifo element is 2 bytes.
	 * Convert the byte length, capped by the on-stack buffer, into a
	 * tag count before pulling tags from the FIFO.
	 */
	len = min(len, sizeof(tag_buf)) / tag_sz;
	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);

	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
	/* Filter out unused tags before posting to userspace */
	if (unlikely(needs_filter)) {
		int new_len = ublk_filter_unused_tags(tag_buf, len);

		/* return actual length if all are failed or requeued */
		if (!new_len) {
			/* release the selected buffer */
			sel.val = 0;
			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
						&sel, data->issue_flags));
			return len;
		}
		len = new_len;
	}

	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
	/* Userspace never saw these tags: undo the prep and put them back */
	if (unlikely(ret < 0))
		ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
	return ret;
}
1890 
/*
 * Try to make a fetch command the active one of @ubq.
 *
 * Must be called with ubq->evts_lock held.
 *
 * Returns NULL when a fetch command is already active. Otherwise the
 * first entry of ubq->fcmd_head (NULL for an empty list) is stored in
 * ubq->active_fcmd and returned.
 */
static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
		struct ublk_queue *ubq)
{
	struct ublk_batch_fetch_cmd *fcmd;

	lockdep_assert_held(&ubq->evts_lock);

	/*
	 * Order the update of ubq->evts_fifo against the check of
	 * ubq->active_fcmd.
	 *
	 * This pairs with the smp_mb() in ublk_batch_dispatch().
	 *
	 * If ubq->active_fcmd is observed as non-NULL, the barrier pairing
	 * makes the newly added tags visible in ublk_batch_dispatch().
	 */
	smp_mb();
	if (READ_ONCE(ubq->active_fcmd)) {
		fcmd = NULL;
	} else {
		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
				struct ublk_batch_fetch_cmd, node);
		WRITE_ONCE(ubq->active_fcmd, fcmd);
	}
	return fcmd;
}
1916 
1917 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1918 {
1919 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1920 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1921 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1922 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
1923 	struct ublk_batch_io_data data = {
1924 		.ub = pdu->ubq->dev,
1925 		.cmd = fcmd->cmd,
1926 		.issue_flags = issue_flags,
1927 	};
1928 
1929 	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
1930 
1931 	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
1932 }
1933 
/*
 * Drain the event FIFO of @ubq through the active fetch command @fcmd.
 *
 * Batches are dispatched until the FIFO is empty or a dispatch returns
 * zero or an error. On error the fetch buffer is torn down with
 * ublk_batch_deinit_fetch_buf(). Otherwise the active fetch command is
 * released; if tags arrived in the meantime, a fetch command is acquired
 * again and the work is either continued here or handed to task work.
 */
static void
ublk_batch_dispatch(struct ublk_queue *ubq,
		    const struct ublk_batch_io_data *data,
		    struct ublk_batch_fetch_cmd *fcmd)
{
	struct ublk_batch_fetch_cmd *new_fcmd;
	unsigned tried = 0;
	int ret = 0;

again:
	while (!ublk_io_evts_empty(ubq)) {
		ret = __ublk_batch_dispatch(ubq, data, fcmd);
		if (ret <= 0)
			break;
	}

	if (ret < 0) {
		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
		return;
	}

	__ublk_release_fcmd(ubq);
	/*
	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
	 * checking ubq->evts_fifo.
	 *
	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
	 */
	smp_mb();
	if (likely(ublk_io_evts_empty(ubq)))
		return;

	/* Tags were queued after the drain loop: try to become active again */
	spin_lock(&ubq->evts_lock);
	new_fcmd = __ublk_acquire_fcmd(ubq);
	spin_unlock(&ubq->evts_lock);

	/* Somebody else is active already, or no fetch command is available */
	if (!new_fcmd)
		return;

	/* Avoid lockup by allowing to handle at most 32 batches */
	if (new_fcmd == fcmd && tried++ < 32)
		goto again;

	/* Different command, or retry budget used up: continue in task work */
	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
}
1979 
1980 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1981 {
1982 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1983 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1984 	struct ublk_queue *ubq = pdu->ubq;
1985 
1986 	ublk_dispatch_req(ubq, pdu->req);
1987 }
1988 
1989 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
1990 {
1991 	unsigned short tag = rq->tag;
1992 	struct ublk_batch_fetch_cmd *fcmd = NULL;
1993 
1994 	spin_lock(&ubq->evts_lock);
1995 	kfifo_put(&ubq->evts_fifo, tag);
1996 	if (last)
1997 		fcmd = __ublk_acquire_fcmd(ubq);
1998 	spin_unlock(&ubq->evts_lock);
1999 
2000 	if (fcmd)
2001 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2002 }
2003 
2004 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2005 {
2006 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2007 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2008 
2009 	pdu->req = rq;
2010 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2011 }
2012 
2013 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2014 {
2015 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2016 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2017 	struct request *rq = pdu->req_list;
2018 	struct request *next;
2019 
2020 	do {
2021 		next = rq->rq_next;
2022 		rq->rq_next = NULL;
2023 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2024 		rq = next;
2025 	} while (rq);
2026 }
2027 
2028 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2029 {
2030 	struct io_uring_cmd *cmd = io->cmd;
2031 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2032 
2033 	pdu->req_list = rq_list_peek(l);
2034 	rq_list_init(l);
2035 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2036 }
2037 
2038 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2039 {
2040 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2041 	pid_t tgid = ubq->dev->ublksrv_tgid;
2042 	struct task_struct *p;
2043 	struct pid *pid;
2044 
2045 	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2046 		return BLK_EH_RESET_TIMER;
2047 
2048 	if (unlikely(!tgid))
2049 		return BLK_EH_RESET_TIMER;
2050 
2051 	rcu_read_lock();
2052 	pid = find_vpid(tgid);
2053 	p = pid_task(pid, PIDTYPE_PID);
2054 	if (p)
2055 		send_sig(SIGKILL, p, 0);
2056 	rcu_read_unlock();
2057 	return BLK_EH_DONE;
2058 }
2059 
2060 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2061 				  bool check_cancel)
2062 {
2063 	blk_status_t res;
2064 
2065 	if (unlikely(READ_ONCE(ubq->fail_io)))
2066 		return BLK_STS_TARGET;
2067 
2068 	/* With recovery feature enabled, force_abort is set in
2069 	 * ublk_stop_dev() before calling del_gendisk(). We have to
2070 	 * abort all requeued and new rqs here to let del_gendisk()
2071 	 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
2072 	 * to avoid UAF on io_uring ctx.
2073 	 *
2074 	 * Note: force_abort is guaranteed to be seen because it is set
2075 	 * before request queue is unqiuesced.
2076 	 */
2077 	if (ublk_nosrv_should_queue_io(ubq) &&
2078 	    unlikely(READ_ONCE(ubq->force_abort)))
2079 		return BLK_STS_IOERR;
2080 
2081 	if (check_cancel && unlikely(ubq->canceling))
2082 		return BLK_STS_IOERR;
2083 
2084 	/* fill iod to slot in io cmd buffer */
2085 	res = ublk_setup_iod(ubq, rq);
2086 	if (unlikely(res != BLK_STS_OK))
2087 		return BLK_STS_IOERR;
2088 
2089 	blk_mq_start_request(rq);
2090 	return BLK_STS_OK;
2091 }
2092 
2093 /*
2094  * Common helper for queue_rq that handles request preparation and
2095  * cancellation checks. Returns status and sets should_queue to indicate
2096  * whether the caller should proceed with queuing the request.
2097  */
2098 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2099 						   struct request *rq,
2100 						   bool *should_queue)
2101 {
2102 	blk_status_t res;
2103 
2104 	res = ublk_prep_req(ubq, rq, false);
2105 	if (res != BLK_STS_OK) {
2106 		*should_queue = false;
2107 		return res;
2108 	}
2109 
2110 	/*
2111 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2112 	 * is dealt with, otherwise this request may not be failed in case
2113 	 * of recovery, and cause hang when deleting disk
2114 	 */
2115 	if (unlikely(ubq->canceling)) {
2116 		*should_queue = false;
2117 		__ublk_abort_rq(ubq, rq);
2118 		return BLK_STS_OK;
2119 	}
2120 
2121 	*should_queue = true;
2122 	return BLK_STS_OK;
2123 }
2124 
2125 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2126 		const struct blk_mq_queue_data *bd)
2127 {
2128 	struct ublk_queue *ubq = hctx->driver_data;
2129 	struct request *rq = bd->rq;
2130 	bool should_queue;
2131 	blk_status_t res;
2132 
2133 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2134 	if (!should_queue)
2135 		return res;
2136 
2137 	ublk_queue_cmd(ubq, rq);
2138 	return BLK_STS_OK;
2139 }
2140 
2141 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2142 		const struct blk_mq_queue_data *bd)
2143 {
2144 	struct ublk_queue *ubq = hctx->driver_data;
2145 	struct request *rq = bd->rq;
2146 	bool should_queue;
2147 	blk_status_t res;
2148 
2149 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2150 	if (!should_queue)
2151 		return res;
2152 
2153 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2154 	return BLK_STS_OK;
2155 }
2156 
2157 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2158 					     const struct ublk_io *io2)
2159 {
2160 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2161 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2162 		(io->task == io2->task);
2163 }
2164 
2165 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2166 {
2167 	struct ublk_queue *ubq = hctx->driver_data;
2168 	struct ublk_batch_fetch_cmd *fcmd;
2169 
2170 	spin_lock(&ubq->evts_lock);
2171 	fcmd = __ublk_acquire_fcmd(ubq);
2172 	spin_unlock(&ubq->evts_lock);
2173 
2174 	if (fcmd)
2175 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2176 }
2177 
2178 static void ublk_queue_rqs(struct rq_list *rqlist)
2179 {
2180 	struct rq_list requeue_list = { };
2181 	struct rq_list submit_list = { };
2182 	struct ublk_io *io = NULL;
2183 	struct request *req;
2184 
2185 	while ((req = rq_list_pop(rqlist))) {
2186 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2187 		struct ublk_io *this_io = &this_q->ios[req->tag];
2188 
2189 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2190 			rq_list_add_tail(&requeue_list, req);
2191 			continue;
2192 		}
2193 
2194 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
2195 				!rq_list_empty(&submit_list))
2196 			ublk_queue_cmd_list(io, &submit_list);
2197 		io = this_io;
2198 		rq_list_add_tail(&submit_list, req);
2199 	}
2200 
2201 	if (!rq_list_empty(&submit_list))
2202 		ublk_queue_cmd_list(io, &submit_list);
2203 	*rqlist = requeue_list;
2204 }
2205 
2206 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2207 {
2208 	unsigned short tags[MAX_NR_TAG];
2209 	struct ublk_batch_fetch_cmd *fcmd;
2210 	struct request *rq;
2211 	unsigned cnt = 0;
2212 
2213 	spin_lock(&ubq->evts_lock);
2214 	rq_list_for_each(l, rq) {
2215 		tags[cnt++] = (unsigned short)rq->tag;
2216 		if (cnt >= MAX_NR_TAG) {
2217 			kfifo_in(&ubq->evts_fifo, tags, cnt);
2218 			cnt = 0;
2219 		}
2220 	}
2221 	if (cnt)
2222 		kfifo_in(&ubq->evts_fifo, tags, cnt);
2223 	fcmd = __ublk_acquire_fcmd(ubq);
2224 	spin_unlock(&ubq->evts_lock);
2225 
2226 	rq_list_init(l);
2227 	if (fcmd)
2228 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2229 }
2230 
2231 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2232 {
2233 	struct rq_list requeue_list = { };
2234 	struct rq_list submit_list = { };
2235 	struct ublk_queue *ubq = NULL;
2236 	struct request *req;
2237 
2238 	while ((req = rq_list_pop(rqlist))) {
2239 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2240 
2241 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2242 			rq_list_add_tail(&requeue_list, req);
2243 			continue;
2244 		}
2245 
2246 		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2247 			ublk_batch_queue_cmd_list(ubq, &submit_list);
2248 		ubq = this_q;
2249 		rq_list_add_tail(&submit_list, req);
2250 	}
2251 
2252 	if (!rq_list_empty(&submit_list))
2253 		ublk_batch_queue_cmd_list(ubq, &submit_list);
2254 	*rqlist = requeue_list;
2255 }
2256 
2257 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2258 		unsigned int hctx_idx)
2259 {
2260 	struct ublk_device *ub = driver_data;
2261 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2262 
2263 	hctx->driver_data = ubq;
2264 	return 0;
2265 }
2266 
2267 static const struct blk_mq_ops ublk_mq_ops = {
2268 	.queue_rq       = ublk_queue_rq,
2269 	.queue_rqs      = ublk_queue_rqs,
2270 	.init_hctx	= ublk_init_hctx,
2271 	.timeout	= ublk_timeout,
2272 };
2273 
2274 static const struct blk_mq_ops ublk_batch_mq_ops = {
2275 	.commit_rqs	= ublk_commit_rqs,
2276 	.queue_rq       = ublk_batch_queue_rq,
2277 	.queue_rqs      = ublk_batch_queue_rqs,
2278 	.init_hctx	= ublk_init_hctx,
2279 	.timeout	= ublk_timeout,
2280 };
2281 
2282 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2283 {
2284 	int i;
2285 
2286 	ubq->nr_io_ready = 0;
2287 
2288 	for (i = 0; i < ubq->q_depth; i++) {
2289 		struct ublk_io *io = &ubq->ios[i];
2290 
2291 		/*
2292 		 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
2293 		 * io->cmd
2294 		 */
2295 		io->flags &= UBLK_IO_FLAG_CANCELED;
2296 		io->cmd = NULL;
2297 		io->buf.addr = 0;
2298 
2299 		/*
2300 		 * old task is PF_EXITING, put it now
2301 		 *
2302 		 * It could be NULL in case of closing one quiesced
2303 		 * device.
2304 		 */
2305 		if (io->task) {
2306 			put_task_struct(io->task);
2307 			io->task = NULL;
2308 		}
2309 
2310 		WARN_ON_ONCE(refcount_read(&io->ref));
2311 		WARN_ON_ONCE(io->task_registered_buffers);
2312 	}
2313 }
2314 
2315 static int ublk_ch_open(struct inode *inode, struct file *filp)
2316 {
2317 	struct ublk_device *ub = container_of(inode->i_cdev,
2318 			struct ublk_device, cdev);
2319 
2320 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2321 		return -EBUSY;
2322 	filp->private_data = ub;
2323 	ub->ublksrv_tgid = current->tgid;
2324 	return 0;
2325 }
2326 
2327 static void ublk_reset_ch_dev(struct ublk_device *ub)
2328 {
2329 	int i;
2330 
2331 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2332 		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2333 
2334 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2335 	ub->mm = NULL;
2336 	ub->nr_queue_ready = 0;
2337 	ub->unprivileged_daemons = false;
2338 	ub->ublksrv_tgid = -1;
2339 }
2340 
2341 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2342 {
2343 	struct gendisk *disk;
2344 
2345 	spin_lock(&ub->lock);
2346 	disk = ub->ub_disk;
2347 	if (disk)
2348 		get_device(disk_to_dev(disk));
2349 	spin_unlock(&ub->lock);
2350 
2351 	return disk;
2352 }
2353 
/* Drop the device reference of @disk; a NULL @disk is ignored. */
static void ublk_put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
2359 
2360 static void ublk_partition_scan_work(struct work_struct *work)
2361 {
2362 	struct ublk_device *ub =
2363 		container_of(work, struct ublk_device, partition_scan_work);
2364 	/* Hold disk reference to prevent UAF during concurrent teardown */
2365 	struct gendisk *disk = ublk_get_disk(ub);
2366 
2367 	if (!disk)
2368 		return;
2369 
2370 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2371 					     &disk->state)))
2372 		goto out;
2373 
2374 	mutex_lock(&disk->open_mutex);
2375 	bdev_disk_changed(disk, false);
2376 	mutex_unlock(&disk->open_mutex);
2377 out:
2378 	ublk_put_disk(disk);
2379 }
2380 
2381 /*
2382  * Use this function to ensure that ->canceling is consistently set for
2383  * the device and all queues. Do not set these flags directly.
2384  *
2385  * Caller must ensure that:
2386  * - cancel_mutex is held. This ensures that there is no concurrent
2387  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2388  * - there are no concurrent reads of ubq->canceling from the queue_rq
2389  *   path. This can be done by quiescing the queue, or through other
2390  *   means.
2391  */
2392 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2393 	__must_hold(&ub->cancel_mutex)
2394 {
2395 	int i;
2396 
2397 	ub->canceling = canceling;
2398 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2399 		ublk_get_queue(ub, i)->canceling = canceling;
2400 }
2401 
2402 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2403 {
2404 	int i, j;
2405 
2406 	if (!ublk_dev_need_req_ref(ub))
2407 		return false;
2408 
2409 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2410 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2411 
2412 		for (j = 0; j < ubq->q_depth; j++) {
2413 			struct ublk_io *io = &ubq->ios[j];
2414 			unsigned int refs = refcount_read(&io->ref) +
2415 				io->task_registered_buffers;
2416 
2417 			/*
2418 			 * UBLK_REFCOUNT_INIT or zero means no active
2419 			 * reference
2420 			 */
2421 			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2422 				return true;
2423 
2424 			/* reset to zero if the io hasn't active references */
2425 			refcount_set(&io->ref, 0);
2426 			io->task_registered_buffers = 0;
2427 		}
2428 	}
2429 	return false;
2430 }
2431 
2432 static void ublk_ch_release_work_fn(struct work_struct *work)
2433 {
2434 	struct ublk_device *ub =
2435 		container_of(work, struct ublk_device, exit_work.work);
2436 	struct gendisk *disk;
2437 	int i;
2438 
2439 	/*
2440 	 * For zero-copy and auto buffer register modes, I/O references
2441 	 * might not be dropped naturally when the daemon is killed, but
2442 	 * io_uring guarantees that registered bvec kernel buffers are
2443 	 * unregistered finally when freeing io_uring context, then the
2444 	 * active references are dropped.
2445 	 *
2446 	 * Wait until active references are dropped for avoiding use-after-free
2447 	 *
2448 	 * registered buffer may be unregistered in io_ring's release hander,
2449 	 * so have to wait by scheduling work function for avoiding the two
2450 	 * file release dependency.
2451 	 */
2452 	if (ublk_check_and_reset_active_ref(ub)) {
2453 		schedule_delayed_work(&ub->exit_work, 1);
2454 		return;
2455 	}
2456 
2457 	/*
2458 	 * disk isn't attached yet, either device isn't live, or it has
2459 	 * been removed already, so we needn't to do anything
2460 	 */
2461 	disk = ublk_get_disk(ub);
2462 	if (!disk)
2463 		goto out;
2464 
2465 	/*
2466 	 * All uring_cmd are done now, so abort any request outstanding to
2467 	 * the ublk server
2468 	 *
2469 	 * This can be done in lockless way because ublk server has been
2470 	 * gone
2471 	 *
2472 	 * More importantly, we have to provide forward progress guarantee
2473 	 * without holding ub->mutex, otherwise control task grabbing
2474 	 * ub->mutex triggers deadlock
2475 	 *
2476 	 * All requests may be inflight, so ->canceling may not be set, set
2477 	 * it now.
2478 	 */
2479 	mutex_lock(&ub->cancel_mutex);
2480 	ublk_set_canceling(ub, true);
2481 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2482 		ublk_abort_queue(ub, ublk_get_queue(ub, i));
2483 	mutex_unlock(&ub->cancel_mutex);
2484 	blk_mq_kick_requeue_list(disk->queue);
2485 
2486 	/*
2487 	 * All infligh requests have been completed or requeued and any new
2488 	 * request will be failed or requeued via `->canceling` now, so it is
2489 	 * fine to grab ub->mutex now.
2490 	 */
2491 	mutex_lock(&ub->mutex);
2492 
2493 	/* double check after grabbing lock */
2494 	if (!ub->ub_disk)
2495 		goto unlock;
2496 
2497 	/*
2498 	 * Transition the device to the nosrv state. What exactly this
2499 	 * means depends on the recovery flags
2500 	 */
2501 	if (ublk_nosrv_should_stop_dev(ub)) {
2502 		/*
2503 		 * Allow any pending/future I/O to pass through quickly
2504 		 * with an error. This is needed because del_gendisk
2505 		 * waits for all pending I/O to complete
2506 		 */
2507 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2508 			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2509 
2510 		ublk_stop_dev_unlocked(ub);
2511 	} else {
2512 		if (ublk_nosrv_dev_should_queue_io(ub)) {
2513 			/* ->canceling is set and all requests are aborted */
2514 			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2515 		} else {
2516 			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2517 			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2518 				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2519 		}
2520 	}
2521 unlock:
2522 	mutex_unlock(&ub->mutex);
2523 	ublk_put_disk(disk);
2524 
2525 	/* all uring_cmd has been done now, reset device & ubq */
2526 	ublk_reset_ch_dev(ub);
2527 out:
2528 	clear_bit(UB_STATE_OPEN, &ub->state);
2529 
2530 	/* put the reference grabbed in ublk_ch_release() */
2531 	ublk_put_device(ub);
2532 }
2533 
2534 static int ublk_ch_release(struct inode *inode, struct file *filp)
2535 {
2536 	struct ublk_device *ub = filp->private_data;
2537 
2538 	/*
2539 	 * Grab ublk device reference, so it won't be gone until we are
2540 	 * really released from work function.
2541 	 */
2542 	ublk_get_device(ub);
2543 
2544 	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2545 	schedule_delayed_work(&ub->exit_work, 0);
2546 	return 0;
2547 }
2548 
2549 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2550 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2551 {
2552 	struct ublk_device *ub = filp->private_data;
2553 	size_t sz = vma->vm_end - vma->vm_start;
2554 	unsigned max_sz = ublk_max_cmd_buf_size();
2555 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2556 	int q_id, ret = 0;
2557 
2558 	spin_lock(&ub->lock);
2559 	if (!ub->mm)
2560 		ub->mm = current->mm;
2561 	if (current->mm != ub->mm)
2562 		ret = -EINVAL;
2563 	spin_unlock(&ub->lock);
2564 
2565 	if (ret)
2566 		return ret;
2567 
2568 	if (vma->vm_flags & VM_WRITE)
2569 		return -EPERM;
2570 
2571 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2572 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2573 		return -EINVAL;
2574 
2575 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2576 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2577 			__func__, q_id, current->pid, vma->vm_start,
2578 			phys_off, (unsigned long)sz);
2579 
2580 	if (sz != ublk_queue_cmd_buf_size(ub))
2581 		return -EINVAL;
2582 
2583 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2584 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2585 }
2586 
2587 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2588 		struct request *req)
2589 {
2590 	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2591 			io->flags & UBLK_IO_FLAG_ACTIVE);
2592 
2593 	if (ublk_nosrv_should_reissue_outstanding(ub))
2594 		blk_mq_requeue_request(req, false);
2595 	else {
2596 		io->res = -EIO;
2597 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2598 	}
2599 }
2600 
2601 /*
2602  * Request tag may just be filled to event kfifo, not get chance to
2603  * dispatch, abort these requests too
2604  */
2605 static void ublk_abort_batch_queue(struct ublk_device *ub,
2606 				   struct ublk_queue *ubq)
2607 {
2608 	unsigned short tag;
2609 
2610 	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2611 		struct request *req = blk_mq_tag_to_rq(
2612 				ub->tag_set.tags[ubq->q_id], tag);
2613 
2614 		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2615 			__ublk_fail_req(ub, &ubq->ios[tag], req);
2616 	}
2617 }
2618 
2619 /*
2620  * Called from ublk char device release handler, when any uring_cmd is
2621  * done, meantime request queue is "quiesced" since all inflight requests
2622  * can't be completed because ublk server is dead.
2623  *
2624  * So no one can hold our request IO reference any more, simply ignore the
2625  * reference, and complete the request immediately
2626  */
2627 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2628 {
2629 	int i;
2630 
2631 	for (i = 0; i < ubq->q_depth; i++) {
2632 		struct ublk_io *io = &ubq->ios[i];
2633 
2634 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2635 			__ublk_fail_req(ub, io, io->req);
2636 	}
2637 
2638 	if (ublk_support_batch_io(ubq))
2639 		ublk_abort_batch_queue(ub, ubq);
2640 }
2641 
2642 static void ublk_start_cancel(struct ublk_device *ub)
2643 {
2644 	struct gendisk *disk = ublk_get_disk(ub);
2645 
2646 	/* Our disk has been dead */
2647 	if (!disk)
2648 		return;
2649 
2650 	mutex_lock(&ub->cancel_mutex);
2651 	if (ub->canceling)
2652 		goto out;
2653 	/*
2654 	 * Now we are serialized with ublk_queue_rq()
2655 	 *
2656 	 * Make sure that ubq->canceling is set when queue is frozen,
2657 	 * because ublk_queue_rq() has to rely on this flag for avoiding to
2658 	 * touch completed uring_cmd
2659 	 */
2660 	blk_mq_quiesce_queue(disk->queue);
2661 	ublk_set_canceling(ub, true);
2662 	blk_mq_unquiesce_queue(disk->queue);
2663 out:
2664 	mutex_unlock(&ub->cancel_mutex);
2665 	ublk_put_disk(disk);
2666 }
2667 
2668 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2669 		unsigned int issue_flags)
2670 {
2671 	struct ublk_io *io = &ubq->ios[tag];
2672 	struct ublk_device *ub = ubq->dev;
2673 	struct request *req;
2674 	bool done;
2675 
2676 	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2677 		return;
2678 
2679 	/*
2680 	 * Don't try to cancel this command if the request is started for
2681 	 * avoiding race between io_uring_cmd_done() and
2682 	 * io_uring_cmd_complete_in_task().
2683 	 *
2684 	 * Either the started request will be aborted via __ublk_abort_rq(),
2685 	 * then this uring_cmd is canceled next time, or it will be done in
2686 	 * task work function ublk_dispatch_req() because io_uring guarantees
2687 	 * that ublk_dispatch_req() is always called
2688 	 */
2689 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2690 	if (req && blk_mq_request_started(req) && req->tag == tag)
2691 		return;
2692 
2693 	spin_lock(&ubq->cancel_lock);
2694 	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2695 	if (!done)
2696 		io->flags |= UBLK_IO_FLAG_CANCELED;
2697 	spin_unlock(&ubq->cancel_lock);
2698 
2699 	if (!done)
2700 		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
2701 }
2702 
2703 /*
2704  * Cancel a batch fetch command if it hasn't been claimed by another path.
2705  *
2706  * An fcmd can only be cancelled if:
2707  * 1. It's not the active_fcmd (which is currently being processed)
2708  * 2. It's still on the list (!list_empty check) - once removed from the list,
2709  *    the fcmd is considered claimed and will be freed by whoever removed it
2710  *
2711  * Use list_del_init() so subsequent list_empty() checks work correctly.
2712  */
2713 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2714 				  struct ublk_batch_fetch_cmd *fcmd,
2715 				  unsigned int issue_flags)
2716 {
2717 	bool done;
2718 
2719 	spin_lock(&ubq->evts_lock);
2720 	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2721 	if (done)
2722 		list_del_init(&fcmd->node);
2723 	spin_unlock(&ubq->evts_lock);
2724 
2725 	if (done) {
2726 		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2727 		ublk_batch_free_fcmd(fcmd);
2728 	}
2729 }
2730 
2731 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2732 {
2733 	struct ublk_batch_fetch_cmd *fcmd;
2734 	LIST_HEAD(fcmd_list);
2735 
2736 	spin_lock(&ubq->evts_lock);
2737 	ubq->force_abort = true;
2738 	list_splice_init(&ubq->fcmd_head, &fcmd_list);
2739 	fcmd = READ_ONCE(ubq->active_fcmd);
2740 	if (fcmd)
2741 		list_move(&fcmd->node, &ubq->fcmd_head);
2742 	spin_unlock(&ubq->evts_lock);
2743 
2744 	while (!list_empty(&fcmd_list)) {
2745 		fcmd = list_first_entry(&fcmd_list,
2746 				struct ublk_batch_fetch_cmd, node);
2747 		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2748 	}
2749 }
2750 
2751 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2752 				 unsigned int issue_flags)
2753 {
2754 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2755 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2756 	struct ublk_queue *ubq = pdu->ubq;
2757 
2758 	ublk_start_cancel(ubq->dev);
2759 
2760 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2761 }
2762 
2763 /*
2764  * The ublk char device won't be closed when calling cancel fn, so both
2765  * ublk device and queue are guaranteed to be live
2766  *
2767  * Two-stage cancel:
2768  *
2769  * - make every active uring_cmd done in ->cancel_fn()
2770  *
2771  * - aborting inflight ublk IO requests in ublk char device release handler,
2772  *   which depends on 1st stage because device can only be closed iff all
2773  *   uring_cmd are done
2774  *
2775  * Do _not_ try to acquire ub->mutex before all inflight requests are
2776  * aborted, otherwise deadlock may be caused.
2777  */
2778 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2779 		unsigned int issue_flags)
2780 {
2781 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2782 	struct ublk_queue *ubq = pdu->ubq;
2783 	struct task_struct *task;
2784 	struct ublk_io *io;
2785 
2786 	if (WARN_ON_ONCE(!ubq))
2787 		return;
2788 
2789 	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2790 		return;
2791 
2792 	task = io_uring_cmd_get_task(cmd);
2793 	io = &ubq->ios[pdu->tag];
2794 	if (WARN_ON_ONCE(task && task != io->task))
2795 		return;
2796 
2797 	ublk_start_cancel(ubq->dev);
2798 
2799 	WARN_ON_ONCE(io->cmd != cmd);
2800 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2801 }
2802 
2803 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2804 {
2805 	return ubq->nr_io_ready == ubq->q_depth;
2806 }
2807 
2808 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2809 {
2810 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2811 }
2812 
2813 static void ublk_cancel_queue(struct ublk_queue *ubq)
2814 {
2815 	int i;
2816 
2817 	if (ublk_support_batch_io(ubq)) {
2818 		ublk_batch_cancel_queue(ubq);
2819 		return;
2820 	}
2821 
2822 	for (i = 0; i < ubq->q_depth; i++)
2823 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2824 }
2825 
2826 /* Cancel all pending commands, must be called after del_gendisk() returns */
2827 static void ublk_cancel_dev(struct ublk_device *ub)
2828 {
2829 	int i;
2830 
2831 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2832 		ublk_cancel_queue(ublk_get_queue(ub, i));
2833 }
2834 
2835 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2836 {
2837 	bool *idle = data;
2838 
2839 	if (blk_mq_request_started(rq)) {
2840 		*idle = false;
2841 		return false;
2842 	}
2843 	return true;
2844 }
2845 
2846 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2847 {
2848 	bool idle;
2849 
2850 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2851 	while (true) {
2852 		idle = true;
2853 		blk_mq_tagset_busy_iter(&ub->tag_set,
2854 				ublk_check_inflight_rq, &idle);
2855 		if (idle)
2856 			break;
2857 		msleep(UBLK_REQUEUE_DELAY_MS);
2858 	}
2859 }
2860 
2861 static void ublk_force_abort_dev(struct ublk_device *ub)
2862 {
2863 	int i;
2864 
2865 	pr_devel("%s: force abort ub: dev_id %d state %s\n",
2866 			__func__, ub->dev_info.dev_id,
2867 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
2868 			"LIVE" : "QUIESCED");
2869 	blk_mq_quiesce_queue(ub->ub_disk->queue);
2870 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2871 		ublk_wait_tagset_rqs_idle(ub);
2872 
2873 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2874 		ublk_get_queue(ub, i)->force_abort = true;
2875 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2876 	/* We may have requeued some rqs in ublk_quiesce_queue() */
2877 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2878 }
2879 
2880 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2881 {
2882 	struct gendisk *disk;
2883 
2884 	/* Sync with ublk_abort_queue() by holding the lock */
2885 	spin_lock(&ub->lock);
2886 	disk = ub->ub_disk;
2887 	ub->dev_info.state = UBLK_S_DEV_DEAD;
2888 	ub->dev_info.ublksrv_pid = -1;
2889 	ub->ub_disk = NULL;
2890 	spin_unlock(&ub->lock);
2891 
2892 	return disk;
2893 }
2894 
2895 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2896 	__must_hold(&ub->mutex)
2897 {
2898 	struct gendisk *disk;
2899 
2900 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2901 		return;
2902 
2903 	if (ublk_nosrv_dev_should_queue_io(ub))
2904 		ublk_force_abort_dev(ub);
2905 	del_gendisk(ub->ub_disk);
2906 	disk = ublk_detach_disk(ub);
2907 	put_disk(disk);
2908 }
2909 
2910 static void ublk_stop_dev(struct ublk_device *ub)
2911 {
2912 	mutex_lock(&ub->mutex);
2913 	ublk_stop_dev_unlocked(ub);
2914 	mutex_unlock(&ub->mutex);
2915 	cancel_work_sync(&ub->partition_scan_work);
2916 	ublk_cancel_dev(ub);
2917 }
2918 
2919 static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
2920 {
2921 	/* UBLK_IO_FLAG_CANCELED can be cleared now */
2922 	spin_lock(&ubq->cancel_lock);
2923 	io->flags &= ~UBLK_IO_FLAG_CANCELED;
2924 	spin_unlock(&ubq->cancel_lock);
2925 }
2926 
2927 /* reset per-queue io flags */
2928 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
2929 {
2930 	spin_lock(&ubq->cancel_lock);
2931 	ubq->canceling = false;
2932 	spin_unlock(&ubq->cancel_lock);
2933 	ubq->fail_io = false;
2934 }
2935 
2936 /* device can only be started after all IOs are ready */
2937 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
2938 	struct ublk_io *io)
2939 	__must_hold(&ub->mutex)
2940 {
2941 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2942 
2943 	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2944 		ub->unprivileged_daemons = true;
2945 
2946 	ubq->nr_io_ready++;
2947 	ublk_reset_io_flags(ubq, io);
2948 
2949 	/* Check if this specific queue is now fully ready */
2950 	if (ublk_queue_ready(ubq)) {
2951 		ub->nr_queue_ready++;
2952 
2953 		/*
2954 		 * Reset queue flags as soon as this queue is ready.
2955 		 * This clears the canceling flag, allowing batch FETCH commands
2956 		 * to succeed during recovery without waiting for all queues.
2957 		 */
2958 		ublk_queue_reset_io_flags(ubq);
2959 	}
2960 
2961 	/* Check if all queues are ready */
2962 	if (ublk_dev_ready(ub)) {
2963 		/*
2964 		 * All queues ready - clear device-level canceling flag
2965 		 * and complete the recovery/initialization.
2966 		 */
2967 		mutex_lock(&ub->cancel_mutex);
2968 		ub->canceling = false;
2969 		mutex_unlock(&ub->cancel_mutex);
2970 		complete_all(&ub->completion);
2971 	}
2972 }
2973 
2974 static inline int ublk_check_cmd_op(u32 cmd_op)
2975 {
2976 	u32 ioc_type = _IOC_TYPE(cmd_op);
2977 
2978 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2979 		return -EOPNOTSUPP;
2980 
2981 	if (ioc_type != 'u' && ioc_type != 0)
2982 		return -EOPNOTSUPP;
2983 
2984 	return 0;
2985 }
2986 
2987 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2988 {
2989 	struct ublk_auto_buf_reg buf;
2990 
2991 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2992 
2993 	if (buf.reserved0 || buf.reserved1)
2994 		return -EINVAL;
2995 
2996 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2997 		return -EINVAL;
2998 	io->buf.auto_reg = buf;
2999 	return 0;
3000 }
3001 
3002 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
3003 				    struct io_uring_cmd *cmd,
3004 				    u16 *buf_idx)
3005 {
3006 	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3007 		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3008 
3009 		/*
3010 		 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3011 		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
3012 		 * `io_ring_ctx`.
3013 		 *
3014 		 * If this uring_cmd's io_ring_ctx isn't same with the
3015 		 * one for registering the buffer, it is ublk server's
3016 		 * responsibility for unregistering the buffer, otherwise
3017 		 * this ublk request gets stuck.
3018 		 */
3019 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3020 			*buf_idx = io->buf.auto_reg.index;
3021 	}
3022 }
3023 
3024 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3025 				    struct io_uring_cmd *cmd,
3026 				    u16 *buf_idx)
3027 {
3028 	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3029 	return ublk_set_auto_buf_reg(io, cmd);
3030 }
3031 
3032 /* Once we return, `io->req` can't be used any more */
3033 static inline struct request *
3034 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3035 {
3036 	struct request *req = io->req;
3037 
3038 	io->cmd = cmd;
3039 	io->flags |= UBLK_IO_FLAG_ACTIVE;
3040 	/* now this cmd slot is owned by ublk driver */
3041 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3042 
3043 	return req;
3044 }
3045 
3046 static inline int
3047 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3048 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3049 		   u16 *buf_idx)
3050 {
3051 	if (ublk_dev_support_auto_buf_reg(ub))
3052 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3053 
3054 	io->buf.addr = buf_addr;
3055 	return 0;
3056 }
3057 
3058 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3059 				    unsigned int issue_flags,
3060 				    struct ublk_queue *ubq, unsigned int tag)
3061 {
3062 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3063 
3064 	/*
3065 	 * Safe to refer to @ubq since ublk_queue won't be died until its
3066 	 * commands are completed
3067 	 */
3068 	pdu->ubq = ubq;
3069 	pdu->tag = tag;
3070 	io_uring_cmd_mark_cancelable(cmd, issue_flags);
3071 }
3072 
3073 static void ublk_io_release(void *priv)
3074 {
3075 	struct request *rq = priv;
3076 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3077 	struct ublk_io *io = &ubq->ios[rq->tag];
3078 
3079 	/*
3080 	 * task_registered_buffers may be 0 if buffers were registered off task
3081 	 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3082 	 */
3083 	if (current == io->task && io->task_registered_buffers)
3084 		io->task_registered_buffers--;
3085 	else
3086 		ublk_put_req_ref(io, rq);
3087 }
3088 
3089 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3090 				struct ublk_device *ub,
3091 				u16 q_id, u16 tag,
3092 				struct ublk_io *io,
3093 				unsigned int index, unsigned int issue_flags)
3094 {
3095 	struct request *req;
3096 	int ret;
3097 
3098 	if (!ublk_dev_support_zero_copy(ub))
3099 		return -EINVAL;
3100 
3101 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3102 	if (!req)
3103 		return -EINVAL;
3104 
3105 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3106 				      issue_flags);
3107 	if (ret) {
3108 		ublk_put_req_ref(io, req);
3109 		return ret;
3110 	}
3111 
3112 	return 0;
3113 }
3114 
3115 static int
3116 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3117 			    struct ublk_device *ub,
3118 			    u16 q_id, u16 tag, struct ublk_io *io,
3119 			    unsigned index, unsigned issue_flags)
3120 {
3121 	unsigned new_registered_buffers;
3122 	struct request *req = io->req;
3123 	int ret;
3124 
3125 	/*
3126 	 * Ensure there are still references for ublk_sub_req_ref() to release.
3127 	 * If not, fall back on the thread-safe buffer registration.
3128 	 */
3129 	new_registered_buffers = io->task_registered_buffers + 1;
3130 	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3131 		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3132 					    issue_flags);
3133 
3134 	if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3135 		return -EINVAL;
3136 
3137 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3138 				      issue_flags);
3139 	if (ret)
3140 		return ret;
3141 
3142 	io->task_registered_buffers = new_registered_buffers;
3143 	return 0;
3144 }
3145 
3146 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3147 				  const struct ublk_device *ub,
3148 				  unsigned int index, unsigned int issue_flags)
3149 {
3150 	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3151 		return -EINVAL;
3152 
3153 	return io_buffer_unregister_bvec(cmd, index, issue_flags);
3154 }
3155 
3156 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3157 {
3158 	if (ublk_dev_need_map_io(ub)) {
3159 		/*
3160 		 * FETCH_RQ has to provide IO buffer if NEED GET
3161 		 * DATA is not enabled
3162 		 */
3163 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3164 			return -EINVAL;
3165 	} else if (buf_addr) {
3166 		/* User copy requires addr to be unset */
3167 		return -EINVAL;
3168 	}
3169 	return 0;
3170 }
3171 
3172 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3173 			struct ublk_io *io, u16 q_id)
3174 {
3175 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3176 	if (ublk_dev_ready(ub))
3177 		return -EBUSY;
3178 
3179 	/* allow each command to be FETCHed at most once */
3180 	if (io->flags & UBLK_IO_FLAG_ACTIVE)
3181 		return -EINVAL;
3182 
3183 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3184 
3185 	ublk_fill_io_cmd(io, cmd);
3186 
3187 	if (ublk_dev_support_batch_io(ub))
3188 		WRITE_ONCE(io->task, NULL);
3189 	else
3190 		WRITE_ONCE(io->task, get_task_struct(current));
3191 
3192 	return 0;
3193 }
3194 
3195 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3196 		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
3197 {
3198 	int ret;
3199 
3200 	/*
3201 	 * When handling FETCH command for setting up ublk uring queue,
3202 	 * ub->mutex is the innermost lock, and we won't block for handling
3203 	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3204 	 */
3205 	mutex_lock(&ub->mutex);
3206 	ret = __ublk_fetch(cmd, ub, io, q_id);
3207 	if (!ret)
3208 		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3209 	if (!ret)
3210 		ublk_mark_io_ready(ub, q_id, io);
3211 	mutex_unlock(&ub->mutex);
3212 	return ret;
3213 }
3214 
3215 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3216 				       struct ublk_io *io, __u64 buf_addr)
3217 {
3218 	struct request *req = io->req;
3219 
3220 	if (ublk_dev_need_map_io(ub)) {
3221 		/*
3222 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3223 		 * NEED GET DATA is not enabled or it is Read IO.
3224 		 */
3225 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3226 					req_op(req) == REQ_OP_READ))
3227 			return -EINVAL;
3228 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3229 		/*
3230 		 * User copy requires addr to be unset when command is
3231 		 * not zone append
3232 		 */
3233 		return -EINVAL;
3234 	}
3235 
3236 	return 0;
3237 }
3238 
3239 static bool ublk_need_complete_req(const struct ublk_device *ub,
3240 				   struct ublk_io *io)
3241 {
3242 	if (ublk_dev_need_req_ref(ub))
3243 		return ublk_sub_req_ref(io);
3244 	return true;
3245 }
3246 
3247 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3248 			  struct request *req)
3249 {
3250 	/*
3251 	 * We have handled UBLK_IO_NEED_GET_DATA command,
3252 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3253 	 * do the copy work.
3254 	 */
3255 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3256 	/* update iod->addr because ublksrv may have passed a new io buffer */
3257 	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3258 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3259 			__func__, ubq->q_id, req->tag, io->flags,
3260 			ublk_get_iod(ubq, req->tag)->addr);
3261 
3262 	return ublk_start_io(ubq, req, io);
3263 }
3264 
/*
 * Handle one per-io uring_cmd (FETCH, COMMIT_AND_FETCH, NEED_GET_DATA,
 * REGISTER/UNREGISTER_IO_BUF) issued on the ublk char device.
 *
 * Return: -EIOCBQUEUED when @cmd has been stored in the io slot and will
 * be completed later, UBLK_IO_RES_OK when a NEED_GET_DATA command started
 * the io successfully, the result of the buffer (un)register helpers for
 * those commands, or a negative errno on validation failure.
 */
static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	/* May point to userspace-mapped memory */
	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
							       struct ublksrv_io_cmd);
	u16 buf_idx = UBLK_INVALID_BUF_IDX;
	struct ublk_device *ub = cmd->file->private_data;
	struct ublk_queue *ubq;
	struct ublk_io *io = NULL;
	u32 cmd_op = cmd->cmd_op;
	/* each command field is read exactly once into a local copy */
	u16 q_id = READ_ONCE(ub_src->q_id);
	u16 tag = READ_ONCE(ub_src->tag);
	s32 result = READ_ONCE(ub_src->result);
	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
	struct request *req;
	int ret;
	bool compl;

	/* the unlocked case is bounced to task work by ublk_ch_uring_cmd() */
	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);

	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
			__func__, cmd->cmd_op, q_id, tag, result);

	ret = ublk_check_cmd_op(cmd_op);
	if (ret)
		goto out;

	/*
	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
	 * so no need to validate the q_id, tag, or task
	 */
	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);

	/* default error for every validation failure below */
	ret = -EINVAL;
	if (q_id >= ub->dev_info.nr_hw_queues)
		goto out;

	ubq = ublk_get_queue(ub, q_id);

	if (tag >= ub->dev_info.queue_depth)
		goto out;

	io = &ubq->ios[tag];
	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
		ret = ublk_check_fetch_buf(ub, addr);
		if (ret)
			goto out;
		ret = ublk_fetch(cmd, ub, io, addr, q_id);
		if (ret)
			goto out;

		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
		return -EIOCBQUEUED;
	}

	if (READ_ONCE(io->task) != current) {
		/*
		 * ublk_register_io_buf() accesses only the io's refcount,
		 * so can be handled on any task
		 */
		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
						    addr, issue_flags);

		/* every other command issued off io->task fails with -EINVAL */
		goto out;
	}

	/* there is pending io cmd, something must be wrong */
	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
		ret = -EBUSY;
		goto out;
	}

	/*
	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
	 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
	 */
	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
		goto out;

	switch (_IOC_NR(cmd_op)) {
	case UBLK_IO_REGISTER_IO_BUF:
		/* on io->task: use the variant that skips the reference get */
		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
						   issue_flags);
	case UBLK_IO_COMMIT_AND_FETCH_REQ:
		ret = ublk_check_commit_and_fetch(ub, io, addr);
		if (ret)
			goto out;
		io->res = result;
		/* io->req must not be used after this; use @req instead */
		req = ublk_fill_io_cmd(io, cmd);
		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
		/* buf_idx is only changed from the invalid marker by the helper */
		if (buf_idx != UBLK_INVALID_BUF_IDX)
			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
		compl = ublk_need_complete_req(ub, io);

		/* for zone append, @addr carries zone_append_lba (see above) */
		if (req_op(req) == REQ_OP_ZONE_APPEND)
			req->__sector = addr;
		if (compl)
			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);

		/* a buffer config error is reported only after the commit part ran */
		if (ret)
			goto out;
		break;
	case UBLK_IO_NEED_GET_DATA:
		/*
		 * ublk_get_data() may fail and fallback to requeue, so keep
		 * uring_cmd active first and prepare for handling new requeued
		 * request
		 */
		req = ublk_fill_io_cmd(io, cmd);
		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
		WARN_ON_ONCE(ret);
		if (likely(ublk_get_data(ubq, io, req))) {
			__ublk_prep_compl_io_cmd(io, req);
			return UBLK_IO_RES_OK;
		}
		break;
	default:
		goto out;
	}
	/* the command stays attached to the io slot; make it cancelable */
	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
	return -EIOCBQUEUED;

 out:
	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
			__func__, cmd_op, tag, ret, io ? io->flags : 0);
	return ret;
}
3397 
3398 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3399 		u16 q_id, u16 tag, struct ublk_io *io)
3400 {
3401 	struct request *req;
3402 
3403 	/*
3404 	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3405 	 * which would overwrite it with io->cmd
3406 	 */
3407 	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3408 	if (!req)
3409 		return NULL;
3410 
3411 	if (!ublk_get_req_ref(io))
3412 		return NULL;
3413 
3414 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3415 		goto fail_put;
3416 
3417 	if (!ublk_rq_has_data(req))
3418 		goto fail_put;
3419 
3420 	return req;
3421 fail_put:
3422 	ublk_put_req_ref(io, req);
3423 	return NULL;
3424 }
3425 
3426 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3427 {
3428 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3429 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3430 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3431 
3432 	if (ret != -EIOCBQUEUED)
3433 		io_uring_cmd_done(cmd, ret, issue_flags);
3434 }
3435 
3436 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3437 {
3438 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3439 		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3440 		return 0;
3441 	}
3442 
3443 	/* well-implemented server won't run into unlocked */
3444 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3445 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3446 		return -EIOCBQUEUED;
3447 	}
3448 
3449 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
3450 }
3451 
3452 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3453 					const struct ublk_elem_header *elem)
3454 {
3455 	const void *buf = elem;
3456 
3457 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3458 		return *(const __u64 *)(buf + sizeof(*elem));
3459 	return 0;
3460 }
3461 
3462 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3463 					const struct ublk_elem_header *elem)
3464 {
3465 	const void *buf = elem;
3466 
3467 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3468 		return *(const __u64 *)(buf + sizeof(*elem) +
3469 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3470 	return -1;
3471 }
3472 
3473 static struct ublk_auto_buf_reg
3474 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3475 			const struct ublk_elem_header *elem)
3476 {
3477 	struct ublk_auto_buf_reg reg = {
3478 		.index = elem->buf_index,
3479 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3480 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3481 	};
3482 
3483 	return reg;
3484 }
3485 
/*
 * Size of the staging buffer used while walking a batch command buffer.
 *
 * 48 can hold any type of buffer element (8, 16 and 24 bytes) because
 * it is the least common multiple (LCM) of 8, 16 and 24, so a buffer
 * that is a multiple of 48 bytes always holds a whole number of
 * elements, whichever element size is in use.
 */
#define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
/* Cursor over a user-space array of batch elements */
struct ublk_batch_io_iter {
	/* user-space address of the first element */
	void __user *uaddr;
	/* bytes consumed so far / total bytes to walk */
	unsigned done, total;
	/* size in bytes of a single element; used as the walking stride */
	unsigned char elem_bytes;
	/* copy to this buffer from user space */
	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
};
3498 
3499 static inline int
3500 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3501 		    struct ublk_batch_io_iter *iter,
3502 		    const struct ublk_batch_io_data *data,
3503 		    unsigned bytes,
3504 		    int (*cb)(struct ublk_queue *q,
3505 			    const struct ublk_batch_io_data *data,
3506 			    const struct ublk_elem_header *elem))
3507 {
3508 	unsigned int i;
3509 	int ret = 0;
3510 
3511 	for (i = 0; i < bytes; i += iter->elem_bytes) {
3512 		const struct ublk_elem_header *elem =
3513 			(const struct ublk_elem_header *)&iter->buf[i];
3514 
3515 		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3516 			ret = -EINVAL;
3517 			break;
3518 		}
3519 
3520 		ret = cb(ubq, data, elem);
3521 		if (unlikely(ret))
3522 			break;
3523 	}
3524 
3525 	iter->done += i;
3526 	return ret;
3527 }
3528 
3529 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3530 			     const struct ublk_batch_io_data *data,
3531 			     int (*cb)(struct ublk_queue *q,
3532 				     const struct ublk_batch_io_data *data,
3533 				     const struct ublk_elem_header *elem))
3534 {
3535 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3536 	int ret = 0;
3537 
3538 	while (iter->done < iter->total) {
3539 		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3540 
3541 		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3542 			pr_warn("ublk%d: read batch cmd buffer failed\n",
3543 					data->ub->dev_info.dev_id);
3544 			return -EFAULT;
3545 		}
3546 
3547 		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3548 		if (ret)
3549 			return ret;
3550 	}
3551 	return 0;
3552 }
3553 
/*
 * Walk callback that undoes the preparation of one io: the ready
 * accounting is rolled back and the io's flags are cleared.
 *
 * Always returns 0.
 */
static int ublk_batch_unprep_io(struct ublk_queue *ubq,
				const struct ublk_batch_io_data *data,
				const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];

	/*
	 * If queue was ready before this decrement, it won't be anymore,
	 * so we need to decrement the queue ready count and restore the
	 * canceling flag to prevent new requests from being queued.
	 */
	if (ublk_queue_ready(ubq)) {
		data->ub->nr_queue_ready--;
		spin_lock(&ubq->cancel_lock);
		ubq->canceling = true;
		spin_unlock(&ubq->cancel_lock);
	}
	/* must come after the readiness check above, which depends on it */
	ubq->nr_io_ready--;

	/* reset the slot's flags under the io lock */
	ublk_io_lock(io);
	io->flags = 0;
	ublk_io_unlock(io);
	return 0;
}
3578 
3579 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3580 				       const struct ublk_batch_io_data *data)
3581 {
3582 	int ret;
3583 
3584 	/* Re-process only what we've already processed, starting from beginning */
3585 	iter->total = iter->done;
3586 	iter->done = 0;
3587 
3588 	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3589 	WARN_ON_ONCE(ret);
3590 }
3591 
3592 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3593 			      const struct ublk_batch_io_data *data,
3594 			      const struct ublk_elem_header *elem)
3595 {
3596 	struct ublk_io *io = &ubq->ios[elem->tag];
3597 	const struct ublk_batch_io *uc = &data->header;
3598 	union ublk_io_buf buf = { 0 };
3599 	int ret;
3600 
3601 	if (ublk_dev_support_auto_buf_reg(data->ub))
3602 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3603 	else if (ublk_dev_need_map_io(data->ub)) {
3604 		buf.addr = ublk_batch_buf_addr(uc, elem);
3605 
3606 		ret = ublk_check_fetch_buf(data->ub, buf.addr);
3607 		if (ret)
3608 			return ret;
3609 	}
3610 
3611 	ublk_io_lock(io);
3612 	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3613 	if (!ret)
3614 		io->buf = buf;
3615 	ublk_io_unlock(io);
3616 
3617 	if (!ret)
3618 		ublk_mark_io_ready(data->ub, ubq->q_id, io);
3619 
3620 	return ret;
3621 }
3622 
3623 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3624 {
3625 	const struct ublk_batch_io *uc = &data->header;
3626 	struct io_uring_cmd *cmd = data->cmd;
3627 	struct ublk_batch_io_iter iter = {
3628 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3629 		.total = uc->nr_elem * uc->elem_bytes,
3630 		.elem_bytes = uc->elem_bytes,
3631 	};
3632 	int ret;
3633 
3634 	mutex_lock(&data->ub->mutex);
3635 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3636 
3637 	if (ret && iter.done)
3638 		ublk_batch_revert_prep_cmd(&iter, data);
3639 	mutex_unlock(&data->ub->mutex);
3640 	return ret;
3641 }
3642 
3643 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3644 				      struct ublk_io *io,
3645 				      union ublk_io_buf *buf)
3646 {
3647 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3648 		return -EBUSY;
3649 
3650 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3651 	if (ublk_need_map_io(ubq) && !buf->addr)
3652 		return -EINVAL;
3653 	return 0;
3654 }
3655 
/*
 * Walk callback for the commit command: commit the result of one io and
 * hand the slot back to the driver.
 *
 * The io state is updated under the io lock; buffer unregistration, the
 * zone-append sector update and request completion run after the lock is
 * dropped.  Returns 0 on success or the error from
 * ublk_batch_commit_io_check().
 */
static int ublk_batch_commit_io(struct ublk_queue *ubq,
				const struct ublk_batch_io_data *data,
				const struct ublk_elem_header *elem)
{
	struct ublk_io *io = &ubq->ios[elem->tag];
	const struct ublk_batch_io *uc = &data->header;
	u16 buf_idx = UBLK_INVALID_BUF_IDX;
	union ublk_io_buf buf = { 0 };
	struct request *req = NULL;
	bool auto_reg = false;
	bool compl = false;
	int ret;

	/* build the buffer description for the device's buffer mode */
	if (ublk_dev_support_auto_buf_reg(data->ub)) {
		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
		auto_reg = true;
	} else if (ublk_dev_need_map_io(data->ub))
		buf.addr = ublk_batch_buf_addr(uc, elem);

	ublk_io_lock(io);
	ret = ublk_batch_commit_io_check(ubq, io, &buf);
	if (!ret) {
		io->res = elem->result;
		io->buf = buf;
		/* io->req must not be used after this; use @req instead */
		req = ublk_fill_io_cmd(io, data->cmd);

		if (auto_reg)
			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
		compl = ublk_need_complete_req(data->ub, io);
	}
	ublk_io_unlock(io);

	if (unlikely(ret)) {
		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
			__func__, data->ub->dev_info.dev_id, ubq->q_id,
			elem->tag, ret);
		return ret;
	}

	/* buf_idx stays at the invalid marker unless the clear helper set it */
	if (buf_idx != UBLK_INVALID_BUF_IDX)
		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
	if (req_op(req) == REQ_OP_ZONE_APPEND)
		req->__sector = ublk_batch_zone_lba(uc, elem);
	if (compl)
		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
	return 0;
}
3703 
3704 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3705 {
3706 	const struct ublk_batch_io *uc = &data->header;
3707 	struct io_uring_cmd *cmd = data->cmd;
3708 	struct ublk_batch_io_iter iter = {
3709 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3710 		.total = uc->nr_elem * uc->elem_bytes,
3711 		.elem_bytes = uc->elem_bytes,
3712 	};
3713 	DEFINE_IO_COMP_BATCH(iob);
3714 	int ret;
3715 
3716 	data->iob = &iob;
3717 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3718 
3719 	if (iob.complete)
3720 		iob.complete(&iob);
3721 
3722 	return iter.done == 0 ? ret : iter.done;
3723 }
3724 
3725 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3726 {
3727 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
3728 
3729 	if (uc->flags & ~UBLK_BATCH_F_ALL)
3730 		return -EINVAL;
3731 
3732 	/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
3733 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3734 			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3735 		return -EINVAL;
3736 
3737 	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3738 		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3739 	if (uc->elem_bytes != elem_bytes)
3740 		return -EINVAL;
3741 	return 0;
3742 }
3743 
3744 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3745 {
3746 	const struct ublk_batch_io *uc = &data->header;
3747 
3748 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3749 		return -EINVAL;
3750 
3751 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
3752 		return -E2BIG;
3753 
3754 	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3755 			!ublk_dev_is_zoned(data->ub))
3756 		return -EINVAL;
3757 
3758 	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3759 			!ublk_dev_need_map_io(data->ub))
3760 		return -EINVAL;
3761 
3762 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3763 			!ublk_dev_support_auto_buf_reg(data->ub))
3764 		return -EINVAL;
3765 
3766 	return ublk_check_batch_cmd_flags(uc);
3767 }
3768 
/*
 * Queue @fcmd on the queue's list of fetch commands and, if a fetch
 * command can be acquired, kick off batch dispatch.
 *
 * Returns -ENODEV (after freeing @fcmd) when the queue is being aborted
 * or canceled, otherwise -EIOCBQUEUED.
 */
static int ublk_batch_attach(struct ublk_queue *ubq,
			     struct ublk_batch_io_data *data,
			     struct ublk_batch_fetch_cmd *fcmd)
{
	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
	bool free = false;
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);

	/* the abort/cancel check and the list insertion happen under evts_lock */
	spin_lock(&ubq->evts_lock);
	if (unlikely(ubq->force_abort || ubq->canceling)) {
		free = true;
	} else {
		list_add_tail(&fcmd->node, &ubq->fcmd_head);
		new_fcmd = __ublk_acquire_fcmd(ubq);
	}
	spin_unlock(&ubq->evts_lock);

	/* freeing is done after the lock has been dropped */
	if (unlikely(free)) {
		ublk_batch_free_fcmd(fcmd);
		return -ENODEV;
	}

	/* record queue and fetch command in the pdu, then allow cancel */
	pdu->ubq = ubq;
	pdu->fcmd = fcmd;
	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);

	if (!new_fcmd)
		goto out;

	/*
	 * If the two fetch commands are originated from same io_ring_ctx,
	 * run batch dispatch directly. Otherwise, schedule task work for
	 * doing it.
	 */
	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
			io_uring_cmd_ctx_handle(fcmd->cmd)) {
		data->cmd = new_fcmd->cmd;
		ublk_batch_dispatch(ubq, data, new_fcmd);
	} else {
		io_uring_cmd_complete_in_task(new_fcmd->cmd,
				ublk_batch_tw_cb);
	}
out:
	return -EIOCBQUEUED;
}
3814 
3815 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3816 {
3817 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3818 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3819 
3820 	if (!fcmd)
3821 		return -ENOMEM;
3822 
3823 	return ublk_batch_attach(ubq, data, fcmd);
3824 }
3825 
3826 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3827 {
3828 	const struct ublk_batch_io *uc = &data->header;
3829 
3830 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3831 		return -EINVAL;
3832 
3833 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3834 		return -EINVAL;
3835 
3836 	if (uc->elem_bytes != sizeof(__u16))
3837 		return -EINVAL;
3838 
3839 	if (uc->flags != 0)
3840 		return -EINVAL;
3841 
3842 	return 0;
3843 }
3844 
3845 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3846 				     unsigned int issue_flags)
3847 {
3848 	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3849 							       struct ublksrv_io_cmd);
3850 	struct ublk_device *ub = cmd->file->private_data;
3851 	unsigned tag = READ_ONCE(ub_cmd->tag);
3852 	unsigned q_id = READ_ONCE(ub_cmd->q_id);
3853 	unsigned index = READ_ONCE(ub_cmd->addr);
3854 	struct ublk_queue *ubq;
3855 	struct ublk_io *io;
3856 
3857 	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3858 		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3859 
3860 	if (q_id >= ub->dev_info.nr_hw_queues)
3861 		return -EINVAL;
3862 
3863 	if (tag >= ub->dev_info.queue_depth)
3864 		return -EINVAL;
3865 
3866 	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3867 		return -EOPNOTSUPP;
3868 
3869 	ubq = ublk_get_queue(ub, q_id);
3870 	io = &ubq->ios[tag];
3871 	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3872 			issue_flags);
3873 }
3874 
3875 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3876 				       unsigned int issue_flags)
3877 {
3878 	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3879 							  struct ublk_batch_io);
3880 	struct ublk_device *ub = cmd->file->private_data;
3881 	struct ublk_batch_io_data data = {
3882 		.ub  = ub,
3883 		.cmd = cmd,
3884 		.header = (struct ublk_batch_io) {
3885 			.q_id = READ_ONCE(uc->q_id),
3886 			.flags = READ_ONCE(uc->flags),
3887 			.nr_elem = READ_ONCE(uc->nr_elem),
3888 			.elem_bytes = READ_ONCE(uc->elem_bytes),
3889 		},
3890 		.issue_flags = issue_flags,
3891 	};
3892 	u32 cmd_op = cmd->cmd_op;
3893 	int ret = -EINVAL;
3894 
3895 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3896 		ublk_batch_cancel_fn(cmd, issue_flags);
3897 		return 0;
3898 	}
3899 
3900 	switch (cmd_op) {
3901 	case UBLK_U_IO_PREP_IO_CMDS:
3902 		ret = ublk_check_batch_cmd(&data);
3903 		if (ret)
3904 			goto out;
3905 		ret = ublk_handle_batch_prep_cmd(&data);
3906 		break;
3907 	case UBLK_U_IO_COMMIT_IO_CMDS:
3908 		ret = ublk_check_batch_cmd(&data);
3909 		if (ret)
3910 			goto out;
3911 		ret = ublk_handle_batch_commit_cmd(&data);
3912 		break;
3913 	case UBLK_U_IO_FETCH_IO_CMDS:
3914 		ret = ublk_validate_batch_fetch_cmd(&data);
3915 		if (ret)
3916 			goto out;
3917 		ret = ublk_handle_batch_fetch_cmd(&data);
3918 		break;
3919 	default:
3920 		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
3921 		break;
3922 	}
3923 out:
3924 	return ret;
3925 }
3926 
3927 static inline bool ublk_check_ubuf_dir(const struct request *req,
3928 		int ubuf_dir)
3929 {
3930 	/* copy ubuf to request pages */
3931 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
3932 	    ubuf_dir == ITER_SOURCE)
3933 		return true;
3934 
3935 	/* copy request pages to ubuf */
3936 	if ((req_op(req) == REQ_OP_WRITE ||
3937 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
3938 	    ubuf_dir == ITER_DEST)
3939 		return true;
3940 
3941 	return false;
3942 }
3943 
/*
 * Common implementation of read_iter/write_iter on the ublk char device:
 * copy between the user buffer in @iter and the data (or integrity data)
 * of the request selected by the file position.
 *
 * The file position encodes tag, hardware queue, buffer offset and an
 * integrity flag.  @dir is ITER_DEST for read_iter and ITER_SOURCE for
 * write_iter.
 *
 * Return: the result of the copy helper, -EACCES if the iterator is not
 * user backed, the device is dead, user copy is not supported or the
 * direction does not fit the request, -EINVAL for a bad position or when
 * no usable request exists.
 */
static ssize_t
ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
{
	struct ublk_device *ub = iocb->ki_filp->private_data;
	struct ublk_queue *ubq;
	struct request *req;
	struct ublk_io *io;
	unsigned data_len;
	bool is_integrity;
	bool on_daemon;
	size_t buf_off;
	u16 tag, q_id;
	ssize_t ret;

	if (!user_backed_iter(iter))
		return -EACCES;

	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
		return -EACCES;

	/* decode the file position into its components */
	tag = ublk_pos_to_tag(iocb->ki_pos);
	q_id = ublk_pos_to_hwq(iocb->ki_pos);
	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);

	/* integrity copies are only valid on devices that support integrity */
	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
		return -EINVAL;

	if (q_id >= ub->dev_info.nr_hw_queues)
		return -EINVAL;

	ubq = ublk_get_queue(ub, q_id);
	if (!ublk_dev_support_user_copy(ub))
		return -EACCES;

	if (tag >= ub->dev_info.queue_depth)
		return -EINVAL;

	io = &ubq->ios[tag];
	on_daemon = current == READ_ONCE(io->task);
	if (on_daemon) {
		/* On daemon, io can't be completed concurrently, so skip ref */
		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			return -EINVAL;

		req = io->req;
		if (!ublk_rq_has_data(req))
			return -EINVAL;
	} else {
		/* off daemon: holds a request reference until the out label */
		req = __ublk_check_and_get_req(ub, q_id, tag, io);
		if (!req)
			return -EINVAL;
	}

	/* upper bound for the buffer offset, depending on the copy type */
	if (is_integrity) {
		struct blk_integrity *bi = &req->q->limits.integrity;

		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
	} else {
		data_len = blk_rq_bytes(req);
	}
	if (buf_off > data_len) {
		ret = -EINVAL;
		goto out;
	}

	if (!ublk_check_ubuf_dir(req, dir)) {
		ret = -EACCES;
		goto out;
	}

	if (is_integrity)
		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
	else
		ret = ublk_copy_user_pages(req, buf_off, iter, dir);

out:
	/* drop the reference taken on the off-daemon path */
	if (!on_daemon)
		ublk_put_req_ref(io, req);
	return ret;
}
4025 
4026 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4027 {
4028 	return ublk_user_copy(iocb, to, ITER_DEST);
4029 }
4030 
4031 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4032 {
4033 	return ublk_user_copy(iocb, from, ITER_SOURCE);
4034 }
4035 
4036 static const struct file_operations ublk_ch_fops = {
4037 	.owner = THIS_MODULE,
4038 	.open = ublk_ch_open,
4039 	.release = ublk_ch_release,
4040 	.read_iter = ublk_ch_read_iter,
4041 	.write_iter = ublk_ch_write_iter,
4042 	.uring_cmd = ublk_ch_uring_cmd,
4043 	.mmap = ublk_ch_mmap,
4044 };
4045 
4046 static const struct file_operations ublk_ch_batch_io_fops = {
4047 	.owner = THIS_MODULE,
4048 	.open = ublk_ch_open,
4049 	.release = ublk_ch_release,
4050 	.read_iter = ublk_ch_read_iter,
4051 	.write_iter = ublk_ch_write_iter,
4052 	.uring_cmd = ublk_ch_batch_io_uring_cmd,
4053 	.mmap = ublk_ch_mmap,
4054 };
4055 
4056 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4057 {
4058 	int size, i;
4059 
4060 	size = ublk_queue_cmd_buf_size(ub);
4061 
4062 	for (i = 0; i < ubq->q_depth; i++) {
4063 		struct ublk_io *io = &ubq->ios[i];
4064 		if (io->task)
4065 			put_task_struct(io->task);
4066 		WARN_ON_ONCE(refcount_read(&io->ref));
4067 		WARN_ON_ONCE(io->task_registered_buffers);
4068 	}
4069 
4070 	if (ubq->io_cmd_buf)
4071 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4072 
4073 	if (ublk_dev_support_batch_io(ub))
4074 		ublk_io_evts_deinit(ubq);
4075 
4076 	kvfree(ubq);
4077 }
4078 
4079 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4080 {
4081 	struct ublk_queue *ubq = ub->queues[q_id];
4082 
4083 	if (!ubq)
4084 		return;
4085 
4086 	__ublk_deinit_queue(ub, ubq);
4087 	ub->queues[q_id] = NULL;
4088 }
4089 
4090 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4091 {
4092 	unsigned int cpu;
4093 
4094 	/* Find first CPU mapped to this queue */
4095 	for_each_possible_cpu(cpu) {
4096 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4097 			return cpu_to_node(cpu);
4098 	}
4099 
4100 	return NUMA_NO_NODE;
4101 }
4102 
4103 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4104 {
4105 	int depth = ub->dev_info.queue_depth;
4106 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4107 	struct ublk_queue *ubq;
4108 	struct page *page;
4109 	int numa_node;
4110 	int size, i, ret;
4111 
4112 	/* Determine NUMA node based on queue's CPU affinity */
4113 	numa_node = ublk_get_queue_numa_node(ub, q_id);
4114 
4115 	/* Allocate queue structure on local NUMA node */
4116 	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4117 			    numa_node);
4118 	if (!ubq)
4119 		return -ENOMEM;
4120 
4121 	spin_lock_init(&ubq->cancel_lock);
4122 	ubq->flags = ub->dev_info.flags;
4123 	ubq->q_id = q_id;
4124 	ubq->q_depth = depth;
4125 	size = ublk_queue_cmd_buf_size(ub);
4126 
4127 	/* Allocate I/O command buffer on local NUMA node */
4128 	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4129 	if (!page) {
4130 		kvfree(ubq);
4131 		return -ENOMEM;
4132 	}
4133 	ubq->io_cmd_buf = page_address(page);
4134 
4135 	for (i = 0; i < ubq->q_depth; i++)
4136 		spin_lock_init(&ubq->ios[i].lock);
4137 
4138 	if (ublk_dev_support_batch_io(ub)) {
4139 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4140 		if (ret)
4141 			goto fail;
4142 		INIT_LIST_HEAD(&ubq->fcmd_head);
4143 	}
4144 	ub->queues[q_id] = ubq;
4145 	ubq->dev = ub;
4146 
4147 	return 0;
4148 fail:
4149 	__ublk_deinit_queue(ub, ubq);
4150 	return ret;
4151 }
4152 
4153 static void ublk_deinit_queues(struct ublk_device *ub)
4154 {
4155 	int i;
4156 
4157 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4158 		ublk_deinit_queue(ub, i);
4159 }
4160 
4161 static int ublk_init_queues(struct ublk_device *ub)
4162 {
4163 	int i, ret;
4164 
4165 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4166 		ret = ublk_init_queue(ub, i);
4167 		if (ret)
4168 			goto fail;
4169 	}
4170 
4171 	init_completion(&ub->completion);
4172 	return 0;
4173 
4174  fail:
4175 	ublk_deinit_queues(ub);
4176 	return ret;
4177 }
4178 
4179 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4180 {
4181 	int i = idx;
4182 	int err;
4183 
4184 	spin_lock(&ublk_idr_lock);
4185 	/* allocate id, if @id >= 0, we're requesting that specific id */
4186 	if (i >= 0) {
4187 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4188 		if (err == -ENOSPC)
4189 			err = -EEXIST;
4190 	} else {
4191 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4192 				GFP_NOWAIT);
4193 	}
4194 	spin_unlock(&ublk_idr_lock);
4195 
4196 	if (err >= 0)
4197 		ub->ub_number = err;
4198 
4199 	return err;
4200 }
4201 
/*
 * Drop @ub's device number from ublk_index_idr and wake everyone sleeping on
 * ublk_idr_wq (ublk_ctrl_del_dev() waits there until the number is gone).
 */
static void ublk_free_dev_number(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	/* wake while still holding the lock, after the entry has been removed */
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);
}
4209 
/*
 * Release callback of the ublk char device (installed as dev->release by
 * ublk_add_chdev()): frees the tag set, the queues, the device number and
 * finally the ublk_device that embeds @dev.
 */
static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	blk_mq_free_tag_set(&ub->tag_set);
	ublk_deinit_queues(ub);
	ublk_free_dev_number(ub);
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	/* @dev lives inside @ub, so it must not be touched after this */
	kfree(ub);
}
4221 
4222 static int ublk_add_chdev(struct ublk_device *ub)
4223 {
4224 	struct device *dev = &ub->cdev_dev;
4225 	int minor = ub->ub_number;
4226 	int ret;
4227 
4228 	dev->parent = ublk_misc.this_device;
4229 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4230 	dev->class = &ublk_chr_class;
4231 	dev->release = ublk_cdev_rel;
4232 	device_initialize(dev);
4233 
4234 	ret = dev_set_name(dev, "ublkc%d", minor);
4235 	if (ret)
4236 		goto fail;
4237 
4238 	if (ublk_dev_support_batch_io(ub))
4239 		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4240 	else
4241 		cdev_init(&ub->cdev, &ublk_ch_fops);
4242 	ret = cdev_device_add(&ub->cdev, dev);
4243 	if (ret)
4244 		goto fail;
4245 
4246 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4247 		unprivileged_ublks_added++;
4248 	return 0;
4249  fail:
4250 	put_device(dev);
4251 	return ret;
4252 }
4253 
4254 /* align max io buffer size with PAGE_SIZE */
4255 static void ublk_align_max_io_size(struct ublk_device *ub)
4256 {
4257 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4258 
4259 	ub->dev_info.max_io_buf_bytes =
4260 		round_down(max_io_bytes, PAGE_SIZE);
4261 }
4262 
4263 static int ublk_add_tag_set(struct ublk_device *ub)
4264 {
4265 	if (ublk_dev_support_batch_io(ub))
4266 		ub->tag_set.ops = &ublk_batch_mq_ops;
4267 	else
4268 		ub->tag_set.ops = &ublk_mq_ops;
4269 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4270 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4271 	ub->tag_set.numa_node = NUMA_NO_NODE;
4272 	ub->tag_set.driver_data = ub;
4273 	return blk_mq_alloc_tag_set(&ub->tag_set);
4274 }
4275 
4276 static void ublk_remove(struct ublk_device *ub)
4277 {
4278 	bool unprivileged;
4279 
4280 	ublk_stop_dev(ub);
4281 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
4282 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4283 	ublk_put_device(ub);
4284 
4285 	if (unprivileged)
4286 		unprivileged_ublks_added--;
4287 }
4288 
4289 static struct ublk_device *ublk_get_device_from_id(int idx)
4290 {
4291 	struct ublk_device *ub = NULL;
4292 
4293 	if (idx < 0)
4294 		return NULL;
4295 
4296 	spin_lock(&ublk_idr_lock);
4297 	ub = idr_find(&ublk_index_idr, idx);
4298 	if (ub)
4299 		ub = ublk_get_device(ub);
4300 	spin_unlock(&ublk_idr_lock);
4301 
4302 	return ub;
4303 }
4304 
4305 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4306 {
4307 	rcu_read_lock();
4308 	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4309 	rcu_read_unlock();
4310 
4311 	return ub->ublksrv_tgid == ublksrv_pid;
4312 }
4313 
4314 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4315 		const struct ublksrv_ctrl_cmd *header)
4316 {
4317 	const struct ublk_param_basic *p = &ub->params.basic;
4318 	int ublksrv_pid = (int)header->data[0];
4319 	struct queue_limits lim = {
4320 		.logical_block_size	= 1 << p->logical_bs_shift,
4321 		.physical_block_size	= 1 << p->physical_bs_shift,
4322 		.io_min			= 1 << p->io_min_shift,
4323 		.io_opt			= 1 << p->io_opt_shift,
4324 		.max_hw_sectors		= p->max_sectors,
4325 		.chunk_sectors		= p->chunk_sectors,
4326 		.virt_boundary_mask	= p->virt_boundary_mask,
4327 		.max_segments		= USHRT_MAX,
4328 		.max_segment_size	= UINT_MAX,
4329 		.dma_alignment		= 3,
4330 	};
4331 	struct gendisk *disk;
4332 	int ret = -EINVAL;
4333 
4334 	if (ublksrv_pid <= 0)
4335 		return -EINVAL;
4336 	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4337 		return -EINVAL;
4338 
4339 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4340 		const struct ublk_param_discard *pd = &ub->params.discard;
4341 
4342 		lim.discard_alignment = pd->discard_alignment;
4343 		lim.discard_granularity = pd->discard_granularity;
4344 		lim.max_hw_discard_sectors = pd->max_discard_sectors;
4345 		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4346 		lim.max_discard_segments = pd->max_discard_segments;
4347 	}
4348 
4349 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4350 		const struct ublk_param_zoned *p = &ub->params.zoned;
4351 
4352 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4353 			return -EOPNOTSUPP;
4354 
4355 		lim.features |= BLK_FEAT_ZONED;
4356 		lim.max_active_zones = p->max_active_zones;
4357 		lim.max_open_zones =  p->max_open_zones;
4358 		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4359 	}
4360 
4361 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4362 		lim.features |= BLK_FEAT_WRITE_CACHE;
4363 		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4364 			lim.features |= BLK_FEAT_FUA;
4365 	}
4366 
4367 	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4368 		lim.features |= BLK_FEAT_ROTATIONAL;
4369 
4370 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4371 		lim.dma_alignment = ub->params.dma.alignment;
4372 
4373 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4374 		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4375 		lim.max_segment_size = ub->params.seg.max_segment_size;
4376 		lim.max_segments = ub->params.seg.max_segments;
4377 	}
4378 
4379 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4380 		const struct ublk_param_integrity *p = &ub->params.integrity;
4381 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4382 
4383 		lim.max_integrity_segments =
4384 			p->max_integrity_segments ?: USHRT_MAX;
4385 		lim.integrity = (struct blk_integrity) {
4386 			.flags = ublk_integrity_flags(p->flags),
4387 			.csum_type = ublk_integrity_csum_type(p->csum_type),
4388 			.metadata_size = p->metadata_size,
4389 			.pi_offset = p->pi_offset,
4390 			.interval_exp = p->interval_exp,
4391 			.tag_size = p->tag_size,
4392 			.pi_tuple_size = pi_tuple_size,
4393 		};
4394 	}
4395 
4396 	if (wait_for_completion_interruptible(&ub->completion) != 0)
4397 		return -EINTR;
4398 
4399 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4400 		return -EINVAL;
4401 
4402 	mutex_lock(&ub->mutex);
4403 	/* device may become not ready in case of F_BATCH */
4404 	if (!ublk_dev_ready(ub)) {
4405 		ret = -EINVAL;
4406 		goto out_unlock;
4407 	}
4408 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4409 	    test_bit(UB_STATE_USED, &ub->state)) {
4410 		ret = -EEXIST;
4411 		goto out_unlock;
4412 	}
4413 
4414 	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4415 	if (IS_ERR(disk)) {
4416 		ret = PTR_ERR(disk);
4417 		goto out_unlock;
4418 	}
4419 	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4420 	disk->fops = &ub_fops;
4421 	disk->private_data = ub;
4422 
4423 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4424 	ub->ub_disk = disk;
4425 
4426 	ublk_apply_params(ub);
4427 
4428 	/*
4429 	 * Suppress partition scan to avoid potential IO hang.
4430 	 *
4431 	 * If ublk server error occurs during partition scan, the IO may
4432 	 * wait while holding ub->mutex, which can deadlock with other
4433 	 * operations that need the mutex. Defer partition scan to async
4434 	 * work.
4435 	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4436 	 * permanently.
4437 	 */
4438 	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4439 
4440 	ublk_get_device(ub);
4441 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4442 
4443 	if (ublk_dev_is_zoned(ub)) {
4444 		ret = ublk_revalidate_disk_zones(ub);
4445 		if (ret)
4446 			goto out_put_cdev;
4447 	}
4448 
4449 	ret = add_disk(disk);
4450 	if (ret)
4451 		goto out_put_cdev;
4452 
4453 	set_bit(UB_STATE_USED, &ub->state);
4454 
4455 	/* Skip partition scan if disabled by user */
4456 	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4457 		clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4458 	} else {
4459 		/* Schedule async partition scan for trusted daemons */
4460 		if (!ub->unprivileged_daemons)
4461 			schedule_work(&ub->partition_scan_work);
4462 	}
4463 
4464 out_put_cdev:
4465 	if (ret) {
4466 		ublk_detach_disk(ub);
4467 		ublk_put_device(ub);
4468 	}
4469 	if (ret)
4470 		put_disk(disk);
4471 out_unlock:
4472 	mutex_unlock(&ub->mutex);
4473 	return ret;
4474 }
4475 
4476 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4477 		const struct ublksrv_ctrl_cmd *header)
4478 {
4479 	void __user *argp = (void __user *)(unsigned long)header->addr;
4480 	cpumask_var_t cpumask;
4481 	unsigned long queue;
4482 	unsigned int retlen;
4483 	unsigned int i;
4484 	int ret;
4485 
4486 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4487 		return -EINVAL;
4488 	if (header->len & (sizeof(unsigned long)-1))
4489 		return -EINVAL;
4490 	if (!header->addr)
4491 		return -EINVAL;
4492 
4493 	queue = header->data[0];
4494 	if (queue >= ub->dev_info.nr_hw_queues)
4495 		return -EINVAL;
4496 
4497 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4498 		return -ENOMEM;
4499 
4500 	for_each_possible_cpu(i) {
4501 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4502 			cpumask_set_cpu(i, cpumask);
4503 	}
4504 
4505 	ret = -EFAULT;
4506 	retlen = min_t(unsigned short, header->len, cpumask_size());
4507 	if (copy_to_user(argp, cpumask, retlen))
4508 		goto out_free_cpumask;
4509 	if (retlen != header->len &&
4510 	    clear_user(argp + retlen, header->len - retlen))
4511 		goto out_free_cpumask;
4512 
4513 	ret = 0;
4514 out_free_cpumask:
4515 	free_cpumask_var(cpumask);
4516 	return ret;
4517 }
4518 
/*
 * Debug helper: log device id, flags, queue count and queue depth of @info
 * through pr_devel().
 */
static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags);
	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
			info->nr_hw_queues, info->queue_depth);
}
4526 
/*
 * Handle ADD_DEV: validate the ublksrv_ctrl_dev_info found at header->addr,
 * allocate a ublk_device with the requested device number (header->dev_id
 * of U32_MAX is exempt from the range check and is handed on to
 * ublk_alloc_dev_number() as is), set up its tag set and queues, copy the
 * resulting dev_info back to userspace and finally add the char device.
 *
 * The flags copied back are the negotiated feature set: unsupported bits
 * are cleared and a few always-on features are added.
 *
 * Returns 0 on success or a negative errno.
 */
static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	struct ublk_device *ub;
	int ret = -EINVAL;

	if (header->len < sizeof(info) || !header->addr)
		return -EINVAL;
	/* ADD_DEV is not a per-queue command, so queue_id must be (u16)-1 */
	if (header->queue_id != (u16)-1) {
		pr_warn("%s: queue_id is wrong %x\n",
			__func__, header->queue_id);
		return -EINVAL;
	}

	if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
		return -EINVAL;

	/*
	 * Callers with CAP_SYS_ADMIN always get a privileged device; everyone
	 * else has to ask for an unprivileged one explicitly.
	 */
	if (capable(CAP_SYS_ADMIN))
		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
		return -EPERM;

	/* forbid nonsense combinations of recovery flags */
	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
	case 0:
	case UBLK_F_USER_RECOVERY:
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
		break;
	default:
		pr_warn("%s: invalid recovery flags %llx\n", __func__,
			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
		return -EINVAL;
	}

	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
		return -EINVAL;
	}

	/*
	 * unprivileged device can't be trusted, but RECOVERY and
	 * RECOVERY_REISSUE still may hang error handling, so can't
	 * support recovery features for unprivileged ublk now
	 *
	 * TODO: provide forward progress for RECOVERY handler, so that
	 * unprivileged device can benefit from it
	 */
	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
				UBLK_F_USER_RECOVERY);

		/*
		 * For USER_COPY, we depends on userspace to fill request
		 * buffer by pwrite() to ublk char device, which can't be
		 * used for unprivileged device
		 *
		 * Same with zero copy or auto buffer register.
		 */
		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
					UBLK_F_AUTO_BUF_REG))
			return -EINVAL;
	}

	/* User copy is required to access integrity buffer */
	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
		return -EINVAL;

	/* the created device is always owned by current user */
	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);

	if (header->dev_id != info.dev_id) {
		pr_warn("%s: dev id not match %u %u\n",
			__func__, header->dev_id, info.dev_id);
		return -EINVAL;
	}

	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
		pr_warn("%s: dev id is too large. Max supported is %d\n",
			__func__, UBLK_MAX_UBLKS - 1);
		return -EINVAL;
	}

	ublk_dump_dev_info(&info);

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	/* enforce the limit on the number of unprivileged devices */
	ret = -EACCES;
	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
	    unprivileged_ublks_added >= unprivileged_ublks_max)
		goto out_unlock;

	ret = -ENOMEM;
	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
	if (!ub)
		goto out_unlock;
	mutex_init(&ub->mutex);
	spin_lock_init(&ub->lock);
	mutex_init(&ub->cancel_mutex);
	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);

	ret = ublk_alloc_dev_number(ub, header->dev_id);
	if (ret < 0)
		goto out_free_ub;

	memcpy(&ub->dev_info, &info, sizeof(info));

	/* update device id */
	ub->dev_info.dev_id = ub->ub_number;

	/*
	 * 64bit flags will be copied back to userspace as feature
	 * negotiation result, so have to clear flags which driver
	 * doesn't support yet, then userspace can get correct flags
	 * (features) to handle.
	 */
	ub->dev_info.flags &= UBLK_F_ALL;

	/* features that are always reported as enabled */
	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
		UBLK_F_URING_CMD_COMP_IN_TASK |
		UBLK_F_PER_IO_DAEMON |
		UBLK_F_BUF_REG_OFF_DAEMON |
		UBLK_F_SAFE_STOP_DEV;

	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;

	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
				UBLK_F_AUTO_BUF_REG))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
	if (ublk_dev_support_batch_io(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/*
	 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
	 * returning write_append_lba, which is only allowed in case of
	 * user copy or zero copy
	 */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	/* never use more hardware queues than there are CPU ids */
	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);
	ublk_align_max_io_size(ub);

	ret = ublk_add_tag_set(ub);
	if (ret)
		goto out_free_dev_number;

	ret = ublk_init_queues(ub);
	if (ret)
		goto out_free_tag_set;

	ret = -EFAULT;
	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
		goto out_deinit_queues;

	/*
	 * Add the char dev so that ublksrv daemon can be setup.
	 * ublk_add_chdev() will cleanup everything if it fails.
	 */
	ret = ublk_add_chdev(ub);
	goto out_unlock;

	/* manual unwind, only used before the char device takes ownership */
out_deinit_queues:
	ublk_deinit_queues(ub);
out_free_tag_set:
	blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
	ublk_free_dev_number(ub);
out_free_ub:
	mutex_destroy(&ub->mutex);
	mutex_destroy(&ub->cancel_mutex);
	kfree(ub);
out_unlock:
	mutex_unlock(&ublk_ctl_mutex);
	return ret;
}
4720 
4721 static inline bool ublk_idr_freed(int id)
4722 {
4723 	void *ptr;
4724 
4725 	spin_lock(&ublk_idr_lock);
4726 	ptr = idr_find(&ublk_index_idr, id);
4727 	spin_unlock(&ublk_idr_lock);
4728 
4729 	return ptr == NULL;
4730 }
4731 
4732 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4733 {
4734 	struct ublk_device *ub = *p_ub;
4735 	int idx = ub->ub_number;
4736 	int ret;
4737 
4738 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4739 	if (ret)
4740 		return ret;
4741 
4742 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4743 		ublk_remove(ub);
4744 		set_bit(UB_STATE_DELETED, &ub->state);
4745 	}
4746 
4747 	/* Mark the reference as consumed */
4748 	*p_ub = NULL;
4749 	ublk_put_device(ub);
4750 	mutex_unlock(&ublk_ctl_mutex);
4751 
4752 	/*
4753 	 * Wait until the idr is removed, then it can be reused after
4754 	 * DEL_DEV command is returned.
4755 	 *
4756 	 * If we returns because of user interrupt, future delete command
4757 	 * may come:
4758 	 *
4759 	 * - the device number isn't freed, this device won't or needn't
4760 	 *   be deleted again, since UB_STATE_DELETED is set, and device
4761 	 *   will be released after the last reference is dropped
4762 	 *
4763 	 * - the device number is freed already, we will not find this
4764 	 *   device via ublk_get_device_from_id()
4765 	 */
4766 	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4767 		return -EINTR;
4768 	return 0;
4769 }
4770 
/*
 * Debug helper: log the opcode @cmd_op and the main fields of a control
 * command @header through pr_devel().
 */
static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
				      const struct ublksrv_ctrl_cmd *header)
{
	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}
4778 
/* Control-command entry point for stopping @ub; forwards to ublk_stop_dev() */
static void ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
}
4783 
4784 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4785 {
4786 	struct gendisk *disk;
4787 	int ret = 0;
4788 
4789 	disk = ublk_get_disk(ub);
4790 	if (!disk)
4791 		return -ENODEV;
4792 
4793 	mutex_lock(&disk->open_mutex);
4794 	if (disk_openers(disk) > 0) {
4795 		ret = -EBUSY;
4796 		goto unlock;
4797 	}
4798 	ub->block_open = true;
4799 	/* release open_mutex as del_gendisk() will reacquire it */
4800 	mutex_unlock(&disk->open_mutex);
4801 
4802 	ublk_ctrl_stop_dev(ub);
4803 	goto out;
4804 
4805 unlock:
4806 	mutex_unlock(&disk->open_mutex);
4807 out:
4808 	ublk_put_disk(disk);
4809 	return ret;
4810 }
4811 
4812 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4813 		const struct ublksrv_ctrl_cmd *header)
4814 {
4815 	struct task_struct *p;
4816 	struct pid *pid;
4817 	struct ublksrv_ctrl_dev_info dev_info;
4818 	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4819 	void __user *argp = (void __user *)(unsigned long)header->addr;
4820 
4821 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4822 		return -EINVAL;
4823 
4824 	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4825 	dev_info.ublksrv_pid = -1;
4826 
4827 	if (init_ublksrv_tgid > 0) {
4828 		rcu_read_lock();
4829 		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4830 		p = pid_task(pid, PIDTYPE_TGID);
4831 		if (p) {
4832 			int vnr = task_tgid_vnr(p);
4833 
4834 			if (vnr)
4835 				dev_info.ublksrv_pid = vnr;
4836 		}
4837 		rcu_read_unlock();
4838 	}
4839 
4840 	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4841 		return -EFAULT;
4842 
4843 	return 0;
4844 }
4845 
4846 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4847 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4848 {
4849 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4850 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4851 
4852 	if (ub->ub_disk) {
4853 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4854 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4855 	} else {
4856 		ub->params.devt.disk_major = 0;
4857 		ub->params.devt.disk_minor = 0;
4858 	}
4859 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4860 }
4861 
4862 static int ublk_ctrl_get_params(struct ublk_device *ub,
4863 		const struct ublksrv_ctrl_cmd *header)
4864 {
4865 	void __user *argp = (void __user *)(unsigned long)header->addr;
4866 	struct ublk_params_header ph;
4867 	int ret;
4868 
4869 	if (header->len <= sizeof(ph) || !header->addr)
4870 		return -EINVAL;
4871 
4872 	if (copy_from_user(&ph, argp, sizeof(ph)))
4873 		return -EFAULT;
4874 
4875 	if (ph.len > header->len || !ph.len)
4876 		return -EINVAL;
4877 
4878 	if (ph.len > sizeof(struct ublk_params))
4879 		ph.len = sizeof(struct ublk_params);
4880 
4881 	mutex_lock(&ub->mutex);
4882 	ublk_ctrl_fill_params_devt(ub);
4883 	if (copy_to_user(argp, &ub->params, ph.len))
4884 		ret = -EFAULT;
4885 	else
4886 		ret = 0;
4887 	mutex_unlock(&ub->mutex);
4888 
4889 	return ret;
4890 }
4891 
4892 static int ublk_ctrl_set_params(struct ublk_device *ub,
4893 		const struct ublksrv_ctrl_cmd *header)
4894 {
4895 	void __user *argp = (void __user *)(unsigned long)header->addr;
4896 	struct ublk_params_header ph;
4897 	int ret = -EFAULT;
4898 
4899 	if (header->len <= sizeof(ph) || !header->addr)
4900 		return -EINVAL;
4901 
4902 	if (copy_from_user(&ph, argp, sizeof(ph)))
4903 		return -EFAULT;
4904 
4905 	if (ph.len > header->len || !ph.len || !ph.types)
4906 		return -EINVAL;
4907 
4908 	if (ph.len > sizeof(struct ublk_params))
4909 		ph.len = sizeof(struct ublk_params);
4910 
4911 	mutex_lock(&ub->mutex);
4912 	if (test_bit(UB_STATE_USED, &ub->state)) {
4913 		/*
4914 		 * Parameters can only be changed when device hasn't
4915 		 * been started yet
4916 		 */
4917 		ret = -EACCES;
4918 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
4919 		ret = -EFAULT;
4920 	} else {
4921 		/* clear all we don't support yet */
4922 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
4923 		ret = ublk_validate_params(ub);
4924 		if (ret)
4925 			ub->params.types = 0;
4926 	}
4927 	mutex_unlock(&ub->mutex);
4928 
4929 	return ret;
4930 }
4931 
4932 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
4933 {
4934 	int ret = -EINVAL;
4935 
4936 	mutex_lock(&ub->mutex);
4937 	if (ublk_nosrv_should_stop_dev(ub))
4938 		goto out_unlock;
4939 	/*
4940 	 * START_RECOVERY is only allowd after:
4941 	 *
4942 	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
4943 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
4944 	 *     released.
4945 	 *
4946 	 * and one of the following holds
4947 	 *
4948 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
4949 	 *     (a)has quiesced request queue
4950 	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
4951 	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
4952 	 *     (d)has completed/camceled all ioucmds owned by ther dying process
4953 	 *
4954 	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
4955 	 *     quiesced, but all I/O is being immediately errored
4956 	 */
4957 	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
4958 		ret = -EBUSY;
4959 		goto out_unlock;
4960 	}
4961 	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
4962 	init_completion(&ub->completion);
4963 	ret = 0;
4964  out_unlock:
4965 	mutex_unlock(&ub->mutex);
4966 	return ret;
4967 }
4968 
4969 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
4970 		const struct ublksrv_ctrl_cmd *header)
4971 {
4972 	int ublksrv_pid = (int)header->data[0];
4973 	int ret = -EINVAL;
4974 
4975 	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
4976 		 header->dev_id);
4977 
4978 	if (wait_for_completion_interruptible(&ub->completion))
4979 		return -EINTR;
4980 
4981 	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
4982 		 header->dev_id);
4983 
4984 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4985 		return -EINVAL;
4986 
4987 	mutex_lock(&ub->mutex);
4988 	if (ublk_nosrv_should_stop_dev(ub))
4989 		goto out_unlock;
4990 
4991 	if (!ublk_dev_in_recoverable_state(ub)) {
4992 		ret = -EBUSY;
4993 		goto out_unlock;
4994 	}
4995 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4996 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4997 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
4998 			__func__, ublksrv_pid, header->dev_id);
4999 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
5000 	ret = 0;
5001  out_unlock:
5002 	mutex_unlock(&ub->mutex);
5003 	return ret;
5004 }
5005 
5006 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5007 {
5008 	void __user *argp = (void __user *)(unsigned long)header->addr;
5009 	u64 features = UBLK_F_ALL;
5010 
5011 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
5012 		return -EINVAL;
5013 
5014 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5015 		return -EFAULT;
5016 
5017 	return 0;
5018 }
5019 
5020 static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5021 {
5022 	struct ublk_param_basic *p = &ub->params.basic;
5023 	u64 new_size = header->data[0];
5024 
5025 	mutex_lock(&ub->mutex);
5026 	p->dev_sectors = new_size;
5027 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5028 	mutex_unlock(&ub->mutex);
5029 }
5030 
/* Accumulator passed to ublk_count_busy_req() via blk_mq_tagset_busy_iter() */
struct count_busy {
	const struct ublk_queue *ubq;	/* only requests of this queue are counted */
	unsigned int nr_busy;		/* number of matching requests seen so far */
};
5035 
5036 static bool ublk_count_busy_req(struct request *rq, void *data)
5037 {
5038 	struct count_busy *idle = data;
5039 
5040 	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5041 		idle->nr_busy += 1;
5042 	return true;
5043 }
5044 
5045 /* uring_cmd is guaranteed to be active if the associated request is idle */
5046 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5047 {
5048 	struct count_busy data = {
5049 		.ubq = ubq,
5050 	};
5051 
5052 	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5053 	return data.nr_busy < ubq->q_depth;
5054 }
5055 
5056 /* Wait until each hw queue has at least one idle IO */
5057 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5058 				 unsigned int timeout_ms)
5059 {
5060 	unsigned int elapsed = 0;
5061 	int ret;
5062 
5063 	/*
5064 	 * For UBLK_F_BATCH_IO ublk server can get notified with existing
5065 	 * or new fetch command, so needn't wait any more
5066 	 */
5067 	if (ublk_dev_support_batch_io(ub))
5068 		return 0;
5069 
5070 	while (elapsed < timeout_ms && !signal_pending(current)) {
5071 		unsigned int queues_cancelable = 0;
5072 		int i;
5073 
5074 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5075 			struct ublk_queue *ubq = ublk_get_queue(ub, i);
5076 
5077 			queues_cancelable += !!ubq_has_idle_io(ubq);
5078 		}
5079 
5080 		/*
5081 		 * Each queue needs at least one active command for
5082 		 * notifying ublk server
5083 		 */
5084 		if (queues_cancelable == ub->dev_info.nr_hw_queues)
5085 			break;
5086 
5087 		msleep(UBLK_REQUEUE_DELAY_MS);
5088 		elapsed += UBLK_REQUEUE_DELAY_MS;
5089 	}
5090 
5091 	if (signal_pending(current))
5092 		ret = -EINTR;
5093 	else if (elapsed >= timeout_ms)
5094 		ret = -EBUSY;
5095 	else
5096 		ret = 0;
5097 
5098 	return ret;
5099 }
5100 
5101 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5102 				 const struct ublksrv_ctrl_cmd *header)
5103 {
5104 	/* zero means wait forever */
5105 	u64 timeout_ms = header->data[0];
5106 	struct gendisk *disk;
5107 	int ret = -ENODEV;
5108 
5109 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5110 		return -EOPNOTSUPP;
5111 
5112 	mutex_lock(&ub->mutex);
5113 	disk = ublk_get_disk(ub);
5114 	if (!disk)
5115 		goto unlock;
5116 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5117 		goto put_disk;
5118 
5119 	ret = 0;
5120 	/* already in expected state */
5121 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5122 		goto put_disk;
5123 
5124 	/* Mark the device as canceling */
5125 	mutex_lock(&ub->cancel_mutex);
5126 	blk_mq_quiesce_queue(disk->queue);
5127 	ublk_set_canceling(ub, true);
5128 	blk_mq_unquiesce_queue(disk->queue);
5129 	mutex_unlock(&ub->cancel_mutex);
5130 
5131 	if (!timeout_ms)
5132 		timeout_ms = UINT_MAX;
5133 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5134 
5135 put_disk:
5136 	ublk_put_disk(disk);
5137 unlock:
5138 	mutex_unlock(&ub->mutex);
5139 
5140 	/* Cancel pending uring_cmd */
5141 	if (!ret)
5142 		ublk_cancel_dev(ub);
5143 	return ret;
5144 }
5145 
5146 /*
5147  * All control commands are sent via /dev/ublk-control, so we have to check
5148  * the destination device's permission
5149  */
5150 static int ublk_char_dev_permission(struct ublk_device *ub,
5151 		const char *dev_path, int mask)
5152 {
5153 	int err;
5154 	struct path path;
5155 	struct kstat stat;
5156 
5157 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5158 	if (err)
5159 		return err;
5160 
5161 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5162 	if (err)
5163 		goto exit;
5164 
5165 	err = -EPERM;
5166 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5167 		goto exit;
5168 
5169 	err = inode_permission(&nop_mnt_idmap,
5170 			d_backing_inode(path.dentry), mask);
5171 exit:
5172 	path_put(&path);
5173 	return err;
5174 }
5175 
5176 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5177 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5178 {
5179 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5180 	void __user *argp = (void __user *)(unsigned long)header->addr;
5181 	char *dev_path = NULL;
5182 	int ret = 0;
5183 	int mask;
5184 
5185 	if (!unprivileged) {
5186 		if (!capable(CAP_SYS_ADMIN))
5187 			return -EPERM;
5188 		/*
5189 		 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
5190 		 * char_dev_path in payload too, since userspace may not
5191 		 * know if the specified device is created as unprivileged
5192 		 * mode.
5193 		 */
5194 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5195 			return 0;
5196 	}
5197 
5198 	/*
5199 	 * User has to provide the char device path for unprivileged ublk
5200 	 *
5201 	 * header->addr always points to the dev path buffer, and
5202 	 * header->dev_path_len records length of dev path buffer.
5203 	 */
5204 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5205 		return -EINVAL;
5206 
5207 	if (header->len < header->dev_path_len)
5208 		return -EINVAL;
5209 
5210 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5211 	if (IS_ERR(dev_path))
5212 		return PTR_ERR(dev_path);
5213 
5214 	ret = -EINVAL;
5215 	switch (_IOC_NR(cmd_op)) {
5216 	case UBLK_CMD_GET_DEV_INFO:
5217 	case UBLK_CMD_GET_DEV_INFO2:
5218 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5219 	case UBLK_CMD_GET_PARAMS:
5220 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5221 		mask = MAY_READ;
5222 		break;
5223 	case UBLK_CMD_START_DEV:
5224 	case UBLK_CMD_STOP_DEV:
5225 	case UBLK_CMD_ADD_DEV:
5226 	case UBLK_CMD_DEL_DEV:
5227 	case UBLK_CMD_SET_PARAMS:
5228 	case UBLK_CMD_START_USER_RECOVERY:
5229 	case UBLK_CMD_END_USER_RECOVERY:
5230 	case UBLK_CMD_UPDATE_SIZE:
5231 	case UBLK_CMD_QUIESCE_DEV:
5232 	case UBLK_CMD_TRY_STOP_DEV:
5233 		mask = MAY_READ | MAY_WRITE;
5234 		break;
5235 	default:
5236 		goto exit;
5237 	}
5238 
5239 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5240 	if (!ret) {
5241 		header->len -= header->dev_path_len;
5242 		header->addr += header->dev_path_len;
5243 	}
5244 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5245 			__func__, ub->ub_number, cmd_op,
5246 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5247 			dev_path, ret);
5248 exit:
5249 	kfree(dev_path);
5250 	return ret;
5251 }
5252 
5253 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5254 {
5255 	switch (_IOC_NR(cmd_op)) {
5256 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5257 	case UBLK_CMD_GET_DEV_INFO:
5258 	case UBLK_CMD_GET_DEV_INFO2:
5259 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5260 		return false;
5261 	default:
5262 		return true;
5263 	}
5264 }
5265 
5266 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5267 		unsigned int issue_flags)
5268 {
5269 	/* May point to userspace-mapped memory */
5270 	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5271 								    struct ublksrv_ctrl_cmd);
5272 	struct ublksrv_ctrl_cmd header;
5273 	struct ublk_device *ub = NULL;
5274 	u32 cmd_op = cmd->cmd_op;
5275 	int ret = -EINVAL;
5276 
5277 	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5278 	    issue_flags & IO_URING_F_NONBLOCK)
5279 		return -EAGAIN;
5280 
5281 	if (!(issue_flags & IO_URING_F_SQE128))
5282 		return -EINVAL;
5283 
5284 	header.dev_id = READ_ONCE(ub_src->dev_id);
5285 	header.queue_id = READ_ONCE(ub_src->queue_id);
5286 	header.len = READ_ONCE(ub_src->len);
5287 	header.addr = READ_ONCE(ub_src->addr);
5288 	header.data[0] = READ_ONCE(ub_src->data[0]);
5289 	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5290 	ublk_ctrl_cmd_dump(cmd_op, &header);
5291 
5292 	ret = ublk_check_cmd_op(cmd_op);
5293 	if (ret)
5294 		goto out;
5295 
5296 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5297 		ret = ublk_ctrl_get_features(&header);
5298 		goto out;
5299 	}
5300 
5301 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5302 		ret = -ENODEV;
5303 		ub = ublk_get_device_from_id(header.dev_id);
5304 		if (!ub)
5305 			goto out;
5306 
5307 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5308 		if (ret)
5309 			goto put_dev;
5310 	}
5311 
5312 	switch (_IOC_NR(cmd_op)) {
5313 	case UBLK_CMD_START_DEV:
5314 		ret = ublk_ctrl_start_dev(ub, &header);
5315 		break;
5316 	case UBLK_CMD_STOP_DEV:
5317 		ublk_ctrl_stop_dev(ub);
5318 		ret = 0;
5319 		break;
5320 	case UBLK_CMD_GET_DEV_INFO:
5321 	case UBLK_CMD_GET_DEV_INFO2:
5322 		ret = ublk_ctrl_get_dev_info(ub, &header);
5323 		break;
5324 	case UBLK_CMD_ADD_DEV:
5325 		ret = ublk_ctrl_add_dev(&header);
5326 		break;
5327 	case UBLK_CMD_DEL_DEV:
5328 		ret = ublk_ctrl_del_dev(&ub, true);
5329 		break;
5330 	case UBLK_CMD_DEL_DEV_ASYNC:
5331 		ret = ublk_ctrl_del_dev(&ub, false);
5332 		break;
5333 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5334 		ret = ublk_ctrl_get_queue_affinity(ub, &header);
5335 		break;
5336 	case UBLK_CMD_GET_PARAMS:
5337 		ret = ublk_ctrl_get_params(ub, &header);
5338 		break;
5339 	case UBLK_CMD_SET_PARAMS:
5340 		ret = ublk_ctrl_set_params(ub, &header);
5341 		break;
5342 	case UBLK_CMD_START_USER_RECOVERY:
5343 		ret = ublk_ctrl_start_recovery(ub);
5344 		break;
5345 	case UBLK_CMD_END_USER_RECOVERY:
5346 		ret = ublk_ctrl_end_recovery(ub, &header);
5347 		break;
5348 	case UBLK_CMD_UPDATE_SIZE:
5349 		ublk_ctrl_set_size(ub, &header);
5350 		ret = 0;
5351 		break;
5352 	case UBLK_CMD_QUIESCE_DEV:
5353 		ret = ublk_ctrl_quiesce_dev(ub, &header);
5354 		break;
5355 	case UBLK_CMD_TRY_STOP_DEV:
5356 		ret = ublk_ctrl_try_stop_dev(ub);
5357 		break;
5358 	default:
5359 		ret = -EOPNOTSUPP;
5360 		break;
5361 	}
5362 
5363  put_dev:
5364 	if (ub)
5365 		ublk_put_device(ub);
5366  out:
5367 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5368 			__func__, ret, cmd_op, header.dev_id, header.queue_id);
5369 	return ret;
5370 }
5371 
5372 static const struct file_operations ublk_ctl_fops = {
5373 	.open		= nonseekable_open,
5374 	.uring_cmd      = ublk_ctrl_uring_cmd,
5375 	.owner		= THIS_MODULE,
5376 	.llseek		= noop_llseek,
5377 };
5378 
5379 static struct miscdevice ublk_misc = {
5380 	.minor		= MISC_DYNAMIC_MINOR,
5381 	.name		= "ublk-control",
5382 	.fops		= &ublk_ctl_fops,
5383 };
5384 
5385 static int __init ublk_init(void)
5386 {
5387 	int ret;
5388 
5389 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5390 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5391 	/*
5392 	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5393 	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5394 	 */
5395 	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5396 		     UBLKSRV_IO_INTEGRITY_FLAG);
5397 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5398 
5399 	init_waitqueue_head(&ublk_idr_wq);
5400 
5401 	ret = misc_register(&ublk_misc);
5402 	if (ret)
5403 		return ret;
5404 
5405 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5406 	if (ret)
5407 		goto unregister_mis;
5408 
5409 	ret = class_register(&ublk_chr_class);
5410 	if (ret)
5411 		goto free_chrdev_region;
5412 
5413 	return 0;
5414 
5415 free_chrdev_region:
5416 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5417 unregister_mis:
5418 	misc_deregister(&ublk_misc);
5419 	return ret;
5420 }
5421 
5422 static void __exit ublk_exit(void)
5423 {
5424 	struct ublk_device *ub;
5425 	int id;
5426 
5427 	idr_for_each_entry(&ublk_index_idr, ub, id)
5428 		ublk_remove(ub);
5429 
5430 	class_unregister(&ublk_chr_class);
5431 	misc_deregister(&ublk_misc);
5432 
5433 	idr_destroy(&ublk_index_idr);
5434 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5435 }
5436 
/* Module entry and exit points */
module_init(ublk_init);
module_exit(ublk_exit);
5439 
5440 static int ublk_set_max_unprivileged_ublks(const char *buf,
5441 					   const struct kernel_param *kp)
5442 {
5443 	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5444 }
5445 
5446 static int ublk_get_max_unprivileged_ublks(char *buf,
5447 					   const struct kernel_param *kp)
5448 {
5449 	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5450 }
5451 
5452 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5453 	.set = ublk_set_max_unprivileged_ublks,
5454 	.get = ublk_get_max_unprivileged_ublks,
5455 };
5456 
5457 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5458 		&unprivileged_ublks_max, 0644);
5459 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");
5460 
5461 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5462 MODULE_DESCRIPTION("Userspace block device");
5463 MODULE_LICENSE("GPL");
5464