xref: /linux/drivers/block/ublk_drv.c (revision f990ad67f0febc51274adb604d5bdeab0d06d024)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - block device which IO is handled from userspace
4  *
5  * Take full use of io_uring passthrough command for communicating with
6  * ublk userspace daemon(ublksrvd) for handling basic IO request.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <linux/kfifo.h>
48 #include <linux/blk-integrity.h>
49 #include <uapi/linux/fs.h>
50 #include <uapi/linux/ublk_cmd.h>
51 
52 #define UBLK_MINORS		(1U << MINORBITS)
53 
54 #define UBLK_INVALID_BUF_IDX 	((u16)-1)
55 
56 /* private ioctl command mirror */
57 #define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
58 #define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
59 #define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
60 #define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
61 
62 #define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
63 #define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
64 
65 /* All UBLK_F_* have to be included into UBLK_F_ALL */
66 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
67 		| UBLK_F_URING_CMD_COMP_IN_TASK \
68 		| UBLK_F_NEED_GET_DATA \
69 		| UBLK_F_USER_RECOVERY \
70 		| UBLK_F_USER_RECOVERY_REISSUE \
71 		| UBLK_F_UNPRIVILEGED_DEV \
72 		| UBLK_F_CMD_IOCTL_ENCODE \
73 		| UBLK_F_USER_COPY \
74 		| UBLK_F_ZONED \
75 		| UBLK_F_USER_RECOVERY_FAIL_IO \
76 		| UBLK_F_UPDATE_SIZE \
77 		| UBLK_F_AUTO_BUF_REG \
78 		| UBLK_F_QUIESCE \
79 		| UBLK_F_PER_IO_DAEMON \
80 		| UBLK_F_BUF_REG_OFF_DAEMON \
81 		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
82 		| UBLK_F_SAFE_STOP_DEV \
83 		| UBLK_F_BATCH_IO \
84 		| UBLK_F_NO_AUTO_PART_SCAN)
85 
86 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
87 		| UBLK_F_USER_RECOVERY_REISSUE \
88 		| UBLK_F_USER_RECOVERY_FAIL_IO)
89 
90 /* All UBLK_PARAM_TYPE_* should be included here */
91 #define UBLK_PARAM_TYPE_ALL                                \
92 	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
93 	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
94 	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
95 	 UBLK_PARAM_TYPE_INTEGRITY)
96 
97 #define UBLK_BATCH_F_ALL  \
98 	(UBLK_BATCH_F_HAS_ZONE_LBA | \
99 	 UBLK_BATCH_F_HAS_BUF_ADDR | \
100 	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
101 
102 /* ublk batch fetch uring_cmd */
103 struct ublk_batch_fetch_cmd {
104 	struct list_head node;
105 	struct io_uring_cmd *cmd;
106 	unsigned short buf_group;
107 };
108 
109 struct ublk_uring_cmd_pdu {
110 	/*
111 	 * Store requests in same batch temporarily for queuing them to
112 	 * daemon context.
113 	 *
114 	 * It should have been stored to request payload, but we do want
115 	 * to avoid extra pre-allocation, and uring_cmd payload is always
116 	 * free for us
117 	 */
118 	union {
119 		struct request *req;
120 		struct request *req_list;
121 	};
122 
123 	/*
124 	 * The following two are valid in this cmd whole lifetime, and
125 	 * setup in ublk uring_cmd handler
126 	 */
127 	struct ublk_queue *ubq;
128 
129 	union {
130 		u16 tag;
131 		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
132 	};
133 };
134 
135 struct ublk_batch_io_data {
136 	struct ublk_device *ub;
137 	struct io_uring_cmd *cmd;
138 	struct ublk_batch_io header;
139 	unsigned int issue_flags;
140 	struct io_comp_batch *iob;
141 };
142 
143 /*
144  * io command is active: sqe cmd is received, and its cqe isn't done
145  *
146  * If the flag is set, the io command is owned by ublk driver, and waited
147  * for incoming blk-mq request from the ublk block device.
148  *
149  * If the flag is cleared, the io command will be completed, and owned by
150  * ublk server.
151  */
152 #define UBLK_IO_FLAG_ACTIVE	0x01
153 
154 /*
155  * IO command is completed via cqe, and it is being handled by ublksrv, and
156  * not committed yet
157  *
158  * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
159  * cross verification
160  */
161 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
162 
163 /*
164  * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
165  * get data buffer address from ublksrv.
166  *
167  * Then, bio data could be copied into this data buffer for a WRITE request
168  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
169  */
170 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
171 
172 /*
173  * request buffer is registered automatically, so we have to unregister it
174  * before completing this request.
175  *
176  * io_uring will unregister buffer automatically for us during exiting.
177  */
178 #define UBLK_IO_FLAG_AUTO_BUF_REG 	0x10
179 
180 /* atomic RW with ubq->cancel_lock */
181 #define UBLK_IO_FLAG_CANCELED	0x80000000
182 
183 /*
184  * Initialize refcount to a large number to include any registered buffers.
185  * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
186  * any buffers registered on the io daemon task.
187  */
188 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
189 
190 /* used for UBLK_F_BATCH_IO only */
191 #define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
192 
193 union ublk_io_buf {
194 	__u64	addr;
195 	struct ublk_auto_buf_reg auto_reg;
196 };
197 
198 struct ublk_io {
199 	union ublk_io_buf buf;
200 	unsigned int flags;
201 	int res;
202 
203 	union {
204 		/* valid if UBLK_IO_FLAG_ACTIVE is set */
205 		struct io_uring_cmd *cmd;
206 		/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
207 		struct request *req;
208 	};
209 
210 	struct task_struct *task;
211 
212 	/*
213 	 * The number of uses of this I/O by the ublk server
214 	 * if user copy or zero copy are enabled:
215 	 * - UBLK_REFCOUNT_INIT from dispatch to the server
216 	 *   until UBLK_IO_COMMIT_AND_FETCH_REQ
217 	 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
218 	 * - 1 for each io_uring registered buffer not registered on task
219 	 * The I/O can only be completed once all references are dropped.
220 	 * User copy and buffer registration operations are only permitted
221 	 * if the reference count is nonzero.
222 	 */
223 	refcount_t ref;
224 	/* Count of buffers registered on task and not yet unregistered */
225 	unsigned task_registered_buffers;
226 
227 	void *buf_ctx_handle;
228 	spinlock_t lock;
229 } ____cacheline_aligned_in_smp;
230 
231 struct ublk_queue {
232 	int q_id;
233 	int q_depth;
234 
235 	unsigned long flags;
236 	struct ublksrv_io_desc *io_cmd_buf;
237 
238 	bool force_abort;
239 	bool canceling;
240 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
241 	spinlock_t		cancel_lock;
242 	struct ublk_device *dev;
243 	u32 nr_io_ready;
244 
245 	/*
246 	 * For supporting UBLK_F_BATCH_IO only.
247 	 *
248 	 * Inflight ublk request tag is saved in this fifo
249 	 *
250 	 * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(),
251 	 * so lock is required for storing request tag to fifo
252 	 *
253 	 * Make sure just one reader for fetching request from task work
254 	 * function to ublk server, so no need to grab the lock in reader
255 	 * side.
256 	 *
257 	 * Batch I/O State Management:
258 	 *
259 	 * The batch I/O system uses implicit state management based on the
260 	 * combination of three key variables below.
261 	 *
262 	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
263 	 *   No fetch commands available, events queue in evts_fifo
264 	 *
265 	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
266 	 *   Fetch commands available but none processing events
267 	 *
268 	 * - ACTIVE: active_fcmd
269 	 *   One fetch command actively processing events from evts_fifo
270 	 *
271 	 * Key Invariants:
272 	 * - At most one active_fcmd at any time (single reader)
273 	 * - active_fcmd is always from fcmd_head list when non-NULL
274 	 * - evts_fifo can be read locklessly by the single active reader
275 	 * - All state transitions require evts_lock protection
276 	 * - Multiple writers to evts_fifo require lock protection
277 	 */
278 	struct {
279 		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
280 		spinlock_t evts_lock;
281 
282 		/* List of fetch commands available to process events */
283 		struct list_head fcmd_head;
284 
285 		/* Currently active fetch command (NULL = none active) */
286 		struct ublk_batch_fetch_cmd  *active_fcmd;
287 	}____cacheline_aligned_in_smp;
288 
289 	struct ublk_io ios[] __counted_by(q_depth);
290 };
291 
292 struct ublk_device {
293 	struct gendisk		*ub_disk;
294 
295 	struct ublksrv_ctrl_dev_info	dev_info;
296 
297 	struct blk_mq_tag_set	tag_set;
298 
299 	struct cdev		cdev;
300 	struct device		cdev_dev;
301 
302 #define UB_STATE_OPEN		0
303 #define UB_STATE_USED		1
304 #define UB_STATE_DELETED	2
305 	unsigned long		state;
306 	int			ub_number;
307 
308 	struct mutex		mutex;
309 
310 	spinlock_t		lock;
311 	struct mm_struct	*mm;
312 
313 	struct ublk_params	params;
314 
315 	struct completion	completion;
316 	u32			nr_queue_ready;
317 	bool 			unprivileged_daemons;
318 	struct mutex cancel_mutex;
319 	bool canceling;
320 	pid_t 	ublksrv_tgid;
321 	struct delayed_work	exit_work;
322 	struct work_struct	partition_scan_work;
323 
324 	bool			block_open; /* protected by open_mutex */
325 
326 	struct ublk_queue       *queues[];
327 };
328 
329 /* header of ublk_params */
330 struct ublk_params_header {
331 	__u32	len;
332 	__u32	types;
333 };
334 
335 static void ublk_io_release(void *priv);
336 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
337 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
338 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
339 		u16 q_id, u16 tag, struct ublk_io *io);
340 static inline unsigned int ublk_req_build_flags(struct request *req);
341 static void ublk_batch_dispatch(struct ublk_queue *ubq,
342 				const struct ublk_batch_io_data *data,
343 				struct ublk_batch_fetch_cmd *fcmd);
344 
345 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
346 {
347 	return ub->dev_info.flags & UBLK_F_BATCH_IO;
348 }
349 
350 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
351 {
352 	return ubq->flags & UBLK_F_BATCH_IO;
353 }
354 
355 static inline void ublk_io_lock(struct ublk_io *io)
356 {
357 	spin_lock(&io->lock);
358 }
359 
360 static inline void ublk_io_unlock(struct ublk_io *io)
361 {
362 	spin_unlock(&io->lock);
363 }
364 
365 /* Initialize the event queue */
366 static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
367 				    int numa_node)
368 {
369 	spin_lock_init(&q->evts_lock);
370 	return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
371 }
372 
373 /* Check if event queue is empty */
374 static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
375 {
376 	return kfifo_is_empty(&q->evts_fifo);
377 }
378 
379 static inline void ublk_io_evts_deinit(struct ublk_queue *q)
380 {
381 	WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
382 	kfifo_free(&q->evts_fifo);
383 }
384 
385 static inline struct ublksrv_io_desc *
386 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
387 {
388 	return &ubq->io_cmd_buf[tag];
389 }
390 
391 static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
392 {
393 	return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
394 }
395 
396 static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
397 {
398 	return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
399 }
400 
401 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
402 {
403 	return ubq->flags & UBLK_F_AUTO_BUF_REG;
404 }
405 
406 static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
407 {
408 	return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
409 }
410 
411 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
412 {
413 	return ubq->flags & UBLK_F_USER_COPY;
414 }
415 
416 static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
417 {
418 	return ub->dev_info.flags & UBLK_F_USER_COPY;
419 }
420 
421 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
422 {
423 	return ub->dev_info.flags & UBLK_F_ZONED;
424 }
425 
426 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
427 {
428 	return ubq->flags & UBLK_F_ZONED;
429 }
430 
431 static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
432 {
433 	return ub->dev_info.flags & UBLK_F_INTEGRITY;
434 }
435 
436 #ifdef CONFIG_BLK_DEV_ZONED
437 
438 struct ublk_zoned_report_desc {
439 	__u64 sector;
440 	__u32 operation;
441 	__u32 nr_zones;
442 };
443 
444 static DEFINE_XARRAY(ublk_zoned_report_descs);
445 
446 static int ublk_zoned_insert_report_desc(const struct request *req,
447 		struct ublk_zoned_report_desc *desc)
448 {
449 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
450 			    desc, GFP_KERNEL);
451 }
452 
453 static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
454 		const struct request *req)
455 {
456 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
457 }
458 
459 static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
460 		const struct request *req)
461 {
462 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
463 }
464 
465 static int ublk_get_nr_zones(const struct ublk_device *ub)
466 {
467 	const struct ublk_param_basic *p = &ub->params.basic;
468 
469 	/* Zone size is a power of 2 */
470 	return p->dev_sectors >> ilog2(p->chunk_sectors);
471 }
472 
473 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
474 {
475 	return blk_revalidate_disk_zones(ub->ub_disk);
476 }
477 
478 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
479 {
480 	const struct ublk_param_zoned *p = &ub->params.zoned;
481 	int nr_zones;
482 
483 	if (!ublk_dev_is_zoned(ub))
484 		return -EINVAL;
485 
486 	if (!p->max_zone_append_sectors)
487 		return -EINVAL;
488 
489 	nr_zones = ublk_get_nr_zones(ub);
490 
491 	if (p->max_active_zones > nr_zones)
492 		return -EINVAL;
493 
494 	if (p->max_open_zones > nr_zones)
495 		return -EINVAL;
496 
497 	return 0;
498 }
499 
500 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
501 {
502 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
503 }
504 
505 /* Based on virtblk_alloc_report_buffer */
506 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
507 				      unsigned int nr_zones, size_t *buflen)
508 {
509 	struct request_queue *q = ublk->ub_disk->queue;
510 	size_t bufsize;
511 	void *buf;
512 
513 	nr_zones = min_t(unsigned int, nr_zones,
514 			 ublk->ub_disk->nr_zones);
515 
516 	bufsize = nr_zones * sizeof(struct blk_zone);
517 	bufsize =
518 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
519 
520 	while (bufsize >= sizeof(struct blk_zone)) {
521 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
522 		if (buf) {
523 			*buflen = bufsize;
524 			return buf;
525 		}
526 		bufsize >>= 1;
527 	}
528 
529 	*buflen = 0;
530 	return NULL;
531 }
532 
533 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
534 		      unsigned int nr_zones, struct blk_report_zones_args *args)
535 {
536 	struct ublk_device *ub = disk->private_data;
537 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
538 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
539 	unsigned int done_zones = 0;
540 	unsigned int max_zones_per_request;
541 	int ret;
542 	struct blk_zone *buffer;
543 	size_t buffer_length;
544 
545 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
546 			 nr_zones);
547 
548 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
549 	if (!buffer)
550 		return -ENOMEM;
551 
552 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
553 
554 	while (done_zones < nr_zones) {
555 		unsigned int remaining_zones = nr_zones - done_zones;
556 		unsigned int zones_in_request =
557 			min_t(unsigned int, remaining_zones, max_zones_per_request);
558 		struct request *req;
559 		struct ublk_zoned_report_desc desc;
560 		blk_status_t status;
561 
562 		memset(buffer, 0, buffer_length);
563 
564 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
565 		if (IS_ERR(req)) {
566 			ret = PTR_ERR(req);
567 			goto out;
568 		}
569 
570 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
571 		desc.sector = sector;
572 		desc.nr_zones = zones_in_request;
573 		ret = ublk_zoned_insert_report_desc(req, &desc);
574 		if (ret)
575 			goto free_req;
576 
577 		ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
578 		if (ret)
579 			goto erase_desc;
580 
581 		status = blk_execute_rq(req, 0);
582 		ret = blk_status_to_errno(status);
583 erase_desc:
584 		ublk_zoned_erase_report_desc(req);
585 free_req:
586 		blk_mq_free_request(req);
587 		if (ret)
588 			goto out;
589 
590 		for (unsigned int i = 0; i < zones_in_request; i++) {
591 			struct blk_zone *zone = buffer + i;
592 
593 			/* A zero length zone means no more zones in this response */
594 			if (!zone->len)
595 				break;
596 
597 			ret = disk_report_zone(disk, zone, i, args);
598 			if (ret)
599 				goto out;
600 
601 			done_zones++;
602 			sector += zone_size_sectors;
603 
604 		}
605 	}
606 
607 	ret = done_zones;
608 
609 out:
610 	kvfree(buffer);
611 	return ret;
612 }
613 
614 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
615 					 struct request *req)
616 {
617 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
618 	struct ublk_io *io = &ubq->ios[req->tag];
619 	struct ublk_zoned_report_desc *desc;
620 	u32 ublk_op;
621 
622 	switch (req_op(req)) {
623 	case REQ_OP_ZONE_OPEN:
624 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
625 		break;
626 	case REQ_OP_ZONE_CLOSE:
627 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
628 		break;
629 	case REQ_OP_ZONE_FINISH:
630 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
631 		break;
632 	case REQ_OP_ZONE_RESET:
633 		ublk_op = UBLK_IO_OP_ZONE_RESET;
634 		break;
635 	case REQ_OP_ZONE_APPEND:
636 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
637 		break;
638 	case REQ_OP_ZONE_RESET_ALL:
639 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
640 		break;
641 	case REQ_OP_DRV_IN:
642 		desc = ublk_zoned_get_report_desc(req);
643 		if (!desc)
644 			return BLK_STS_IOERR;
645 		ublk_op = desc->operation;
646 		switch (ublk_op) {
647 		case UBLK_IO_OP_REPORT_ZONES:
648 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
649 			iod->nr_zones = desc->nr_zones;
650 			iod->start_sector = desc->sector;
651 			return BLK_STS_OK;
652 		default:
653 			return BLK_STS_IOERR;
654 		}
655 	case REQ_OP_DRV_OUT:
656 		/* We do not support drv_out */
657 		return BLK_STS_NOTSUPP;
658 	default:
659 		return BLK_STS_IOERR;
660 	}
661 
662 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
663 	iod->nr_sectors = blk_rq_sectors(req);
664 	iod->start_sector = blk_rq_pos(req);
665 	iod->addr = io->buf.addr;
666 
667 	return BLK_STS_OK;
668 }
669 
670 #else
671 
672 #define ublk_report_zones (NULL)
673 
674 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
675 {
676 	return -EOPNOTSUPP;
677 }
678 
679 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
680 {
681 }
682 
683 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
684 {
685 	return 0;
686 }
687 
688 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
689 					 struct request *req)
690 {
691 	return BLK_STS_NOTSUPP;
692 }
693 
694 #endif
695 
696 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
697 				      bool need_map, struct io_comp_batch *iob);
698 
699 static dev_t ublk_chr_devt;
700 static const struct class ublk_chr_class = {
701 	.name = "ublk-char",
702 };
703 
704 static DEFINE_IDR(ublk_index_idr);
705 static DEFINE_SPINLOCK(ublk_idr_lock);
706 static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
707 
708 static DEFINE_MUTEX(ublk_ctl_mutex);
709 
710 static struct ublk_batch_fetch_cmd *
711 ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
712 {
713 	struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
714 
715 	if (fcmd) {
716 		fcmd->cmd = cmd;
717 		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
718 	}
719 	return fcmd;
720 }
721 
722 static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
723 {
724 	kfree(fcmd);
725 }
726 
727 static void __ublk_release_fcmd(struct ublk_queue *ubq)
728 {
729 	WRITE_ONCE(ubq->active_fcmd, NULL);
730 }
731 
732 /*
733  * Nothing can move on, so clear ->active_fcmd, and the caller should stop
734  * dispatching
735  */
736 static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
737 					const struct ublk_batch_io_data *data,
738 					struct ublk_batch_fetch_cmd *fcmd,
739 					int res)
740 {
741 	spin_lock(&ubq->evts_lock);
742 	list_del_init(&fcmd->node);
743 	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
744 	__ublk_release_fcmd(ubq);
745 	spin_unlock(&ubq->evts_lock);
746 
747 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
748 	ublk_batch_free_fcmd(fcmd);
749 }
750 
751 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
752 				     struct io_br_sel *sel,
753 				     unsigned int issue_flags)
754 {
755 	if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
756 		return -ENOBUFS;
757 	return 0;
758 }
759 
760 static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
761 				       void __user *buf, const u16 *tag_buf,
762 				       unsigned int len)
763 {
764 	if (copy_to_user(buf, tag_buf, len))
765 		return -EFAULT;
766 	return len;
767 }
768 
769 #define UBLK_MAX_UBLKS UBLK_MINORS
770 
771 /*
772  * Max unprivileged ublk devices allowed to add
773  *
774  * It can be extended to one per-user limit in future or even controlled
775  * by cgroup.
776  */
777 static unsigned int unprivileged_ublks_max = 64;
778 static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
779 
780 static struct miscdevice ublk_misc;
781 
782 static inline unsigned ublk_pos_to_hwq(loff_t pos)
783 {
784 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
785 		UBLK_QID_BITS_MASK;
786 }
787 
788 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
789 {
790 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
791 }
792 
793 static inline unsigned ublk_pos_to_tag(loff_t pos)
794 {
795 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
796 		UBLK_TAG_BITS_MASK;
797 }
798 
799 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
800 {
801 	const struct ublk_param_basic *p = &ub->params.basic;
802 
803 	if (p->attrs & UBLK_ATTR_READ_ONLY)
804 		set_disk_ro(ub->ub_disk, true);
805 
806 	set_capacity(ub->ub_disk, p->dev_sectors);
807 }
808 
809 static int ublk_integrity_flags(u32 flags)
810 {
811 	int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
812 
813 	if (flags & LBMD_PI_CAP_INTEGRITY) {
814 		flags &= ~LBMD_PI_CAP_INTEGRITY;
815 		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
816 	}
817 	if (flags & LBMD_PI_CAP_REFTAG) {
818 		flags &= ~LBMD_PI_CAP_REFTAG;
819 		ret_flags |= BLK_INTEGRITY_REF_TAG;
820 	}
821 	return flags ? -EINVAL : ret_flags;
822 }
823 
824 static int ublk_integrity_pi_tuple_size(u8 csum_type)
825 {
826 	switch (csum_type) {
827 	case LBMD_PI_CSUM_NONE:
828 		return 0;
829 	case LBMD_PI_CSUM_IP:
830 	case LBMD_PI_CSUM_CRC16_T10DIF:
831 		return 8;
832 	case LBMD_PI_CSUM_CRC64_NVME:
833 		return 16;
834 	default:
835 		return -EINVAL;
836 	}
837 }
838 
839 static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
840 {
841 	switch (csum_type) {
842 	case LBMD_PI_CSUM_NONE:
843 		return BLK_INTEGRITY_CSUM_NONE;
844 	case LBMD_PI_CSUM_IP:
845 		return BLK_INTEGRITY_CSUM_IP;
846 	case LBMD_PI_CSUM_CRC16_T10DIF:
847 		return BLK_INTEGRITY_CSUM_CRC;
848 	case LBMD_PI_CSUM_CRC64_NVME:
849 		return BLK_INTEGRITY_CSUM_CRC64;
850 	default:
851 		WARN_ON_ONCE(1);
852 		return BLK_INTEGRITY_CSUM_NONE;
853 	}
854 }
855 
856 static int ublk_validate_params(const struct ublk_device *ub)
857 {
858 	/* basic param is the only one which must be set */
859 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
860 		const struct ublk_param_basic *p = &ub->params.basic;
861 
862 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
863 			return -EINVAL;
864 
865 		if (p->logical_bs_shift > p->physical_bs_shift)
866 			return -EINVAL;
867 
868 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
869 			return -EINVAL;
870 
871 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
872 			return -EINVAL;
873 	} else
874 		return -EINVAL;
875 
876 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
877 		const struct ublk_param_discard *p = &ub->params.discard;
878 
879 		/* So far, only support single segment discard */
880 		if (p->max_discard_sectors && p->max_discard_segments != 1)
881 			return -EINVAL;
882 
883 		if (!p->discard_granularity)
884 			return -EINVAL;
885 	}
886 
887 	/* dev_t is read-only */
888 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
889 		return -EINVAL;
890 
891 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
892 		return ublk_dev_param_zoned_validate(ub);
893 	else if (ublk_dev_is_zoned(ub))
894 		return -EINVAL;
895 
896 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
897 		const struct ublk_param_dma_align *p = &ub->params.dma;
898 
899 		if (p->alignment >= PAGE_SIZE)
900 			return -EINVAL;
901 
902 		if (!is_power_of_2(p->alignment + 1))
903 			return -EINVAL;
904 	}
905 
906 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
907 		const struct ublk_param_segment *p = &ub->params.seg;
908 
909 		if (!is_power_of_2(p->seg_boundary_mask + 1))
910 			return -EINVAL;
911 
912 		if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
913 			return -EINVAL;
914 		if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
915 			return -EINVAL;
916 	}
917 
918 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
919 		const struct ublk_param_integrity *p = &ub->params.integrity;
920 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
921 		int flags = ublk_integrity_flags(p->flags);
922 
923 		if (!ublk_dev_support_integrity(ub))
924 			return -EINVAL;
925 		if (flags < 0)
926 			return flags;
927 		if (pi_tuple_size < 0)
928 			return pi_tuple_size;
929 		if (!p->metadata_size)
930 			return -EINVAL;
931 		if (p->csum_type == LBMD_PI_CSUM_NONE &&
932 		    p->flags & LBMD_PI_CAP_REFTAG)
933 			return -EINVAL;
934 		if (p->pi_offset + pi_tuple_size > p->metadata_size)
935 			return -EINVAL;
936 		if (p->interval_exp < SECTOR_SHIFT ||
937 		    p->interval_exp > ub->params.basic.logical_bs_shift)
938 			return -EINVAL;
939 	}
940 
941 	return 0;
942 }
943 
944 static void ublk_apply_params(struct ublk_device *ub)
945 {
946 	ublk_dev_param_basic_apply(ub);
947 
948 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
949 		ublk_dev_param_zoned_apply(ub);
950 }
951 
952 static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
953 {
954 	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
955 		!ublk_support_auto_buf_reg(ubq);
956 }
957 
958 static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
959 {
960 	return !ublk_dev_support_user_copy(ub) &&
961 	       !ublk_dev_support_zero_copy(ub) &&
962 	       !ublk_dev_support_auto_buf_reg(ub);
963 }
964 
965 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
966 {
967 	/*
968 	 * read()/write() is involved in user copy, so request reference
969 	 * has to be grabbed
970 	 *
971 	 * for zero copy, request buffer need to be registered to io_uring
972 	 * buffer table, so reference is needed
973 	 *
974 	 * For auto buffer register, ublk server still may issue
975 	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
976 	 * so reference is required too.
977 	 */
978 	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
979 		ublk_support_auto_buf_reg(ubq);
980 }
981 
982 static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
983 {
984 	return ublk_dev_support_user_copy(ub) ||
985 	       ublk_dev_support_zero_copy(ub) ||
986 	       ublk_dev_support_auto_buf_reg(ub);
987 }
988 
989 /*
990  * ublk IO Reference Counting Design
991  * ==================================
992  *
993  * For user-copy and zero-copy modes, ublk uses a split reference model with
994  * two counters that together track IO lifetime:
995  *
996  *   - io->ref: refcount for off-task buffer registrations and user-copy ops
997  *   - io->task_registered_buffers: count of buffers registered on the IO task
998  *
999  * Key Invariant:
1000  * --------------
1001  * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
1002  * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
1003  * when no active references exist. After IO completion, both counters become
1004  * zero. For I/Os not currently dispatched to the ublk server, both ref and
1005  * task_registered_buffers are 0.
1006  *
1007  * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
1008  * exit to determine if all references have been released.
1009  *
1010  * Why Split Counters:
1011  * -------------------
1012  * Buffers registered on the IO daemon task can use the lightweight
1013  * task_registered_buffers counter (simple increment/decrement) instead of
1014  * atomic refcount operations. The ublk_io_release() callback checks if
1015  * current == io->task to decide which counter to update.
1016  *
1017  * This optimization only applies before IO completion. At completion,
1018  * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
1019  * After that, all subsequent buffer unregistrations must use the atomic ref
1020  * since they may be releasing the last reference.
1021  *
1022  * Reference Lifecycle:
1023  * --------------------
1024  * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
1025  *
1026  * 2. During IO processing:
1027  *    - On-task buffer reg: task_registered_buffers++ (no ref change)
1028  *    - Off-task buffer reg: ref++ via ublk_get_req_ref()
1029  *    - Buffer unregister callback (ublk_io_release):
1030  *      * If on-task: task_registered_buffers--
1031  *      * If off-task: ref-- via ublk_put_req_ref()
1032  *
1033  * 3. ublk_sub_req_ref() at IO completion:
1034  *    - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
1035  *    - Subtracts sub_refs from ref and zeroes task_registered_buffers
1036  *    - This effectively collapses task_registered_buffers into the atomic ref,
1037  *      accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
1038  *      buffers that were already counted
1039  *
1040  * Example (zero-copy, register on-task, unregister off-task):
1041  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1042  *   - Register buffer on-task: task_registered_buffers = 1
1043  *   - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1044  *   - Completion via ublk_sub_req_ref():
1045  *     sub_refs = UBLK_REFCOUNT_INIT - 1,
1046  *     ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
1047  *
1048  * Example (auto buffer registration):
1049  *   Auto buffer registration sets task_registered_buffers = 1 at dispatch.
1050  *
1051  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
1052  *   - Buffer unregister: task_registered_buffers-- (becomes 0)
1053  *   - Completion via ublk_sub_req_ref():
1054  *     sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
1055  *
1056  * Example (zero-copy, ublk server killed):
1057  *   When daemon is killed, io_uring cleanup unregisters buffers off-task.
1058  *   ublk_check_and_reset_active_ref() waits for the invariant to hold.
1059  *
1060  *   - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
1061  *   - Register buffer on-task: task_registered_buffers = 1
1062  *   - Daemon killed, io_uring cleanup unregisters buffer (off-task):
1063  *     ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
1064  *   - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
1065  *   - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by
1066  *     ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed
1067  *     and abort pending requests
1068  *
1069  * Batch IO Special Case:
1070  * ----------------------
1071  * In batch IO mode, io->task is NULL. This means ublk_io_release() always
1072  * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
1073  * task_registered_buffers counter still tracks registered buffers for the
1074  * invariant check, even though the callback doesn't decrement it.
1075  *
1076  * Note: updating task_registered_buffers is protected by io->lock.
1077  */
1078 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
1079 		struct ublk_io *io)
1080 {
1081 	if (ublk_need_req_ref(ubq))
1082 		refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
1083 }
1084 
1085 static inline bool ublk_get_req_ref(struct ublk_io *io)
1086 {
1087 	return refcount_inc_not_zero(&io->ref);
1088 }
1089 
1090 static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
1091 {
1092 	if (!refcount_dec_and_test(&io->ref))
1093 		return;
1094 
1095 	/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
1096 	__ublk_complete_rq(req, io, false, NULL);
1097 }
1098 
1099 static inline bool ublk_sub_req_ref(struct ublk_io *io)
1100 {
1101 	unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
1102 
1103 	io->task_registered_buffers = 0;
1104 	return refcount_sub_and_test(sub_refs, &io->ref);
1105 }
1106 
1107 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
1108 {
1109 	return ubq->flags & UBLK_F_NEED_GET_DATA;
1110 }
1111 
1112 static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
1113 {
1114 	return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
1115 }
1116 
1117 /* Called in slow path only, keep it noinline for trace purpose */
1118 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
1119 {
1120 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
1121 		return ub;
1122 	return NULL;
1123 }
1124 
1125 /* Called in slow path only, keep it noinline for trace purpose */
1126 static noinline void ublk_put_device(struct ublk_device *ub)
1127 {
1128 	put_device(&ub->cdev_dev);
1129 }
1130 
1131 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
1132 		int qid)
1133 {
1134 	return dev->queues[qid];
1135 }
1136 
1137 static inline bool ublk_rq_has_data(const struct request *rq)
1138 {
1139 	return bio_has_data(rq->bio);
1140 }
1141 
1142 static inline struct ublksrv_io_desc *
1143 ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
1144 {
1145 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
1146 }
1147 
1148 static inline int __ublk_queue_cmd_buf_size(int depth)
1149 {
1150 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
1151 }
1152 
1153 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
1154 {
1155 	return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
1156 }
1157 
1158 static int ublk_max_cmd_buf_size(void)
1159 {
1160 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
1161 }
1162 
1163 /*
1164  * Should I/O outstanding to the ublk server when it exits be reissued?
1165  * If not, outstanding I/O will get errors.
1166  */
1167 static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
1168 {
1169 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1170 	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
1171 }
1172 
1173 /*
1174  * Should I/O issued while there is no ublk server queue? If not, I/O
1175  * issued while there is no ublk server will get errors.
1176  */
1177 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
1178 {
1179 	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
1180 	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1181 }
1182 
1183 /*
1184  * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
1185  * of the device flags for smaller cache footprint - better for fast
1186  * paths.
1187  */
1188 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
1189 {
1190 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
1191 	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
1192 }
1193 
1194 /*
1195  * Should ublk devices be stopped (i.e. no recovery possible) when the
1196  * ublk server exits? If not, devices can be used again by a future
1197  * incarnation of a ublk server via the start_recovery/end_recovery
1198  * commands.
1199  */
1200 static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
1201 {
1202 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
1203 }
1204 
1205 static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
1206 {
1207 	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
1208 	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
1209 }
1210 
1211 static void ublk_free_disk(struct gendisk *disk)
1212 {
1213 	struct ublk_device *ub = disk->private_data;
1214 
1215 	clear_bit(UB_STATE_USED, &ub->state);
1216 	ublk_put_device(ub);
1217 }
1218 
1219 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
1220 		unsigned int *owner_gid)
1221 {
1222 	kuid_t uid;
1223 	kgid_t gid;
1224 
1225 	current_uid_gid(&uid, &gid);
1226 
1227 	*owner_uid = from_kuid(&init_user_ns, uid);
1228 	*owner_gid = from_kgid(&init_user_ns, gid);
1229 }
1230 
1231 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
1232 {
1233 	struct ublk_device *ub = disk->private_data;
1234 
1235 	if (capable(CAP_SYS_ADMIN))
1236 		return 0;
1237 
1238 	/*
1239 	 * If it is one unprivileged device, only owner can open
1240 	 * the disk. Otherwise it could be one trap made by one
1241 	 * evil user who grants this disk's privileges to other
1242 	 * users deliberately.
1243 	 *
1244 	 * This way is reasonable too given anyone can create
1245 	 * unprivileged device, and no need other's grant.
1246 	 */
1247 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
1248 		unsigned int curr_uid, curr_gid;
1249 
1250 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
1251 
1252 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
1253 				ub->dev_info.owner_gid)
1254 			return -EPERM;
1255 	}
1256 
1257 	if (ub->block_open)
1258 		return -ENXIO;
1259 
1260 	return 0;
1261 }
1262 
1263 static const struct block_device_operations ub_fops = {
1264 	.owner =	THIS_MODULE,
1265 	.open =		ublk_open,
1266 	.free_disk =	ublk_free_disk,
1267 	.report_zones =	ublk_report_zones,
1268 };
1269 
1270 static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
1271 				struct iov_iter *uiter, int dir, size_t *done)
1272 {
1273 	unsigned len;
1274 	void *bv_buf;
1275 	size_t copied;
1276 
1277 	if (*offset >= bv->bv_len) {
1278 		*offset -= bv->bv_len;
1279 		return true;
1280 	}
1281 
1282 	len = bv->bv_len - *offset;
1283 	bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
1284 	if (dir == ITER_DEST)
1285 		copied = copy_to_iter(bv_buf, len, uiter);
1286 	else
1287 		copied = copy_from_iter(bv_buf, len, uiter);
1288 
1289 	kunmap_local(bv_buf);
1290 
1291 	*done += copied;
1292 	if (copied < len)
1293 		return false;
1294 
1295 	*offset = 0;
1296 	return true;
1297 }
1298 
1299 /*
1300  * Copy data between request pages and io_iter, and 'offset'
1301  * is the start point of linear offset of request.
1302  */
1303 static size_t ublk_copy_user_pages(const struct request *req,
1304 		unsigned offset, struct iov_iter *uiter, int dir)
1305 {
1306 	struct req_iterator iter;
1307 	struct bio_vec bv;
1308 	size_t done = 0;
1309 
1310 	rq_for_each_segment(bv, req, iter) {
1311 		if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
1312 			break;
1313 	}
1314 	return done;
1315 }
1316 
1317 #ifdef CONFIG_BLK_DEV_INTEGRITY
1318 static size_t ublk_copy_user_integrity(const struct request *req,
1319 		unsigned offset, struct iov_iter *uiter, int dir)
1320 {
1321 	size_t done = 0;
1322 	struct bio *bio = req->bio;
1323 	struct bvec_iter iter;
1324 	struct bio_vec iv;
1325 
1326 	if (!blk_integrity_rq(req))
1327 		return 0;
1328 
1329 	bio_for_each_integrity_vec(iv, bio, iter) {
1330 		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
1331 			break;
1332 	}
1333 
1334 	return done;
1335 }
1336 #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1337 static size_t ublk_copy_user_integrity(const struct request *req,
1338 		unsigned offset, struct iov_iter *uiter, int dir)
1339 {
1340 	return 0;
1341 }
1342 #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
1343 
1344 static inline bool ublk_need_map_req(const struct request *req)
1345 {
1346 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1347 }
1348 
1349 static inline bool ublk_need_unmap_req(const struct request *req)
1350 {
1351 	return ublk_rq_has_data(req) &&
1352 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1353 }
1354 
1355 static unsigned int ublk_map_io(const struct ublk_queue *ubq,
1356 				const struct request *req,
1357 				const struct ublk_io *io)
1358 {
1359 	const unsigned int rq_bytes = blk_rq_bytes(req);
1360 
1361 	if (!ublk_need_map_io(ubq))
1362 		return rq_bytes;
1363 
1364 	/*
1365 	 * no zero copy, we delay copy WRITE request data into ublksrv
1366 	 * context and the big benefit is that pinning pages in current
1367 	 * context is pretty fast, see ublk_pin_user_pages
1368 	 */
1369 	if (ublk_need_map_req(req)) {
1370 		struct iov_iter iter;
1371 		const int dir = ITER_DEST;
1372 
1373 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
1374 		return ublk_copy_user_pages(req, 0, &iter, dir);
1375 	}
1376 	return rq_bytes;
1377 }
1378 
1379 static unsigned int ublk_unmap_io(bool need_map,
1380 		const struct request *req,
1381 		const struct ublk_io *io)
1382 {
1383 	const unsigned int rq_bytes = blk_rq_bytes(req);
1384 
1385 	if (!need_map)
1386 		return rq_bytes;
1387 
1388 	if (ublk_need_unmap_req(req)) {
1389 		struct iov_iter iter;
1390 		const int dir = ITER_SOURCE;
1391 
1392 		WARN_ON_ONCE(io->res > rq_bytes);
1393 
1394 		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
1395 		return ublk_copy_user_pages(req, 0, &iter, dir);
1396 	}
1397 	return rq_bytes;
1398 }
1399 
1400 static inline unsigned int ublk_req_build_flags(struct request *req)
1401 {
1402 	unsigned flags = 0;
1403 
1404 	if (req->cmd_flags & REQ_FAILFAST_DEV)
1405 		flags |= UBLK_IO_F_FAILFAST_DEV;
1406 
1407 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1408 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1409 
1410 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1411 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
1412 
1413 	if (req->cmd_flags & REQ_META)
1414 		flags |= UBLK_IO_F_META;
1415 
1416 	if (req->cmd_flags & REQ_FUA)
1417 		flags |= UBLK_IO_F_FUA;
1418 
1419 	if (req->cmd_flags & REQ_NOUNMAP)
1420 		flags |= UBLK_IO_F_NOUNMAP;
1421 
1422 	if (req->cmd_flags & REQ_SWAP)
1423 		flags |= UBLK_IO_F_SWAP;
1424 
1425 	if (blk_integrity_rq(req))
1426 		flags |= UBLK_IO_F_INTEGRITY;
1427 
1428 	return flags;
1429 }
1430 
1431 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1432 {
1433 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1434 	struct ublk_io *io = &ubq->ios[req->tag];
1435 	u32 ublk_op;
1436 
1437 	switch (req_op(req)) {
1438 	case REQ_OP_READ:
1439 		ublk_op = UBLK_IO_OP_READ;
1440 		break;
1441 	case REQ_OP_WRITE:
1442 		ublk_op = UBLK_IO_OP_WRITE;
1443 		break;
1444 	case REQ_OP_FLUSH:
1445 		ublk_op = UBLK_IO_OP_FLUSH;
1446 		break;
1447 	case REQ_OP_DISCARD:
1448 		ublk_op = UBLK_IO_OP_DISCARD;
1449 		break;
1450 	case REQ_OP_WRITE_ZEROES:
1451 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1452 		break;
1453 	default:
1454 		if (ublk_queue_is_zoned(ubq))
1455 			return ublk_setup_iod_zoned(ubq, req);
1456 		return BLK_STS_IOERR;
1457 	}
1458 
1459 	/* need to translate since kernel may change */
1460 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
1461 	iod->nr_sectors = blk_rq_sectors(req);
1462 	iod->start_sector = blk_rq_pos(req);
1463 	iod->addr = io->buf.addr;
1464 
1465 	return BLK_STS_OK;
1466 }
1467 
1468 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1469 		struct io_uring_cmd *ioucmd)
1470 {
1471 	return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1472 }
1473 
1474 static void ublk_end_request(struct request *req, blk_status_t error)
1475 {
1476 	local_bh_disable();
1477 	blk_mq_end_request(req, error);
1478 	local_bh_enable();
1479 }
1480 
1481 /* todo: handle partial completion */
1482 static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1483 				      bool need_map, struct io_comp_batch *iob)
1484 {
1485 	unsigned int unmapped_bytes;
1486 	blk_status_t res = BLK_STS_OK;
1487 	bool requeue;
1488 
1489 	/* failed read IO if nothing is read */
1490 	if (!io->res && req_op(req) == REQ_OP_READ)
1491 		io->res = -EIO;
1492 
1493 	if (io->res < 0) {
1494 		res = errno_to_blk_status(io->res);
1495 		goto exit;
1496 	}
1497 
1498 	/*
1499 	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them
1500 	 * directly.
1501 	 *
1502 	 * Both the two needn't unmap.
1503 	 */
1504 	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1505 	    req_op(req) != REQ_OP_DRV_IN)
1506 		goto exit;
1507 
1508 	/* for READ request, writing data in iod->addr to rq buffers */
1509 	unmapped_bytes = ublk_unmap_io(need_map, req, io);
1510 
1511 	/*
1512 	 * Extremely impossible since we got data filled in just before
1513 	 *
1514 	 * Re-read simply for this unlikely case.
1515 	 */
1516 	if (unlikely(unmapped_bytes < io->res))
1517 		io->res = unmapped_bytes;
1518 
1519 	/*
1520 	 * Run bio->bi_end_io() with softirqs disabled. If the final fput
1521 	 * happens off this path, then that will prevent ublk's blkdev_release()
1522 	 * from being called on current's task work, see fput() implementation.
1523 	 *
1524 	 * Otherwise, ublk server may not provide forward progress in case of
1525 	 * reading the partition table from bdev_open() with disk->open_mutex
1526 	 * held, and causes dead lock as we could already be holding
1527 	 * disk->open_mutex here.
1528 	 *
1529 	 * Preferably we would not be doing IO with a mutex held that is also
1530 	 * used for release, but this work-around will suffice for now.
1531 	 */
1532 	local_bh_disable();
1533 	requeue = blk_update_request(req, BLK_STS_OK, io->res);
1534 	local_bh_enable();
1535 	if (requeue)
1536 		blk_mq_requeue_request(req, true);
1537 	else if (likely(!blk_should_fake_timeout(req->q))) {
1538 		if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
1539 			return;
1540 		__blk_mq_end_request(req, BLK_STS_OK);
1541 	}
1542 
1543 	return;
1544 exit:
1545 	ublk_end_request(req, res);
1546 }
1547 
1548 static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1549 						     struct request *req)
1550 {
1551 	/* read cmd first because req will overwrite it */
1552 	struct io_uring_cmd *cmd = io->cmd;
1553 
1554 	/* mark this cmd owned by ublksrv */
1555 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1556 
1557 	/*
1558 	 * clear ACTIVE since we are done with this sqe/cmd slot
1559 	 * We can only accept io cmd in case of being not active.
1560 	 */
1561 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1562 
1563 	io->req = req;
1564 	return cmd;
1565 }
1566 
1567 static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1568 				 int res, unsigned issue_flags)
1569 {
1570 	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1571 
1572 	/* tell ublksrv one io request is coming */
1573 	io_uring_cmd_done(cmd, res, issue_flags);
1574 }
1575 
1576 #define UBLK_REQUEUE_DELAY_MS	3
1577 
1578 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1579 		struct request *rq)
1580 {
1581 	/* We cannot process this rq so just requeue it. */
1582 	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1583 		blk_mq_requeue_request(rq, false);
1584 	else
1585 		ublk_end_request(rq, BLK_STS_IOERR);
1586 }
1587 
1588 static void
1589 ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
1590 {
1591 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1592 
1593 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1594 }
1595 
1596 enum auto_buf_reg_res {
1597 	AUTO_BUF_REG_FAIL,
1598 	AUTO_BUF_REG_FALLBACK,
1599 	AUTO_BUF_REG_OK,
1600 };
1601 
1602 /*
1603  * Setup io state after auto buffer registration.
1604  *
1605  * Must be called after ublk_auto_buf_register() is done.
1606  * Caller must hold io->lock in batch context.
1607  */
1608 static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
1609 				   struct request *req, struct ublk_io *io,
1610 				   struct io_uring_cmd *cmd,
1611 				   enum auto_buf_reg_res res)
1612 {
1613 	if (res == AUTO_BUF_REG_OK) {
1614 		io->task_registered_buffers = 1;
1615 		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
1616 		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1617 	}
1618 	ublk_init_req_ref(ubq, io);
1619 	__ublk_prep_compl_io_cmd(io, req);
1620 }
1621 
1622 /* Register request bvec to io_uring for auto buffer registration. */
1623 static enum auto_buf_reg_res
1624 ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
1625 		       struct ublk_io *io, struct io_uring_cmd *cmd,
1626 		       unsigned int issue_flags)
1627 {
1628 	int ret;
1629 
1630 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
1631 				      io->buf.auto_reg.index, issue_flags);
1632 	if (ret) {
1633 		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1634 			ublk_auto_buf_reg_fallback(ubq, req->tag);
1635 			return AUTO_BUF_REG_FALLBACK;
1636 		}
1637 		ublk_end_request(req, BLK_STS_IOERR);
1638 		return AUTO_BUF_REG_FAIL;
1639 	}
1640 
1641 	return AUTO_BUF_REG_OK;
1642 }
1643 
1644 /*
1645  * Dispatch IO to userspace with auto buffer registration.
1646  *
1647  * Only called in non-batch context from task work, io->lock not held.
1648  */
1649 static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
1650 				   struct request *req, struct ublk_io *io,
1651 				   struct io_uring_cmd *cmd,
1652 				   unsigned int issue_flags)
1653 {
1654 	enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
1655 			issue_flags);
1656 
1657 	if (res != AUTO_BUF_REG_FAIL) {
1658 		ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1659 		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
1660 	}
1661 }
1662 
1663 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1664 			  struct ublk_io *io)
1665 {
1666 	unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1667 
1668 	/* partially mapped, update io descriptor */
1669 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1670 		/*
1671 		 * Nothing mapped, retry until we succeed.
1672 		 *
1673 		 * We may never succeed in mapping any bytes here because
1674 		 * of OOM. TODO: reserve one buffer with single page pinned
1675 		 * for providing forward progress guarantee.
1676 		 */
1677 		if (unlikely(!mapped_bytes)) {
1678 			blk_mq_requeue_request(req, false);
1679 			blk_mq_delay_kick_requeue_list(req->q,
1680 					UBLK_REQUEUE_DELAY_MS);
1681 			return false;
1682 		}
1683 
1684 		ublk_get_iod(ubq, req->tag)->nr_sectors =
1685 			mapped_bytes >> 9;
1686 	}
1687 
1688 	return true;
1689 }
1690 
1691 static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
1692 {
1693 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1694 	int tag = req->tag;
1695 	struct ublk_io *io = &ubq->ios[tag];
1696 
1697 	pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1698 			__func__, ubq->q_id, req->tag, io->flags,
1699 			ublk_get_iod(ubq, req->tag)->addr);
1700 
1701 	/*
1702 	 * Task is exiting if either:
1703 	 *
1704 	 * (1) current != io->task.
1705 	 * io_uring_cmd_complete_in_task() tries to run task_work
1706 	 * in a workqueue if cmd's task is PF_EXITING.
1707 	 *
1708 	 * (2) current->flags & PF_EXITING.
1709 	 */
1710 	if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1711 		__ublk_abort_rq(ubq, req);
1712 		return;
1713 	}
1714 
1715 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1716 		/*
1717 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1718 		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1719 		 * and notify it.
1720 		 */
1721 		io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1722 		pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1723 				__func__, ubq->q_id, req->tag, io->flags);
1724 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1725 				     issue_flags);
1726 		return;
1727 	}
1728 
1729 	if (!ublk_start_io(ubq, req, io))
1730 		return;
1731 
1732 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1733 		ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
1734 	} else {
1735 		ublk_init_req_ref(ubq, io);
1736 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1737 	}
1738 }
1739 
1740 static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1741 				       const struct ublk_batch_io_data *data,
1742 				       unsigned short tag)
1743 {
1744 	struct ublk_device *ub = data->ub;
1745 	struct ublk_io *io = &ubq->ios[tag];
1746 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1747 	enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
1748 	struct io_uring_cmd *cmd = data->cmd;
1749 
1750 	if (!ublk_start_io(ubq, req, io))
1751 		return false;
1752 
1753 	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
1754 		res = ublk_auto_buf_register(ubq, req, io, cmd,
1755 				data->issue_flags);
1756 
1757 		if (res == AUTO_BUF_REG_FAIL)
1758 			return false;
1759 	}
1760 
1761 	ublk_io_lock(io);
1762 	ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
1763 	ublk_io_unlock(io);
1764 
1765 	return true;
1766 }
1767 
1768 static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
1769 				     const struct ublk_batch_io_data *data,
1770 				     unsigned short *tag_buf,
1771 				     unsigned int len)
1772 {
1773 	bool has_unused = false;
1774 	unsigned int i;
1775 
1776 	for (i = 0; i < len; i++) {
1777 		unsigned short tag = tag_buf[i];
1778 
1779 		if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
1780 			tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
1781 			has_unused = true;
1782 		}
1783 	}
1784 
1785 	return has_unused;
1786 }
1787 
1788 /*
1789  * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
1790  * Returns the new length after filtering.
1791  */
1792 static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
1793 					    unsigned int len)
1794 {
1795 	unsigned int i, j;
1796 
1797 	for (i = 0, j = 0; i < len; i++) {
1798 		if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
1799 			if (i != j)
1800 				tag_buf[j] = tag_buf[i];
1801 			j++;
1802 		}
1803 	}
1804 
1805 	return j;
1806 }
1807 
1808 static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
1809 		const struct ublk_batch_io_data *data,
1810 		unsigned short *tag_buf, size_t len, int ret)
1811 {
1812 	int i, res;
1813 
1814 	/*
1815 	 * Undo prep state for all IOs since userspace never received them.
1816 	 * This restores IOs to pre-prepared state so they can be cleanly
1817 	 * re-prepared when tags are pulled from FIFO again.
1818 	 */
1819 	for (i = 0; i < len; i++) {
1820 		struct ublk_io *io = &ubq->ios[tag_buf[i]];
1821 		int index = -1;
1822 
1823 		ublk_io_lock(io);
1824 		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
1825 			index = io->buf.auto_reg.index;
1826 		io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
1827 		io->flags |= UBLK_IO_FLAG_ACTIVE;
1828 		ublk_io_unlock(io);
1829 
1830 		if (index != -1)
1831 			io_buffer_unregister_bvec(data->cmd, index,
1832 					data->issue_flags);
1833 	}
1834 
1835 	res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
1836 		tag_buf, len, &ubq->evts_lock);
1837 
1838 	pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
1839 			"tags(%d %zu) ret %d\n", __func__, res, len,
1840 			ret);
1841 }
1842 
1843 #define MAX_NR_TAG 128
1844 static int __ublk_batch_dispatch(struct ublk_queue *ubq,
1845 				 const struct ublk_batch_io_data *data,
1846 				 struct ublk_batch_fetch_cmd *fcmd)
1847 {
1848 	const unsigned int tag_sz = sizeof(unsigned short);
1849 	unsigned short tag_buf[MAX_NR_TAG];
1850 	struct io_br_sel sel;
1851 	size_t len = 0;
1852 	bool needs_filter;
1853 	int ret;
1854 
1855 	WARN_ON_ONCE(data->cmd != fcmd->cmd);
1856 
1857 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
1858 					 data->issue_flags);
1859 	if (sel.val < 0)
1860 		return sel.val;
1861 	if (!sel.addr)
1862 		return -ENOBUFS;
1863 
1864 	/* single reader needn't lock and sizeof(kfifo element) is 2 bytes */
1865 	len = min(len, sizeof(tag_buf)) / tag_sz;
1866 	len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
1867 
1868 	needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
1869 	/* Filter out unused tags before posting to userspace */
1870 	if (unlikely(needs_filter)) {
1871 		int new_len = ublk_filter_unused_tags(tag_buf, len);
1872 
1873 		/* return actual length if all are failed or requeued */
1874 		if (!new_len) {
1875 			/* release the selected buffer */
1876 			sel.val = 0;
1877 			WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
1878 						&sel, data->issue_flags));
1879 			return len;
1880 		}
1881 		len = new_len;
1882 	}
1883 
1884 	sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
1885 	ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
1886 	if (unlikely(ret < 0))
1887 		ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
1888 	return ret;
1889 }
1890 
1891 static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
1892 		struct ublk_queue *ubq)
1893 {
1894 	struct ublk_batch_fetch_cmd *fcmd;
1895 
1896 	lockdep_assert_held(&ubq->evts_lock);
1897 
1898 	/*
1899 	 * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd.
1900 	 *
1901 	 * The pair is the smp_mb() in ublk_batch_dispatch().
1902 	 *
1903 	 * If ubq->active_fcmd is observed as non-NULL, the new added tags
1904 	 * can be visisible in ublk_batch_dispatch() with the barrier pairing.
1905 	 */
1906 	smp_mb();
1907 	if (READ_ONCE(ubq->active_fcmd)) {
1908 		fcmd = NULL;
1909 	} else {
1910 		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
1911 				struct ublk_batch_fetch_cmd, node);
1912 		WRITE_ONCE(ubq->active_fcmd, fcmd);
1913 	}
1914 	return fcmd;
1915 }
1916 
1917 static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1918 {
1919 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
1920 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1921 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1922 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
1923 	struct ublk_batch_io_data data = {
1924 		.ub = pdu->ubq->dev,
1925 		.cmd = fcmd->cmd,
1926 		.issue_flags = issue_flags,
1927 	};
1928 
1929 	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
1930 
1931 	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
1932 }
1933 
1934 static void
1935 ublk_batch_dispatch(struct ublk_queue *ubq,
1936 		    const struct ublk_batch_io_data *data,
1937 		    struct ublk_batch_fetch_cmd *fcmd)
1938 {
1939 	struct ublk_batch_fetch_cmd *new_fcmd;
1940 	unsigned tried = 0;
1941 	int ret = 0;
1942 
1943 again:
1944 	while (!ublk_io_evts_empty(ubq)) {
1945 		ret = __ublk_batch_dispatch(ubq, data, fcmd);
1946 		if (ret <= 0)
1947 			break;
1948 	}
1949 
1950 	if (ret < 0) {
1951 		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
1952 		return;
1953 	}
1954 
1955 	__ublk_release_fcmd(ubq);
1956 	/*
1957 	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
1958 	 * checking ubq->evts_fifo.
1959 	 *
1960 	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
1961 	 */
1962 	smp_mb();
1963 	if (likely(ublk_io_evts_empty(ubq)))
1964 		return;
1965 
1966 	spin_lock(&ubq->evts_lock);
1967 	new_fcmd = __ublk_acquire_fcmd(ubq);
1968 	spin_unlock(&ubq->evts_lock);
1969 
1970 	if (!new_fcmd)
1971 		return;
1972 
1973 	/* Avoid lockup by allowing to handle at most 32 batches */
1974 	if (new_fcmd == fcmd && tried++ < 32)
1975 		goto again;
1976 
1977 	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
1978 }
1979 
1980 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
1981 {
1982 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
1983 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1984 	struct ublk_queue *ubq = pdu->ubq;
1985 
1986 	ublk_dispatch_req(ubq, pdu->req);
1987 }
1988 
1989 static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
1990 {
1991 	unsigned short tag = rq->tag;
1992 	struct ublk_batch_fetch_cmd *fcmd = NULL;
1993 
1994 	spin_lock(&ubq->evts_lock);
1995 	kfifo_put(&ubq->evts_fifo, tag);
1996 	if (last)
1997 		fcmd = __ublk_acquire_fcmd(ubq);
1998 	spin_unlock(&ubq->evts_lock);
1999 
2000 	if (fcmd)
2001 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2002 }
2003 
2004 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
2005 {
2006 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
2007 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2008 
2009 	pdu->req = rq;
2010 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
2011 }
2012 
2013 static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
2014 {
2015 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
2016 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2017 	struct request *rq = pdu->req_list;
2018 	struct request *next;
2019 
2020 	do {
2021 		next = rq->rq_next;
2022 		rq->rq_next = NULL;
2023 		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
2024 		rq = next;
2025 	} while (rq);
2026 }
2027 
2028 static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
2029 {
2030 	struct io_uring_cmd *cmd = io->cmd;
2031 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2032 
2033 	pdu->req_list = rq_list_peek(l);
2034 	rq_list_init(l);
2035 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
2036 }
2037 
2038 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
2039 {
2040 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2041 	pid_t tgid = ubq->dev->ublksrv_tgid;
2042 	struct task_struct *p;
2043 	struct pid *pid;
2044 
2045 	if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
2046 		return BLK_EH_RESET_TIMER;
2047 
2048 	if (unlikely(!tgid))
2049 		return BLK_EH_RESET_TIMER;
2050 
2051 	rcu_read_lock();
2052 	pid = find_vpid(tgid);
2053 	p = pid_task(pid, PIDTYPE_PID);
2054 	if (p)
2055 		send_sig(SIGKILL, p, 0);
2056 	rcu_read_unlock();
2057 	return BLK_EH_DONE;
2058 }
2059 
2060 static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
2061 				  bool check_cancel)
2062 {
2063 	blk_status_t res;
2064 
2065 	if (unlikely(READ_ONCE(ubq->fail_io)))
2066 		return BLK_STS_TARGET;
2067 
2068 	/* With recovery feature enabled, force_abort is set in
2069 	 * ublk_stop_dev() before calling del_gendisk(). We have to
2070 	 * abort all requeued and new rqs here to let del_gendisk()
2071 	 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
2072 	 * to avoid UAF on io_uring ctx.
2073 	 *
2074 	 * Note: force_abort is guaranteed to be seen because it is set
2075 	 * before request queue is unqiuesced.
2076 	 */
2077 	if (ublk_nosrv_should_queue_io(ubq) &&
2078 	    unlikely(READ_ONCE(ubq->force_abort)))
2079 		return BLK_STS_IOERR;
2080 
2081 	if (check_cancel && unlikely(ubq->canceling))
2082 		return BLK_STS_IOERR;
2083 
2084 	/* fill iod to slot in io cmd buffer */
2085 	res = ublk_setup_iod(ubq, rq);
2086 	if (unlikely(res != BLK_STS_OK))
2087 		return BLK_STS_IOERR;
2088 
2089 	blk_mq_start_request(rq);
2090 	return BLK_STS_OK;
2091 }
2092 
2093 /*
2094  * Common helper for queue_rq that handles request preparation and
2095  * cancellation checks. Returns status and sets should_queue to indicate
2096  * whether the caller should proceed with queuing the request.
2097  */
2098 static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
2099 						   struct request *rq,
2100 						   bool *should_queue)
2101 {
2102 	blk_status_t res;
2103 
2104 	res = ublk_prep_req(ubq, rq, false);
2105 	if (res != BLK_STS_OK) {
2106 		*should_queue = false;
2107 		return res;
2108 	}
2109 
2110 	/*
2111 	 * ->canceling has to be handled after ->force_abort and ->fail_io
2112 	 * is dealt with, otherwise this request may not be failed in case
2113 	 * of recovery, and cause hang when deleting disk
2114 	 */
2115 	if (unlikely(ubq->canceling)) {
2116 		*should_queue = false;
2117 		__ublk_abort_rq(ubq, rq);
2118 		return BLK_STS_OK;
2119 	}
2120 
2121 	*should_queue = true;
2122 	return BLK_STS_OK;
2123 }
2124 
2125 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
2126 		const struct blk_mq_queue_data *bd)
2127 {
2128 	struct ublk_queue *ubq = hctx->driver_data;
2129 	struct request *rq = bd->rq;
2130 	bool should_queue;
2131 	blk_status_t res;
2132 
2133 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2134 	if (!should_queue)
2135 		return res;
2136 
2137 	ublk_queue_cmd(ubq, rq);
2138 	return BLK_STS_OK;
2139 }
2140 
2141 static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
2142 		const struct blk_mq_queue_data *bd)
2143 {
2144 	struct ublk_queue *ubq = hctx->driver_data;
2145 	struct request *rq = bd->rq;
2146 	bool should_queue;
2147 	blk_status_t res;
2148 
2149 	res = __ublk_queue_rq_common(ubq, rq, &should_queue);
2150 	if (!should_queue)
2151 		return res;
2152 
2153 	ublk_batch_queue_cmd(ubq, rq, bd->last);
2154 	return BLK_STS_OK;
2155 }
2156 
2157 static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
2158 					     const struct ublk_io *io2)
2159 {
2160 	return (io_uring_cmd_ctx_handle(io->cmd) ==
2161 		io_uring_cmd_ctx_handle(io2->cmd)) &&
2162 		(io->task == io2->task);
2163 }
2164 
2165 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
2166 {
2167 	struct ublk_queue *ubq = hctx->driver_data;
2168 	struct ublk_batch_fetch_cmd *fcmd;
2169 
2170 	spin_lock(&ubq->evts_lock);
2171 	fcmd = __ublk_acquire_fcmd(ubq);
2172 	spin_unlock(&ubq->evts_lock);
2173 
2174 	if (fcmd)
2175 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2176 }
2177 
2178 static void ublk_queue_rqs(struct rq_list *rqlist)
2179 {
2180 	struct rq_list requeue_list = { };
2181 	struct rq_list submit_list = { };
2182 	struct ublk_io *io = NULL;
2183 	struct request *req;
2184 
2185 	while ((req = rq_list_pop(rqlist))) {
2186 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2187 		struct ublk_io *this_io = &this_q->ios[req->tag];
2188 
2189 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2190 			rq_list_add_tail(&requeue_list, req);
2191 			continue;
2192 		}
2193 
2194 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
2195 				!rq_list_empty(&submit_list))
2196 			ublk_queue_cmd_list(io, &submit_list);
2197 		io = this_io;
2198 		rq_list_add_tail(&submit_list, req);
2199 	}
2200 
2201 	if (!rq_list_empty(&submit_list))
2202 		ublk_queue_cmd_list(io, &submit_list);
2203 	*rqlist = requeue_list;
2204 }
2205 
2206 static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
2207 {
2208 	unsigned short tags[MAX_NR_TAG];
2209 	struct ublk_batch_fetch_cmd *fcmd;
2210 	struct request *rq;
2211 	unsigned cnt = 0;
2212 
2213 	spin_lock(&ubq->evts_lock);
2214 	rq_list_for_each(l, rq) {
2215 		tags[cnt++] = (unsigned short)rq->tag;
2216 		if (cnt >= MAX_NR_TAG) {
2217 			kfifo_in(&ubq->evts_fifo, tags, cnt);
2218 			cnt = 0;
2219 		}
2220 	}
2221 	if (cnt)
2222 		kfifo_in(&ubq->evts_fifo, tags, cnt);
2223 	fcmd = __ublk_acquire_fcmd(ubq);
2224 	spin_unlock(&ubq->evts_lock);
2225 
2226 	rq_list_init(l);
2227 	if (fcmd)
2228 		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
2229 }
2230 
2231 static void ublk_batch_queue_rqs(struct rq_list *rqlist)
2232 {
2233 	struct rq_list requeue_list = { };
2234 	struct rq_list submit_list = { };
2235 	struct ublk_queue *ubq = NULL;
2236 	struct request *req;
2237 
2238 	while ((req = rq_list_pop(rqlist))) {
2239 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
2240 
2241 		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
2242 			rq_list_add_tail(&requeue_list, req);
2243 			continue;
2244 		}
2245 
2246 		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
2247 			ublk_batch_queue_cmd_list(ubq, &submit_list);
2248 		ubq = this_q;
2249 		rq_list_add_tail(&submit_list, req);
2250 	}
2251 
2252 	if (!rq_list_empty(&submit_list))
2253 		ublk_batch_queue_cmd_list(ubq, &submit_list);
2254 	*rqlist = requeue_list;
2255 }
2256 
2257 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
2258 		unsigned int hctx_idx)
2259 {
2260 	struct ublk_device *ub = driver_data;
2261 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
2262 
2263 	hctx->driver_data = ubq;
2264 	return 0;
2265 }
2266 
2267 static const struct blk_mq_ops ublk_mq_ops = {
2268 	.queue_rq       = ublk_queue_rq,
2269 	.queue_rqs      = ublk_queue_rqs,
2270 	.init_hctx	= ublk_init_hctx,
2271 	.timeout	= ublk_timeout,
2272 };
2273 
2274 static const struct blk_mq_ops ublk_batch_mq_ops = {
2275 	.commit_rqs	= ublk_commit_rqs,
2276 	.queue_rq       = ublk_batch_queue_rq,
2277 	.queue_rqs      = ublk_batch_queue_rqs,
2278 	.init_hctx	= ublk_init_hctx,
2279 	.timeout	= ublk_timeout,
2280 };
2281 
2282 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2283 {
2284 	int i;
2285 
2286 	ubq->nr_io_ready = 0;
2287 
2288 	for (i = 0; i < ubq->q_depth; i++) {
2289 		struct ublk_io *io = &ubq->ios[i];
2290 
2291 		/*
2292 		 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
2293 		 * io->cmd
2294 		 */
2295 		io->flags &= UBLK_IO_FLAG_CANCELED;
2296 		io->cmd = NULL;
2297 		io->buf.addr = 0;
2298 
2299 		/*
2300 		 * old task is PF_EXITING, put it now
2301 		 *
2302 		 * It could be NULL in case of closing one quiesced
2303 		 * device.
2304 		 */
2305 		if (io->task) {
2306 			put_task_struct(io->task);
2307 			io->task = NULL;
2308 		}
2309 
2310 		WARN_ON_ONCE(refcount_read(&io->ref));
2311 		WARN_ON_ONCE(io->task_registered_buffers);
2312 	}
2313 }
2314 
2315 static int ublk_ch_open(struct inode *inode, struct file *filp)
2316 {
2317 	struct ublk_device *ub = container_of(inode->i_cdev,
2318 			struct ublk_device, cdev);
2319 
2320 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
2321 		return -EBUSY;
2322 	filp->private_data = ub;
2323 	ub->ublksrv_tgid = current->tgid;
2324 	return 0;
2325 }
2326 
2327 static void ublk_reset_ch_dev(struct ublk_device *ub)
2328 {
2329 	int i;
2330 
2331 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2332 		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2333 
2334 	/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
2335 	ub->mm = NULL;
2336 	ub->nr_queue_ready = 0;
2337 	ub->unprivileged_daemons = false;
2338 	ub->ublksrv_tgid = -1;
2339 }
2340 
2341 static struct gendisk *ublk_get_disk(struct ublk_device *ub)
2342 {
2343 	struct gendisk *disk;
2344 
2345 	spin_lock(&ub->lock);
2346 	disk = ub->ub_disk;
2347 	if (disk)
2348 		get_device(disk_to_dev(disk));
2349 	spin_unlock(&ub->lock);
2350 
2351 	return disk;
2352 }
2353 
2354 static void ublk_put_disk(struct gendisk *disk)
2355 {
2356 	if (disk)
2357 		put_device(disk_to_dev(disk));
2358 }
2359 
2360 static void ublk_partition_scan_work(struct work_struct *work)
2361 {
2362 	struct ublk_device *ub =
2363 		container_of(work, struct ublk_device, partition_scan_work);
2364 	/* Hold disk reference to prevent UAF during concurrent teardown */
2365 	struct gendisk *disk = ublk_get_disk(ub);
2366 
2367 	if (!disk)
2368 		return;
2369 
2370 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
2371 					     &disk->state)))
2372 		goto out;
2373 
2374 	mutex_lock(&disk->open_mutex);
2375 	bdev_disk_changed(disk, false);
2376 	mutex_unlock(&disk->open_mutex);
2377 out:
2378 	ublk_put_disk(disk);
2379 }
2380 
2381 /*
2382  * Use this function to ensure that ->canceling is consistently set for
2383  * the device and all queues. Do not set these flags directly.
2384  *
2385  * Caller must ensure that:
2386  * - cancel_mutex is held. This ensures that there is no concurrent
2387  *   access to ub->canceling and no concurrent writes to ubq->canceling.
2388  * - there are no concurrent reads of ubq->canceling from the queue_rq
2389  *   path. This can be done by quiescing the queue, or through other
2390  *   means.
2391  */
2392 static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
2393 	__must_hold(&ub->cancel_mutex)
2394 {
2395 	int i;
2396 
2397 	ub->canceling = canceling;
2398 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2399 		ublk_get_queue(ub, i)->canceling = canceling;
2400 }
2401 
2402 static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
2403 {
2404 	int i, j;
2405 
2406 	if (!ublk_dev_need_req_ref(ub))
2407 		return false;
2408 
2409 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2410 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
2411 
2412 		for (j = 0; j < ubq->q_depth; j++) {
2413 			struct ublk_io *io = &ubq->ios[j];
2414 			unsigned int refs = refcount_read(&io->ref) +
2415 				io->task_registered_buffers;
2416 
2417 			/*
2418 			 * UBLK_REFCOUNT_INIT or zero means no active
2419 			 * reference
2420 			 */
2421 			if (refs != UBLK_REFCOUNT_INIT && refs != 0)
2422 				return true;
2423 
2424 			/* reset to zero if the io hasn't active references */
2425 			refcount_set(&io->ref, 0);
2426 			io->task_registered_buffers = 0;
2427 		}
2428 	}
2429 	return false;
2430 }
2431 
2432 static void ublk_ch_release_work_fn(struct work_struct *work)
2433 {
2434 	struct ublk_device *ub =
2435 		container_of(work, struct ublk_device, exit_work.work);
2436 	struct gendisk *disk;
2437 	int i;
2438 
2439 	/*
2440 	 * For zero-copy and auto buffer register modes, I/O references
2441 	 * might not be dropped naturally when the daemon is killed, but
2442 	 * io_uring guarantees that registered bvec kernel buffers are
2443 	 * unregistered finally when freeing io_uring context, then the
2444 	 * active references are dropped.
2445 	 *
2446 	 * Wait until active references are dropped for avoiding use-after-free
2447 	 *
2448 	 * registered buffer may be unregistered in io_ring's release hander,
2449 	 * so have to wait by scheduling work function for avoiding the two
2450 	 * file release dependency.
2451 	 */
2452 	if (ublk_check_and_reset_active_ref(ub)) {
2453 		schedule_delayed_work(&ub->exit_work, 1);
2454 		return;
2455 	}
2456 
2457 	/*
2458 	 * disk isn't attached yet, either device isn't live, or it has
2459 	 * been removed already, so we needn't to do anything
2460 	 */
2461 	disk = ublk_get_disk(ub);
2462 	if (!disk)
2463 		goto out;
2464 
2465 	/*
2466 	 * All uring_cmd are done now, so abort any request outstanding to
2467 	 * the ublk server
2468 	 *
2469 	 * This can be done in lockless way because ublk server has been
2470 	 * gone
2471 	 *
2472 	 * More importantly, we have to provide forward progress guarantee
2473 	 * without holding ub->mutex, otherwise control task grabbing
2474 	 * ub->mutex triggers deadlock
2475 	 *
2476 	 * All requests may be inflight, so ->canceling may not be set, set
2477 	 * it now.
2478 	 */
2479 	mutex_lock(&ub->cancel_mutex);
2480 	ublk_set_canceling(ub, true);
2481 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2482 		ublk_abort_queue(ub, ublk_get_queue(ub, i));
2483 	mutex_unlock(&ub->cancel_mutex);
2484 	blk_mq_kick_requeue_list(disk->queue);
2485 
2486 	/*
2487 	 * All infligh requests have been completed or requeued and any new
2488 	 * request will be failed or requeued via `->canceling` now, so it is
2489 	 * fine to grab ub->mutex now.
2490 	 */
2491 	mutex_lock(&ub->mutex);
2492 
2493 	/* double check after grabbing lock */
2494 	if (!ub->ub_disk)
2495 		goto unlock;
2496 
2497 	/*
2498 	 * Transition the device to the nosrv state. What exactly this
2499 	 * means depends on the recovery flags
2500 	 */
2501 	if (ublk_nosrv_should_stop_dev(ub)) {
2502 		/*
2503 		 * Allow any pending/future I/O to pass through quickly
2504 		 * with an error. This is needed because del_gendisk
2505 		 * waits for all pending I/O to complete
2506 		 */
2507 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2508 			WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
2509 
2510 		ublk_stop_dev_unlocked(ub);
2511 	} else {
2512 		if (ublk_nosrv_dev_should_queue_io(ub)) {
2513 			/* ->canceling is set and all requests are aborted */
2514 			ub->dev_info.state = UBLK_S_DEV_QUIESCED;
2515 		} else {
2516 			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
2517 			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2518 				WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
2519 		}
2520 	}
2521 unlock:
2522 	mutex_unlock(&ub->mutex);
2523 	ublk_put_disk(disk);
2524 
2525 	/* all uring_cmd has been done now, reset device & ubq */
2526 	ublk_reset_ch_dev(ub);
2527 out:
2528 	clear_bit(UB_STATE_OPEN, &ub->state);
2529 
2530 	/* put the reference grabbed in ublk_ch_release() */
2531 	ublk_put_device(ub);
2532 }
2533 
2534 static int ublk_ch_release(struct inode *inode, struct file *filp)
2535 {
2536 	struct ublk_device *ub = filp->private_data;
2537 
2538 	/*
2539 	 * Grab ublk device reference, so it won't be gone until we are
2540 	 * really released from work function.
2541 	 */
2542 	ublk_get_device(ub);
2543 
2544 	INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
2545 	schedule_delayed_work(&ub->exit_work, 0);
2546 	return 0;
2547 }
2548 
2549 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
2550 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
2551 {
2552 	struct ublk_device *ub = filp->private_data;
2553 	size_t sz = vma->vm_end - vma->vm_start;
2554 	unsigned max_sz = ublk_max_cmd_buf_size();
2555 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
2556 	int q_id, ret = 0;
2557 
2558 	spin_lock(&ub->lock);
2559 	if (!ub->mm)
2560 		ub->mm = current->mm;
2561 	if (current->mm != ub->mm)
2562 		ret = -EINVAL;
2563 	spin_unlock(&ub->lock);
2564 
2565 	if (ret)
2566 		return ret;
2567 
2568 	if (vma->vm_flags & VM_WRITE)
2569 		return -EPERM;
2570 
2571 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
2572 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
2573 		return -EINVAL;
2574 
2575 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
2576 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
2577 			__func__, q_id, current->pid, vma->vm_start,
2578 			phys_off, (unsigned long)sz);
2579 
2580 	if (sz != ublk_queue_cmd_buf_size(ub))
2581 		return -EINVAL;
2582 
2583 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
2584 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2585 }
2586 
2587 static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
2588 		struct request *req)
2589 {
2590 	WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
2591 			io->flags & UBLK_IO_FLAG_ACTIVE);
2592 
2593 	if (ublk_nosrv_should_reissue_outstanding(ub))
2594 		blk_mq_requeue_request(req, false);
2595 	else {
2596 		io->res = -EIO;
2597 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
2598 	}
2599 }
2600 
2601 /*
2602  * Request tag may just be filled to event kfifo, not get chance to
2603  * dispatch, abort these requests too
2604  */
2605 static void ublk_abort_batch_queue(struct ublk_device *ub,
2606 				   struct ublk_queue *ubq)
2607 {
2608 	unsigned short tag;
2609 
2610 	while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
2611 		struct request *req = blk_mq_tag_to_rq(
2612 				ub->tag_set.tags[ubq->q_id], tag);
2613 
2614 		if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
2615 			__ublk_fail_req(ub, &ubq->ios[tag], req);
2616 	}
2617 }
2618 
2619 /*
2620  * Called from ublk char device release handler, when any uring_cmd is
2621  * done, meantime request queue is "quiesced" since all inflight requests
2622  * can't be completed because ublk server is dead.
2623  *
2624  * So no one can hold our request IO reference any more, simply ignore the
2625  * reference, and complete the request immediately
2626  */
2627 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
2628 {
2629 	int i;
2630 
2631 	for (i = 0; i < ubq->q_depth; i++) {
2632 		struct ublk_io *io = &ubq->ios[i];
2633 
2634 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
2635 			__ublk_fail_req(ub, io, io->req);
2636 	}
2637 
2638 	if (ublk_support_batch_io(ubq))
2639 		ublk_abort_batch_queue(ub, ubq);
2640 }
2641 
2642 static void ublk_start_cancel(struct ublk_device *ub)
2643 {
2644 	struct gendisk *disk = ublk_get_disk(ub);
2645 
2646 	/* Our disk has been dead */
2647 	if (!disk)
2648 		return;
2649 
2650 	mutex_lock(&ub->cancel_mutex);
2651 	if (ub->canceling)
2652 		goto out;
2653 	/*
2654 	 * Now we are serialized with ublk_queue_rq()
2655 	 *
2656 	 * Make sure that ubq->canceling is set when queue is frozen,
2657 	 * because ublk_queue_rq() has to rely on this flag for avoiding to
2658 	 * touch completed uring_cmd
2659 	 */
2660 	blk_mq_quiesce_queue(disk->queue);
2661 	ublk_set_canceling(ub, true);
2662 	blk_mq_unquiesce_queue(disk->queue);
2663 out:
2664 	mutex_unlock(&ub->cancel_mutex);
2665 	ublk_put_disk(disk);
2666 }
2667 
2668 static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
2669 		unsigned int issue_flags)
2670 {
2671 	struct ublk_io *io = &ubq->ios[tag];
2672 	struct ublk_device *ub = ubq->dev;
2673 	struct request *req;
2674 	bool done;
2675 
2676 	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
2677 		return;
2678 
2679 	/*
2680 	 * Don't try to cancel this command if the request is started for
2681 	 * avoiding race between io_uring_cmd_done() and
2682 	 * io_uring_cmd_complete_in_task().
2683 	 *
2684 	 * Either the started request will be aborted via __ublk_abort_rq(),
2685 	 * then this uring_cmd is canceled next time, or it will be done in
2686 	 * task work function ublk_dispatch_req() because io_uring guarantees
2687 	 * that ublk_dispatch_req() is always called
2688 	 */
2689 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
2690 	if (req && blk_mq_request_started(req) && req->tag == tag)
2691 		return;
2692 
2693 	spin_lock(&ubq->cancel_lock);
2694 	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
2695 	if (!done)
2696 		io->flags |= UBLK_IO_FLAG_CANCELED;
2697 	spin_unlock(&ubq->cancel_lock);
2698 
2699 	if (!done)
2700 		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
2701 }
2702 
2703 /*
2704  * Cancel a batch fetch command if it hasn't been claimed by another path.
2705  *
2706  * An fcmd can only be cancelled if:
2707  * 1. It's not the active_fcmd (which is currently being processed)
2708  * 2. It's still on the list (!list_empty check) - once removed from the list,
2709  *    the fcmd is considered claimed and will be freed by whoever removed it
2710  *
2711  * Use list_del_init() so subsequent list_empty() checks work correctly.
2712  */
2713 static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
2714 				  struct ublk_batch_fetch_cmd *fcmd,
2715 				  unsigned int issue_flags)
2716 {
2717 	bool done;
2718 
2719 	spin_lock(&ubq->evts_lock);
2720 	done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
2721 	if (done)
2722 		list_del_init(&fcmd->node);
2723 	spin_unlock(&ubq->evts_lock);
2724 
2725 	if (done) {
2726 		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
2727 		ublk_batch_free_fcmd(fcmd);
2728 	}
2729 }
2730 
2731 static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
2732 {
2733 	struct ublk_batch_fetch_cmd *fcmd;
2734 	LIST_HEAD(fcmd_list);
2735 
2736 	spin_lock(&ubq->evts_lock);
2737 	ubq->force_abort = true;
2738 	list_splice_init(&ubq->fcmd_head, &fcmd_list);
2739 	fcmd = READ_ONCE(ubq->active_fcmd);
2740 	if (fcmd)
2741 		list_move(&fcmd->node, &ubq->fcmd_head);
2742 	spin_unlock(&ubq->evts_lock);
2743 
2744 	while (!list_empty(&fcmd_list)) {
2745 		fcmd = list_first_entry(&fcmd_list,
2746 				struct ublk_batch_fetch_cmd, node);
2747 		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
2748 	}
2749 }
2750 
2751 static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
2752 				 unsigned int issue_flags)
2753 {
2754 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2755 	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
2756 	struct ublk_queue *ubq = pdu->ubq;
2757 
2758 	ublk_start_cancel(ubq->dev);
2759 
2760 	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
2761 }
2762 
2763 /*
2764  * The ublk char device won't be closed when calling cancel fn, so both
2765  * ublk device and queue are guaranteed to be live
2766  *
2767  * Two-stage cancel:
2768  *
2769  * - make every active uring_cmd done in ->cancel_fn()
2770  *
2771  * - aborting inflight ublk IO requests in ublk char device release handler,
2772  *   which depends on 1st stage because device can only be closed iff all
2773  *   uring_cmd are done
2774  *
2775  * Do _not_ try to acquire ub->mutex before all inflight requests are
2776  * aborted, otherwise deadlock may be caused.
2777  */
2778 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
2779 		unsigned int issue_flags)
2780 {
2781 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2782 	struct ublk_queue *ubq = pdu->ubq;
2783 	struct task_struct *task;
2784 	struct ublk_io *io;
2785 
2786 	if (WARN_ON_ONCE(!ubq))
2787 		return;
2788 
2789 	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
2790 		return;
2791 
2792 	task = io_uring_cmd_get_task(cmd);
2793 	io = &ubq->ios[pdu->tag];
2794 	if (WARN_ON_ONCE(task && task != io->task))
2795 		return;
2796 
2797 	ublk_start_cancel(ubq->dev);
2798 
2799 	WARN_ON_ONCE(io->cmd != cmd);
2800 	ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
2801 }
2802 
2803 static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
2804 {
2805 	return ubq->nr_io_ready == ubq->q_depth;
2806 }
2807 
2808 static inline bool ublk_dev_ready(const struct ublk_device *ub)
2809 {
2810 	return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
2811 }
2812 
2813 static void ublk_cancel_queue(struct ublk_queue *ubq)
2814 {
2815 	int i;
2816 
2817 	if (ublk_support_batch_io(ubq)) {
2818 		ublk_batch_cancel_queue(ubq);
2819 		return;
2820 	}
2821 
2822 	for (i = 0; i < ubq->q_depth; i++)
2823 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
2824 }
2825 
2826 /* Cancel all pending commands, must be called after del_gendisk() returns */
2827 static void ublk_cancel_dev(struct ublk_device *ub)
2828 {
2829 	int i;
2830 
2831 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2832 		ublk_cancel_queue(ublk_get_queue(ub, i));
2833 }
2834 
2835 static bool ublk_check_inflight_rq(struct request *rq, void *data)
2836 {
2837 	bool *idle = data;
2838 
2839 	if (blk_mq_request_started(rq)) {
2840 		*idle = false;
2841 		return false;
2842 	}
2843 	return true;
2844 }
2845 
2846 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
2847 {
2848 	bool idle;
2849 
2850 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
2851 	while (true) {
2852 		idle = true;
2853 		blk_mq_tagset_busy_iter(&ub->tag_set,
2854 				ublk_check_inflight_rq, &idle);
2855 		if (idle)
2856 			break;
2857 		msleep(UBLK_REQUEUE_DELAY_MS);
2858 	}
2859 }
2860 
2861 static void ublk_force_abort_dev(struct ublk_device *ub)
2862 {
2863 	int i;
2864 
2865 	pr_devel("%s: force abort ub: dev_id %d state %s\n",
2866 			__func__, ub->dev_info.dev_id,
2867 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
2868 			"LIVE" : "QUIESCED");
2869 	blk_mq_quiesce_queue(ub->ub_disk->queue);
2870 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2871 		ublk_wait_tagset_rqs_idle(ub);
2872 
2873 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2874 		ublk_get_queue(ub, i)->force_abort = true;
2875 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2876 	/* We may have requeued some rqs in ublk_quiesce_queue() */
2877 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2878 }
2879 
2880 static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2881 {
2882 	struct gendisk *disk;
2883 
2884 	/* Sync with ublk_abort_queue() by holding the lock */
2885 	spin_lock(&ub->lock);
2886 	disk = ub->ub_disk;
2887 	ub->dev_info.state = UBLK_S_DEV_DEAD;
2888 	ub->dev_info.ublksrv_pid = -1;
2889 	ub->ub_disk = NULL;
2890 	spin_unlock(&ub->lock);
2891 
2892 	return disk;
2893 }
2894 
2895 static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2896 	__must_hold(&ub->mutex)
2897 {
2898 	struct gendisk *disk;
2899 
2900 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2901 		return;
2902 
2903 	if (ublk_nosrv_dev_should_queue_io(ub))
2904 		ublk_force_abort_dev(ub);
2905 	del_gendisk(ub->ub_disk);
2906 	disk = ublk_detach_disk(ub);
2907 	put_disk(disk);
2908 }
2909 
2910 static void ublk_stop_dev(struct ublk_device *ub)
2911 {
2912 	mutex_lock(&ub->mutex);
2913 	ublk_stop_dev_unlocked(ub);
2914 	mutex_unlock(&ub->mutex);
2915 	cancel_work_sync(&ub->partition_scan_work);
2916 	ublk_cancel_dev(ub);
2917 }
2918 
2919 /* reset per-queue io flags */
2920 static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
2921 {
2922 	int j;
2923 
2924 	/* UBLK_IO_FLAG_CANCELED can be cleared now */
2925 	spin_lock(&ubq->cancel_lock);
2926 	for (j = 0; j < ubq->q_depth; j++)
2927 		ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2928 	ubq->canceling = false;
2929 	spin_unlock(&ubq->cancel_lock);
2930 	ubq->fail_io = false;
2931 }
2932 
2933 /* device can only be started after all IOs are ready */
2934 static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
2935 	__must_hold(&ub->mutex)
2936 {
2937 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2938 
2939 	if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2940 		ub->unprivileged_daemons = true;
2941 
2942 	ubq->nr_io_ready++;
2943 
2944 	/* Check if this specific queue is now fully ready */
2945 	if (ublk_queue_ready(ubq)) {
2946 		ub->nr_queue_ready++;
2947 
2948 		/*
2949 		 * Reset queue flags as soon as this queue is ready.
2950 		 * This clears the canceling flag, allowing batch FETCH commands
2951 		 * to succeed during recovery without waiting for all queues.
2952 		 */
2953 		ublk_queue_reset_io_flags(ubq);
2954 	}
2955 
2956 	/* Check if all queues are ready */
2957 	if (ublk_dev_ready(ub)) {
2958 		/*
2959 		 * All queues ready - clear device-level canceling flag
2960 		 * and complete the recovery/initialization.
2961 		 */
2962 		mutex_lock(&ub->cancel_mutex);
2963 		ub->canceling = false;
2964 		mutex_unlock(&ub->cancel_mutex);
2965 		complete_all(&ub->completion);
2966 	}
2967 }
2968 
2969 static inline int ublk_check_cmd_op(u32 cmd_op)
2970 {
2971 	u32 ioc_type = _IOC_TYPE(cmd_op);
2972 
2973 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2974 		return -EOPNOTSUPP;
2975 
2976 	if (ioc_type != 'u' && ioc_type != 0)
2977 		return -EOPNOTSUPP;
2978 
2979 	return 0;
2980 }
2981 
2982 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2983 {
2984 	struct ublk_auto_buf_reg buf;
2985 
2986 	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2987 
2988 	if (buf.reserved0 || buf.reserved1)
2989 		return -EINVAL;
2990 
2991 	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2992 		return -EINVAL;
2993 	io->buf.auto_reg = buf;
2994 	return 0;
2995 }
2996 
2997 static void ublk_clear_auto_buf_reg(struct ublk_io *io,
2998 				    struct io_uring_cmd *cmd,
2999 				    u16 *buf_idx)
3000 {
3001 	if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
3002 		io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
3003 
3004 		/*
3005 		 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
3006 		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
3007 		 * `io_ring_ctx`.
3008 		 *
3009 		 * If this uring_cmd's io_ring_ctx isn't same with the
3010 		 * one for registering the buffer, it is ublk server's
3011 		 * responsibility for unregistering the buffer, otherwise
3012 		 * this ublk request gets stuck.
3013 		 */
3014 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
3015 			*buf_idx = io->buf.auto_reg.index;
3016 	}
3017 }
3018 
3019 static int ublk_handle_auto_buf_reg(struct ublk_io *io,
3020 				    struct io_uring_cmd *cmd,
3021 				    u16 *buf_idx)
3022 {
3023 	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
3024 	return ublk_set_auto_buf_reg(io, cmd);
3025 }
3026 
3027 /* Once we return, `io->req` can't be used any more */
3028 static inline struct request *
3029 ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
3030 {
3031 	struct request *req = io->req;
3032 
3033 	io->cmd = cmd;
3034 	io->flags |= UBLK_IO_FLAG_ACTIVE;
3035 	/* now this cmd slot is owned by ublk driver */
3036 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
3037 
3038 	return req;
3039 }
3040 
3041 static inline int
3042 ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
3043 		   struct io_uring_cmd *cmd, unsigned long buf_addr,
3044 		   u16 *buf_idx)
3045 {
3046 	if (ublk_dev_support_auto_buf_reg(ub))
3047 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
3048 
3049 	io->buf.addr = buf_addr;
3050 	return 0;
3051 }
3052 
3053 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
3054 				    unsigned int issue_flags,
3055 				    struct ublk_queue *ubq, unsigned int tag)
3056 {
3057 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
3058 
3059 	/*
3060 	 * Safe to refer to @ubq since ublk_queue won't be died until its
3061 	 * commands are completed
3062 	 */
3063 	pdu->ubq = ubq;
3064 	pdu->tag = tag;
3065 	io_uring_cmd_mark_cancelable(cmd, issue_flags);
3066 }
3067 
3068 static void ublk_io_release(void *priv)
3069 {
3070 	struct request *rq = priv;
3071 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
3072 	struct ublk_io *io = &ubq->ios[rq->tag];
3073 
3074 	/*
3075 	 * task_registered_buffers may be 0 if buffers were registered off task
3076 	 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
3077 	 */
3078 	if (current == io->task && io->task_registered_buffers)
3079 		io->task_registered_buffers--;
3080 	else
3081 		ublk_put_req_ref(io, rq);
3082 }
3083 
3084 static int ublk_register_io_buf(struct io_uring_cmd *cmd,
3085 				struct ublk_device *ub,
3086 				u16 q_id, u16 tag,
3087 				struct ublk_io *io,
3088 				unsigned int index, unsigned int issue_flags)
3089 {
3090 	struct request *req;
3091 	int ret;
3092 
3093 	if (!ublk_dev_support_zero_copy(ub))
3094 		return -EINVAL;
3095 
3096 	req = __ublk_check_and_get_req(ub, q_id, tag, io);
3097 	if (!req)
3098 		return -EINVAL;
3099 
3100 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3101 				      issue_flags);
3102 	if (ret) {
3103 		ublk_put_req_ref(io, req);
3104 		return ret;
3105 	}
3106 
3107 	return 0;
3108 }
3109 
3110 static int
3111 ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
3112 			    struct ublk_device *ub,
3113 			    u16 q_id, u16 tag, struct ublk_io *io,
3114 			    unsigned index, unsigned issue_flags)
3115 {
3116 	unsigned new_registered_buffers;
3117 	struct request *req = io->req;
3118 	int ret;
3119 
3120 	/*
3121 	 * Ensure there are still references for ublk_sub_req_ref() to release.
3122 	 * If not, fall back on the thread-safe buffer registration.
3123 	 */
3124 	new_registered_buffers = io->task_registered_buffers + 1;
3125 	if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
3126 		return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3127 					    issue_flags);
3128 
3129 	if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
3130 		return -EINVAL;
3131 
3132 	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
3133 				      issue_flags);
3134 	if (ret)
3135 		return ret;
3136 
3137 	io->task_registered_buffers = new_registered_buffers;
3138 	return 0;
3139 }
3140 
3141 static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
3142 				  const struct ublk_device *ub,
3143 				  unsigned int index, unsigned int issue_flags)
3144 {
3145 	if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
3146 		return -EINVAL;
3147 
3148 	return io_buffer_unregister_bvec(cmd, index, issue_flags);
3149 }
3150 
3151 static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
3152 {
3153 	if (ublk_dev_need_map_io(ub)) {
3154 		/*
3155 		 * FETCH_RQ has to provide IO buffer if NEED GET
3156 		 * DATA is not enabled
3157 		 */
3158 		if (!buf_addr && !ublk_dev_need_get_data(ub))
3159 			return -EINVAL;
3160 	} else if (buf_addr) {
3161 		/* User copy requires addr to be unset */
3162 		return -EINVAL;
3163 	}
3164 	return 0;
3165 }
3166 
3167 static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3168 			struct ublk_io *io, u16 q_id)
3169 {
3170 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
3171 	if (ublk_dev_ready(ub))
3172 		return -EBUSY;
3173 
3174 	/* allow each command to be FETCHed at most once */
3175 	if (io->flags & UBLK_IO_FLAG_ACTIVE)
3176 		return -EINVAL;
3177 
3178 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
3179 
3180 	ublk_fill_io_cmd(io, cmd);
3181 
3182 	if (ublk_dev_support_batch_io(ub))
3183 		WRITE_ONCE(io->task, NULL);
3184 	else
3185 		WRITE_ONCE(io->task, get_task_struct(current));
3186 
3187 	return 0;
3188 }
3189 
3190 static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
3191 		      struct ublk_io *io, __u64 buf_addr, u16 q_id)
3192 {
3193 	int ret;
3194 
3195 	/*
3196 	 * When handling FETCH command for setting up ublk uring queue,
3197 	 * ub->mutex is the innermost lock, and we won't block for handling
3198 	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
3199 	 */
3200 	mutex_lock(&ub->mutex);
3201 	ret = __ublk_fetch(cmd, ub, io, q_id);
3202 	if (!ret)
3203 		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
3204 	if (!ret)
3205 		ublk_mark_io_ready(ub, q_id);
3206 	mutex_unlock(&ub->mutex);
3207 	return ret;
3208 }
3209 
3210 static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
3211 				       struct ublk_io *io, __u64 buf_addr)
3212 {
3213 	struct request *req = io->req;
3214 
3215 	if (ublk_dev_need_map_io(ub)) {
3216 		/*
3217 		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
3218 		 * NEED GET DATA is not enabled or it is Read IO.
3219 		 */
3220 		if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
3221 					req_op(req) == REQ_OP_READ))
3222 			return -EINVAL;
3223 	} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
3224 		/*
3225 		 * User copy requires addr to be unset when command is
3226 		 * not zone append
3227 		 */
3228 		return -EINVAL;
3229 	}
3230 
3231 	return 0;
3232 }
3233 
3234 static bool ublk_need_complete_req(const struct ublk_device *ub,
3235 				   struct ublk_io *io)
3236 {
3237 	if (ublk_dev_need_req_ref(ub))
3238 		return ublk_sub_req_ref(io);
3239 	return true;
3240 }
3241 
3242 static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
3243 			  struct request *req)
3244 {
3245 	/*
3246 	 * We have handled UBLK_IO_NEED_GET_DATA command,
3247 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
3248 	 * do the copy work.
3249 	 */
3250 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
3251 	/* update iod->addr because ublksrv may have passed a new io buffer */
3252 	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
3253 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
3254 			__func__, ubq->q_id, req->tag, io->flags,
3255 			ublk_get_iod(ubq, req->tag)->addr);
3256 
3257 	return ublk_start_io(ubq, req, io);
3258 }
3259 
3260 static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
3261 		unsigned int issue_flags)
3262 {
3263 	/* May point to userspace-mapped memory */
3264 	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
3265 							       struct ublksrv_io_cmd);
3266 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3267 	struct ublk_device *ub = cmd->file->private_data;
3268 	struct ublk_queue *ubq;
3269 	struct ublk_io *io = NULL;
3270 	u32 cmd_op = cmd->cmd_op;
3271 	u16 q_id = READ_ONCE(ub_src->q_id);
3272 	u16 tag = READ_ONCE(ub_src->tag);
3273 	s32 result = READ_ONCE(ub_src->result);
3274 	u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
3275 	struct request *req;
3276 	int ret;
3277 	bool compl;
3278 
3279 	WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
3280 
3281 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
3282 			__func__, cmd->cmd_op, q_id, tag, result);
3283 
3284 	ret = ublk_check_cmd_op(cmd_op);
3285 	if (ret)
3286 		goto out;
3287 
3288 	/*
3289 	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
3290 	 * so no need to validate the q_id, tag, or task
3291 	 */
3292 	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
3293 		return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
3294 
3295 	ret = -EINVAL;
3296 	if (q_id >= ub->dev_info.nr_hw_queues)
3297 		goto out;
3298 
3299 	ubq = ublk_get_queue(ub, q_id);
3300 
3301 	if (tag >= ub->dev_info.queue_depth)
3302 		goto out;
3303 
3304 	io = &ubq->ios[tag];
3305 	/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
3306 	if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
3307 		ret = ublk_check_fetch_buf(ub, addr);
3308 		if (ret)
3309 			goto out;
3310 		ret = ublk_fetch(cmd, ub, io, addr, q_id);
3311 		if (ret)
3312 			goto out;
3313 
3314 		ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3315 		return -EIOCBQUEUED;
3316 	}
3317 
3318 	if (READ_ONCE(io->task) != current) {
3319 		/*
3320 		 * ublk_register_io_buf() accesses only the io's refcount,
3321 		 * so can be handled on any task
3322 		 */
3323 		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
3324 			return ublk_register_io_buf(cmd, ub, q_id, tag, io,
3325 						    addr, issue_flags);
3326 
3327 		goto out;
3328 	}
3329 
3330 	/* there is pending io cmd, something must be wrong */
3331 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
3332 		ret = -EBUSY;
3333 		goto out;
3334 	}
3335 
3336 	/*
3337 	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
3338 	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
3339 	 */
3340 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
3341 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
3342 		goto out;
3343 
3344 	switch (_IOC_NR(cmd_op)) {
3345 	case UBLK_IO_REGISTER_IO_BUF:
3346 		return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
3347 						   issue_flags);
3348 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
3349 		ret = ublk_check_commit_and_fetch(ub, io, addr);
3350 		if (ret)
3351 			goto out;
3352 		io->res = result;
3353 		req = ublk_fill_io_cmd(io, cmd);
3354 		ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
3355 		if (buf_idx != UBLK_INVALID_BUF_IDX)
3356 			io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
3357 		compl = ublk_need_complete_req(ub, io);
3358 
3359 		if (req_op(req) == REQ_OP_ZONE_APPEND)
3360 			req->__sector = addr;
3361 		if (compl)
3362 			__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
3363 
3364 		if (ret)
3365 			goto out;
3366 		break;
3367 	case UBLK_IO_NEED_GET_DATA:
3368 		/*
3369 		 * ublk_get_data() may fail and fallback to requeue, so keep
3370 		 * uring_cmd active first and prepare for handling new requeued
3371 		 * request
3372 		 */
3373 		req = ublk_fill_io_cmd(io, cmd);
3374 		ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
3375 		WARN_ON_ONCE(ret);
3376 		if (likely(ublk_get_data(ubq, io, req))) {
3377 			__ublk_prep_compl_io_cmd(io, req);
3378 			return UBLK_IO_RES_OK;
3379 		}
3380 		break;
3381 	default:
3382 		goto out;
3383 	}
3384 	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
3385 	return -EIOCBQUEUED;
3386 
3387  out:
3388 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
3389 			__func__, cmd_op, tag, ret, io ? io->flags : 0);
3390 	return ret;
3391 }
3392 
3393 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
3394 		u16 q_id, u16 tag, struct ublk_io *io)
3395 {
3396 	struct request *req;
3397 
3398 	/*
3399 	 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
3400 	 * which would overwrite it with io->cmd
3401 	 */
3402 	req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
3403 	if (!req)
3404 		return NULL;
3405 
3406 	if (!ublk_get_req_ref(io))
3407 		return NULL;
3408 
3409 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
3410 		goto fail_put;
3411 
3412 	if (!ublk_rq_has_data(req))
3413 		goto fail_put;
3414 
3415 	return req;
3416 fail_put:
3417 	ublk_put_req_ref(io, req);
3418 	return NULL;
3419 }
3420 
3421 static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
3422 {
3423 	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
3424 	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
3425 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
3426 
3427 	if (ret != -EIOCBQUEUED)
3428 		io_uring_cmd_done(cmd, ret, issue_flags);
3429 }
3430 
3431 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
3432 {
3433 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3434 		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
3435 		return 0;
3436 	}
3437 
3438 	/* well-implemented server won't run into unlocked */
3439 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
3440 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
3441 		return -EIOCBQUEUED;
3442 	}
3443 
3444 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
3445 }
3446 
3447 static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
3448 					const struct ublk_elem_header *elem)
3449 {
3450 	const void *buf = elem;
3451 
3452 	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
3453 		return *(const __u64 *)(buf + sizeof(*elem));
3454 	return 0;
3455 }
3456 
3457 static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
3458 					const struct ublk_elem_header *elem)
3459 {
3460 	const void *buf = elem;
3461 
3462 	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
3463 		return *(const __u64 *)(buf + sizeof(*elem) +
3464 				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
3465 	return -1;
3466 }
3467 
3468 static struct ublk_auto_buf_reg
3469 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
3470 			const struct ublk_elem_header *elem)
3471 {
3472 	struct ublk_auto_buf_reg reg = {
3473 		.index = elem->buf_index,
3474 		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
3475 			UBLK_AUTO_BUF_REG_FALLBACK : 0,
3476 	};
3477 
3478 	return reg;
3479 }
3480 
3481 /*
3482  * 48 can hold any type of buffer element(8, 16 and 24 bytes) because
3483  * it is the least common multiple(LCM) of 8, 16 and 24
3484  */
3485 #define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
3486 struct ublk_batch_io_iter {
3487 	void __user *uaddr;
3488 	unsigned done, total;
3489 	unsigned char elem_bytes;
3490 	/* copy to this buffer from user space */
3491 	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
3492 };
3493 
3494 static inline int
3495 __ublk_walk_cmd_buf(struct ublk_queue *ubq,
3496 		    struct ublk_batch_io_iter *iter,
3497 		    const struct ublk_batch_io_data *data,
3498 		    unsigned bytes,
3499 		    int (*cb)(struct ublk_queue *q,
3500 			    const struct ublk_batch_io_data *data,
3501 			    const struct ublk_elem_header *elem))
3502 {
3503 	unsigned int i;
3504 	int ret = 0;
3505 
3506 	for (i = 0; i < bytes; i += iter->elem_bytes) {
3507 		const struct ublk_elem_header *elem =
3508 			(const struct ublk_elem_header *)&iter->buf[i];
3509 
3510 		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
3511 			ret = -EINVAL;
3512 			break;
3513 		}
3514 
3515 		ret = cb(ubq, data, elem);
3516 		if (unlikely(ret))
3517 			break;
3518 	}
3519 
3520 	iter->done += i;
3521 	return ret;
3522 }
3523 
3524 static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
3525 			     const struct ublk_batch_io_data *data,
3526 			     int (*cb)(struct ublk_queue *q,
3527 				     const struct ublk_batch_io_data *data,
3528 				     const struct ublk_elem_header *elem))
3529 {
3530 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3531 	int ret = 0;
3532 
3533 	while (iter->done < iter->total) {
3534 		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
3535 
3536 		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
3537 			pr_warn("ublk%d: read batch cmd buffer failed\n",
3538 					data->ub->dev_info.dev_id);
3539 			return -EFAULT;
3540 		}
3541 
3542 		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
3543 		if (ret)
3544 			return ret;
3545 	}
3546 	return 0;
3547 }
3548 
3549 static int ublk_batch_unprep_io(struct ublk_queue *ubq,
3550 				const struct ublk_batch_io_data *data,
3551 				const struct ublk_elem_header *elem)
3552 {
3553 	struct ublk_io *io = &ubq->ios[elem->tag];
3554 
3555 	/*
3556 	 * If queue was ready before this decrement, it won't be anymore,
3557 	 * so we need to decrement the queue ready count and restore the
3558 	 * canceling flag to prevent new requests from being queued.
3559 	 */
3560 	if (ublk_queue_ready(ubq)) {
3561 		data->ub->nr_queue_ready--;
3562 		spin_lock(&ubq->cancel_lock);
3563 		ubq->canceling = true;
3564 		spin_unlock(&ubq->cancel_lock);
3565 	}
3566 	ubq->nr_io_ready--;
3567 
3568 	ublk_io_lock(io);
3569 	io->flags = 0;
3570 	ublk_io_unlock(io);
3571 	return 0;
3572 }
3573 
3574 static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
3575 				       const struct ublk_batch_io_data *data)
3576 {
3577 	int ret;
3578 
3579 	/* Re-process only what we've already processed, starting from beginning */
3580 	iter->total = iter->done;
3581 	iter->done = 0;
3582 
3583 	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
3584 	WARN_ON_ONCE(ret);
3585 }
3586 
3587 static int ublk_batch_prep_io(struct ublk_queue *ubq,
3588 			      const struct ublk_batch_io_data *data,
3589 			      const struct ublk_elem_header *elem)
3590 {
3591 	struct ublk_io *io = &ubq->ios[elem->tag];
3592 	const struct ublk_batch_io *uc = &data->header;
3593 	union ublk_io_buf buf = { 0 };
3594 	int ret;
3595 
3596 	if (ublk_dev_support_auto_buf_reg(data->ub))
3597 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3598 	else if (ublk_dev_need_map_io(data->ub)) {
3599 		buf.addr = ublk_batch_buf_addr(uc, elem);
3600 
3601 		ret = ublk_check_fetch_buf(data->ub, buf.addr);
3602 		if (ret)
3603 			return ret;
3604 	}
3605 
3606 	ublk_io_lock(io);
3607 	ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
3608 	if (!ret)
3609 		io->buf = buf;
3610 	ublk_io_unlock(io);
3611 
3612 	if (!ret)
3613 		ublk_mark_io_ready(data->ub, ubq->q_id);
3614 
3615 	return ret;
3616 }
3617 
3618 static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
3619 {
3620 	const struct ublk_batch_io *uc = &data->header;
3621 	struct io_uring_cmd *cmd = data->cmd;
3622 	struct ublk_batch_io_iter iter = {
3623 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3624 		.total = uc->nr_elem * uc->elem_bytes,
3625 		.elem_bytes = uc->elem_bytes,
3626 	};
3627 	int ret;
3628 
3629 	mutex_lock(&data->ub->mutex);
3630 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
3631 
3632 	if (ret && iter.done)
3633 		ublk_batch_revert_prep_cmd(&iter, data);
3634 	mutex_unlock(&data->ub->mutex);
3635 	return ret;
3636 }
3637 
3638 static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
3639 				      struct ublk_io *io,
3640 				      union ublk_io_buf *buf)
3641 {
3642 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3643 		return -EBUSY;
3644 
3645 	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
3646 	if (ublk_need_map_io(ubq) && !buf->addr)
3647 		return -EINVAL;
3648 	return 0;
3649 }
3650 
3651 static int ublk_batch_commit_io(struct ublk_queue *ubq,
3652 				const struct ublk_batch_io_data *data,
3653 				const struct ublk_elem_header *elem)
3654 {
3655 	struct ublk_io *io = &ubq->ios[elem->tag];
3656 	const struct ublk_batch_io *uc = &data->header;
3657 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
3658 	union ublk_io_buf buf = { 0 };
3659 	struct request *req = NULL;
3660 	bool auto_reg = false;
3661 	bool compl = false;
3662 	int ret;
3663 
3664 	if (ublk_dev_support_auto_buf_reg(data->ub)) {
3665 		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
3666 		auto_reg = true;
3667 	} else if (ublk_dev_need_map_io(data->ub))
3668 		buf.addr = ublk_batch_buf_addr(uc, elem);
3669 
3670 	ublk_io_lock(io);
3671 	ret = ublk_batch_commit_io_check(ubq, io, &buf);
3672 	if (!ret) {
3673 		io->res = elem->result;
3674 		io->buf = buf;
3675 		req = ublk_fill_io_cmd(io, data->cmd);
3676 
3677 		if (auto_reg)
3678 			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
3679 		compl = ublk_need_complete_req(data->ub, io);
3680 	}
3681 	ublk_io_unlock(io);
3682 
3683 	if (unlikely(ret)) {
3684 		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
3685 			__func__, data->ub->dev_info.dev_id, ubq->q_id,
3686 			elem->tag, ret);
3687 		return ret;
3688 	}
3689 
3690 	if (buf_idx != UBLK_INVALID_BUF_IDX)
3691 		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
3692 	if (req_op(req) == REQ_OP_ZONE_APPEND)
3693 		req->__sector = ublk_batch_zone_lba(uc, elem);
3694 	if (compl)
3695 		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
3696 	return 0;
3697 }
3698 
3699 static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
3700 {
3701 	const struct ublk_batch_io *uc = &data->header;
3702 	struct io_uring_cmd *cmd = data->cmd;
3703 	struct ublk_batch_io_iter iter = {
3704 		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
3705 		.total = uc->nr_elem * uc->elem_bytes,
3706 		.elem_bytes = uc->elem_bytes,
3707 	};
3708 	DEFINE_IO_COMP_BATCH(iob);
3709 	int ret;
3710 
3711 	data->iob = &iob;
3712 	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
3713 
3714 	if (iob.complete)
3715 		iob.complete(&iob);
3716 
3717 	return iter.done == 0 ? ret : iter.done;
3718 }
3719 
3720 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
3721 {
3722 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
3723 
3724 	if (uc->flags & ~UBLK_BATCH_F_ALL)
3725 		return -EINVAL;
3726 
3727 	/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
3728 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3729 			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
3730 		return -EINVAL;
3731 
3732 	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
3733 		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
3734 	if (uc->elem_bytes != elem_bytes)
3735 		return -EINVAL;
3736 	return 0;
3737 }
3738 
3739 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
3740 {
3741 	const struct ublk_batch_io *uc = &data->header;
3742 
3743 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3744 		return -EINVAL;
3745 
3746 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
3747 		return -E2BIG;
3748 
3749 	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
3750 			!ublk_dev_is_zoned(data->ub))
3751 		return -EINVAL;
3752 
3753 	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
3754 			!ublk_dev_need_map_io(data->ub))
3755 		return -EINVAL;
3756 
3757 	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
3758 			!ublk_dev_support_auto_buf_reg(data->ub))
3759 		return -EINVAL;
3760 
3761 	return ublk_check_batch_cmd_flags(uc);
3762 }
3763 
3764 static int ublk_batch_attach(struct ublk_queue *ubq,
3765 			     struct ublk_batch_io_data *data,
3766 			     struct ublk_batch_fetch_cmd *fcmd)
3767 {
3768 	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
3769 	bool free = false;
3770 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
3771 
3772 	spin_lock(&ubq->evts_lock);
3773 	if (unlikely(ubq->force_abort || ubq->canceling)) {
3774 		free = true;
3775 	} else {
3776 		list_add_tail(&fcmd->node, &ubq->fcmd_head);
3777 		new_fcmd = __ublk_acquire_fcmd(ubq);
3778 	}
3779 	spin_unlock(&ubq->evts_lock);
3780 
3781 	if (unlikely(free)) {
3782 		ublk_batch_free_fcmd(fcmd);
3783 		return -ENODEV;
3784 	}
3785 
3786 	pdu->ubq = ubq;
3787 	pdu->fcmd = fcmd;
3788 	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
3789 
3790 	if (!new_fcmd)
3791 		goto out;
3792 
3793 	/*
3794 	 * If the two fetch commands are originated from same io_ring_ctx,
3795 	 * run batch dispatch directly. Otherwise, schedule task work for
3796 	 * doing it.
3797 	 */
3798 	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
3799 			io_uring_cmd_ctx_handle(fcmd->cmd)) {
3800 		data->cmd = new_fcmd->cmd;
3801 		ublk_batch_dispatch(ubq, data, new_fcmd);
3802 	} else {
3803 		io_uring_cmd_complete_in_task(new_fcmd->cmd,
3804 				ublk_batch_tw_cb);
3805 	}
3806 out:
3807 	return -EIOCBQUEUED;
3808 }
3809 
3810 static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
3811 {
3812 	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
3813 	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
3814 
3815 	if (!fcmd)
3816 		return -ENOMEM;
3817 
3818 	return ublk_batch_attach(ubq, data, fcmd);
3819 }
3820 
3821 static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
3822 {
3823 	const struct ublk_batch_io *uc = &data->header;
3824 
3825 	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
3826 		return -EINVAL;
3827 
3828 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
3829 		return -EINVAL;
3830 
3831 	if (uc->elem_bytes != sizeof(__u16))
3832 		return -EINVAL;
3833 
3834 	if (uc->flags != 0)
3835 		return -EINVAL;
3836 
3837 	return 0;
3838 }
3839 
3840 static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
3841 				     unsigned int issue_flags)
3842 {
3843 	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
3844 							       struct ublksrv_io_cmd);
3845 	struct ublk_device *ub = cmd->file->private_data;
3846 	unsigned tag = READ_ONCE(ub_cmd->tag);
3847 	unsigned q_id = READ_ONCE(ub_cmd->q_id);
3848 	unsigned index = READ_ONCE(ub_cmd->addr);
3849 	struct ublk_queue *ubq;
3850 	struct ublk_io *io;
3851 
3852 	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
3853 		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
3854 
3855 	if (q_id >= ub->dev_info.nr_hw_queues)
3856 		return -EINVAL;
3857 
3858 	if (tag >= ub->dev_info.queue_depth)
3859 		return -EINVAL;
3860 
3861 	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
3862 		return -EOPNOTSUPP;
3863 
3864 	ubq = ublk_get_queue(ub, q_id);
3865 	io = &ubq->ios[tag];
3866 	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
3867 			issue_flags);
3868 }
3869 
3870 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
3871 				       unsigned int issue_flags)
3872 {
3873 	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
3874 							  struct ublk_batch_io);
3875 	struct ublk_device *ub = cmd->file->private_data;
3876 	struct ublk_batch_io_data data = {
3877 		.ub  = ub,
3878 		.cmd = cmd,
3879 		.header = (struct ublk_batch_io) {
3880 			.q_id = READ_ONCE(uc->q_id),
3881 			.flags = READ_ONCE(uc->flags),
3882 			.nr_elem = READ_ONCE(uc->nr_elem),
3883 			.elem_bytes = READ_ONCE(uc->elem_bytes),
3884 		},
3885 		.issue_flags = issue_flags,
3886 	};
3887 	u32 cmd_op = cmd->cmd_op;
3888 	int ret = -EINVAL;
3889 
3890 	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
3891 		ublk_batch_cancel_fn(cmd, issue_flags);
3892 		return 0;
3893 	}
3894 
3895 	switch (cmd_op) {
3896 	case UBLK_U_IO_PREP_IO_CMDS:
3897 		ret = ublk_check_batch_cmd(&data);
3898 		if (ret)
3899 			goto out;
3900 		ret = ublk_handle_batch_prep_cmd(&data);
3901 		break;
3902 	case UBLK_U_IO_COMMIT_IO_CMDS:
3903 		ret = ublk_check_batch_cmd(&data);
3904 		if (ret)
3905 			goto out;
3906 		ret = ublk_handle_batch_commit_cmd(&data);
3907 		break;
3908 	case UBLK_U_IO_FETCH_IO_CMDS:
3909 		ret = ublk_validate_batch_fetch_cmd(&data);
3910 		if (ret)
3911 			goto out;
3912 		ret = ublk_handle_batch_fetch_cmd(&data);
3913 		break;
3914 	default:
3915 		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
3916 		break;
3917 	}
3918 out:
3919 	return ret;
3920 }
3921 
3922 static inline bool ublk_check_ubuf_dir(const struct request *req,
3923 		int ubuf_dir)
3924 {
3925 	/* copy ubuf to request pages */
3926 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
3927 	    ubuf_dir == ITER_SOURCE)
3928 		return true;
3929 
3930 	/* copy request pages to ubuf */
3931 	if ((req_op(req) == REQ_OP_WRITE ||
3932 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
3933 	    ubuf_dir == ITER_DEST)
3934 		return true;
3935 
3936 	return false;
3937 }
3938 
3939 static ssize_t
3940 ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
3941 {
3942 	struct ublk_device *ub = iocb->ki_filp->private_data;
3943 	struct ublk_queue *ubq;
3944 	struct request *req;
3945 	struct ublk_io *io;
3946 	unsigned data_len;
3947 	bool is_integrity;
3948 	bool on_daemon;
3949 	size_t buf_off;
3950 	u16 tag, q_id;
3951 	ssize_t ret;
3952 
3953 	if (!user_backed_iter(iter))
3954 		return -EACCES;
3955 
3956 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
3957 		return -EACCES;
3958 
3959 	tag = ublk_pos_to_tag(iocb->ki_pos);
3960 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
3961 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
3962 	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
3963 
3964 	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
3965 		return -EINVAL;
3966 
3967 	if (q_id >= ub->dev_info.nr_hw_queues)
3968 		return -EINVAL;
3969 
3970 	ubq = ublk_get_queue(ub, q_id);
3971 	if (!ublk_dev_support_user_copy(ub))
3972 		return -EACCES;
3973 
3974 	if (tag >= ub->dev_info.queue_depth)
3975 		return -EINVAL;
3976 
3977 	io = &ubq->ios[tag];
3978 	on_daemon = current == READ_ONCE(io->task);
3979 	if (on_daemon) {
3980 		/* On daemon, io can't be completed concurrently, so skip ref */
3981 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
3982 			return -EINVAL;
3983 
3984 		req = io->req;
3985 		if (!ublk_rq_has_data(req))
3986 			return -EINVAL;
3987 	} else {
3988 		req = __ublk_check_and_get_req(ub, q_id, tag, io);
3989 		if (!req)
3990 			return -EINVAL;
3991 	}
3992 
3993 	if (is_integrity) {
3994 		struct blk_integrity *bi = &req->q->limits.integrity;
3995 
3996 		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
3997 	} else {
3998 		data_len = blk_rq_bytes(req);
3999 	}
4000 	if (buf_off > data_len) {
4001 		ret = -EINVAL;
4002 		goto out;
4003 	}
4004 
4005 	if (!ublk_check_ubuf_dir(req, dir)) {
4006 		ret = -EACCES;
4007 		goto out;
4008 	}
4009 
4010 	if (is_integrity)
4011 		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
4012 	else
4013 		ret = ublk_copy_user_pages(req, buf_off, iter, dir);
4014 
4015 out:
4016 	if (!on_daemon)
4017 		ublk_put_req_ref(io, req);
4018 	return ret;
4019 }
4020 
4021 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
4022 {
4023 	return ublk_user_copy(iocb, to, ITER_DEST);
4024 }
4025 
4026 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
4027 {
4028 	return ublk_user_copy(iocb, from, ITER_SOURCE);
4029 }
4030 
4031 static const struct file_operations ublk_ch_fops = {
4032 	.owner = THIS_MODULE,
4033 	.open = ublk_ch_open,
4034 	.release = ublk_ch_release,
4035 	.read_iter = ublk_ch_read_iter,
4036 	.write_iter = ublk_ch_write_iter,
4037 	.uring_cmd = ublk_ch_uring_cmd,
4038 	.mmap = ublk_ch_mmap,
4039 };
4040 
4041 static const struct file_operations ublk_ch_batch_io_fops = {
4042 	.owner = THIS_MODULE,
4043 	.open = ublk_ch_open,
4044 	.release = ublk_ch_release,
4045 	.read_iter = ublk_ch_read_iter,
4046 	.write_iter = ublk_ch_write_iter,
4047 	.uring_cmd = ublk_ch_batch_io_uring_cmd,
4048 	.mmap = ublk_ch_mmap,
4049 };
4050 
4051 static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
4052 {
4053 	int size, i;
4054 
4055 	size = ublk_queue_cmd_buf_size(ub);
4056 
4057 	for (i = 0; i < ubq->q_depth; i++) {
4058 		struct ublk_io *io = &ubq->ios[i];
4059 		if (io->task)
4060 			put_task_struct(io->task);
4061 		WARN_ON_ONCE(refcount_read(&io->ref));
4062 		WARN_ON_ONCE(io->task_registered_buffers);
4063 	}
4064 
4065 	if (ubq->io_cmd_buf)
4066 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
4067 
4068 	if (ublk_dev_support_batch_io(ub))
4069 		ublk_io_evts_deinit(ubq);
4070 
4071 	kvfree(ubq);
4072 }
4073 
4074 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
4075 {
4076 	struct ublk_queue *ubq = ub->queues[q_id];
4077 
4078 	if (!ubq)
4079 		return;
4080 
4081 	__ublk_deinit_queue(ub, ubq);
4082 	ub->queues[q_id] = NULL;
4083 }
4084 
4085 static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
4086 {
4087 	unsigned int cpu;
4088 
4089 	/* Find first CPU mapped to this queue */
4090 	for_each_possible_cpu(cpu) {
4091 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
4092 			return cpu_to_node(cpu);
4093 	}
4094 
4095 	return NUMA_NO_NODE;
4096 }
4097 
4098 static int ublk_init_queue(struct ublk_device *ub, int q_id)
4099 {
4100 	int depth = ub->dev_info.queue_depth;
4101 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
4102 	struct ublk_queue *ubq;
4103 	struct page *page;
4104 	int numa_node;
4105 	int size, i, ret;
4106 
4107 	/* Determine NUMA node based on queue's CPU affinity */
4108 	numa_node = ublk_get_queue_numa_node(ub, q_id);
4109 
4110 	/* Allocate queue structure on local NUMA node */
4111 	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
4112 			    numa_node);
4113 	if (!ubq)
4114 		return -ENOMEM;
4115 
4116 	spin_lock_init(&ubq->cancel_lock);
4117 	ubq->flags = ub->dev_info.flags;
4118 	ubq->q_id = q_id;
4119 	ubq->q_depth = depth;
4120 	size = ublk_queue_cmd_buf_size(ub);
4121 
4122 	/* Allocate I/O command buffer on local NUMA node */
4123 	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
4124 	if (!page) {
4125 		kvfree(ubq);
4126 		return -ENOMEM;
4127 	}
4128 	ubq->io_cmd_buf = page_address(page);
4129 
4130 	for (i = 0; i < ubq->q_depth; i++)
4131 		spin_lock_init(&ubq->ios[i].lock);
4132 
4133 	if (ublk_dev_support_batch_io(ub)) {
4134 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
4135 		if (ret)
4136 			goto fail;
4137 		INIT_LIST_HEAD(&ubq->fcmd_head);
4138 	}
4139 	ub->queues[q_id] = ubq;
4140 	ubq->dev = ub;
4141 
4142 	return 0;
4143 fail:
4144 	__ublk_deinit_queue(ub, ubq);
4145 	return ret;
4146 }
4147 
4148 static void ublk_deinit_queues(struct ublk_device *ub)
4149 {
4150 	int i;
4151 
4152 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
4153 		ublk_deinit_queue(ub, i);
4154 }
4155 
4156 static int ublk_init_queues(struct ublk_device *ub)
4157 {
4158 	int i, ret;
4159 
4160 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
4161 		ret = ublk_init_queue(ub, i);
4162 		if (ret)
4163 			goto fail;
4164 	}
4165 
4166 	init_completion(&ub->completion);
4167 	return 0;
4168 
4169  fail:
4170 	ublk_deinit_queues(ub);
4171 	return ret;
4172 }
4173 
4174 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
4175 {
4176 	int i = idx;
4177 	int err;
4178 
4179 	spin_lock(&ublk_idr_lock);
4180 	/* allocate id, if @id >= 0, we're requesting that specific id */
4181 	if (i >= 0) {
4182 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
4183 		if (err == -ENOSPC)
4184 			err = -EEXIST;
4185 	} else {
4186 		err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
4187 				GFP_NOWAIT);
4188 	}
4189 	spin_unlock(&ublk_idr_lock);
4190 
4191 	if (err >= 0)
4192 		ub->ub_number = err;
4193 
4194 	return err;
4195 }
4196 
4197 static void ublk_free_dev_number(struct ublk_device *ub)
4198 {
4199 	spin_lock(&ublk_idr_lock);
4200 	idr_remove(&ublk_index_idr, ub->ub_number);
4201 	wake_up_all(&ublk_idr_wq);
4202 	spin_unlock(&ublk_idr_lock);
4203 }
4204 
4205 static void ublk_cdev_rel(struct device *dev)
4206 {
4207 	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
4208 
4209 	blk_mq_free_tag_set(&ub->tag_set);
4210 	ublk_deinit_queues(ub);
4211 	ublk_free_dev_number(ub);
4212 	mutex_destroy(&ub->mutex);
4213 	mutex_destroy(&ub->cancel_mutex);
4214 	kfree(ub);
4215 }
4216 
4217 static int ublk_add_chdev(struct ublk_device *ub)
4218 {
4219 	struct device *dev = &ub->cdev_dev;
4220 	int minor = ub->ub_number;
4221 	int ret;
4222 
4223 	dev->parent = ublk_misc.this_device;
4224 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
4225 	dev->class = &ublk_chr_class;
4226 	dev->release = ublk_cdev_rel;
4227 	device_initialize(dev);
4228 
4229 	ret = dev_set_name(dev, "ublkc%d", minor);
4230 	if (ret)
4231 		goto fail;
4232 
4233 	if (ublk_dev_support_batch_io(ub))
4234 		cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
4235 	else
4236 		cdev_init(&ub->cdev, &ublk_ch_fops);
4237 	ret = cdev_device_add(&ub->cdev, dev);
4238 	if (ret)
4239 		goto fail;
4240 
4241 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
4242 		unprivileged_ublks_added++;
4243 	return 0;
4244  fail:
4245 	put_device(dev);
4246 	return ret;
4247 }
4248 
4249 /* align max io buffer size with PAGE_SIZE */
4250 static void ublk_align_max_io_size(struct ublk_device *ub)
4251 {
4252 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
4253 
4254 	ub->dev_info.max_io_buf_bytes =
4255 		round_down(max_io_bytes, PAGE_SIZE);
4256 }
4257 
4258 static int ublk_add_tag_set(struct ublk_device *ub)
4259 {
4260 	if (ublk_dev_support_batch_io(ub))
4261 		ub->tag_set.ops = &ublk_batch_mq_ops;
4262 	else
4263 		ub->tag_set.ops = &ublk_mq_ops;
4264 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
4265 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
4266 	ub->tag_set.numa_node = NUMA_NO_NODE;
4267 	ub->tag_set.driver_data = ub;
4268 	return blk_mq_alloc_tag_set(&ub->tag_set);
4269 }
4270 
4271 static void ublk_remove(struct ublk_device *ub)
4272 {
4273 	bool unprivileged;
4274 
4275 	ublk_stop_dev(ub);
4276 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
4277 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
4278 	ublk_put_device(ub);
4279 
4280 	if (unprivileged)
4281 		unprivileged_ublks_added--;
4282 }
4283 
4284 static struct ublk_device *ublk_get_device_from_id(int idx)
4285 {
4286 	struct ublk_device *ub = NULL;
4287 
4288 	if (idx < 0)
4289 		return NULL;
4290 
4291 	spin_lock(&ublk_idr_lock);
4292 	ub = idr_find(&ublk_index_idr, idx);
4293 	if (ub)
4294 		ub = ublk_get_device(ub);
4295 	spin_unlock(&ublk_idr_lock);
4296 
4297 	return ub;
4298 }
4299 
4300 static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
4301 {
4302 	rcu_read_lock();
4303 	ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
4304 	rcu_read_unlock();
4305 
4306 	return ub->ublksrv_tgid == ublksrv_pid;
4307 }
4308 
4309 static int ublk_ctrl_start_dev(struct ublk_device *ub,
4310 		const struct ublksrv_ctrl_cmd *header)
4311 {
4312 	const struct ublk_param_basic *p = &ub->params.basic;
4313 	int ublksrv_pid = (int)header->data[0];
4314 	struct queue_limits lim = {
4315 		.logical_block_size	= 1 << p->logical_bs_shift,
4316 		.physical_block_size	= 1 << p->physical_bs_shift,
4317 		.io_min			= 1 << p->io_min_shift,
4318 		.io_opt			= 1 << p->io_opt_shift,
4319 		.max_hw_sectors		= p->max_sectors,
4320 		.chunk_sectors		= p->chunk_sectors,
4321 		.virt_boundary_mask	= p->virt_boundary_mask,
4322 		.max_segments		= USHRT_MAX,
4323 		.max_segment_size	= UINT_MAX,
4324 		.dma_alignment		= 3,
4325 	};
4326 	struct gendisk *disk;
4327 	int ret = -EINVAL;
4328 
4329 	if (ublksrv_pid <= 0)
4330 		return -EINVAL;
4331 	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
4332 		return -EINVAL;
4333 
4334 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
4335 		const struct ublk_param_discard *pd = &ub->params.discard;
4336 
4337 		lim.discard_alignment = pd->discard_alignment;
4338 		lim.discard_granularity = pd->discard_granularity;
4339 		lim.max_hw_discard_sectors = pd->max_discard_sectors;
4340 		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
4341 		lim.max_discard_segments = pd->max_discard_segments;
4342 	}
4343 
4344 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
4345 		const struct ublk_param_zoned *p = &ub->params.zoned;
4346 
4347 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
4348 			return -EOPNOTSUPP;
4349 
4350 		lim.features |= BLK_FEAT_ZONED;
4351 		lim.max_active_zones = p->max_active_zones;
4352 		lim.max_open_zones =  p->max_open_zones;
4353 		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
4354 	}
4355 
4356 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
4357 		lim.features |= BLK_FEAT_WRITE_CACHE;
4358 		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
4359 			lim.features |= BLK_FEAT_FUA;
4360 	}
4361 
4362 	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
4363 		lim.features |= BLK_FEAT_ROTATIONAL;
4364 
4365 	if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
4366 		lim.dma_alignment = ub->params.dma.alignment;
4367 
4368 	if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
4369 		lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
4370 		lim.max_segment_size = ub->params.seg.max_segment_size;
4371 		lim.max_segments = ub->params.seg.max_segments;
4372 	}
4373 
4374 	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
4375 		const struct ublk_param_integrity *p = &ub->params.integrity;
4376 		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
4377 
4378 		lim.max_integrity_segments =
4379 			p->max_integrity_segments ?: USHRT_MAX;
4380 		lim.integrity = (struct blk_integrity) {
4381 			.flags = ublk_integrity_flags(p->flags),
4382 			.csum_type = ublk_integrity_csum_type(p->csum_type),
4383 			.metadata_size = p->metadata_size,
4384 			.pi_offset = p->pi_offset,
4385 			.interval_exp = p->interval_exp,
4386 			.tag_size = p->tag_size,
4387 			.pi_tuple_size = pi_tuple_size,
4388 		};
4389 	}
4390 
4391 	if (wait_for_completion_interruptible(&ub->completion) != 0)
4392 		return -EINTR;
4393 
4394 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4395 		return -EINVAL;
4396 
4397 	mutex_lock(&ub->mutex);
4398 	/* device may become not ready in case of F_BATCH */
4399 	if (!ublk_dev_ready(ub)) {
4400 		ret = -EINVAL;
4401 		goto out_unlock;
4402 	}
4403 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
4404 	    test_bit(UB_STATE_USED, &ub->state)) {
4405 		ret = -EEXIST;
4406 		goto out_unlock;
4407 	}
4408 
4409 	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
4410 	if (IS_ERR(disk)) {
4411 		ret = PTR_ERR(disk);
4412 		goto out_unlock;
4413 	}
4414 	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
4415 	disk->fops = &ub_fops;
4416 	disk->private_data = ub;
4417 
4418 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4419 	ub->ub_disk = disk;
4420 
4421 	ublk_apply_params(ub);
4422 
4423 	/*
4424 	 * Suppress partition scan to avoid potential IO hang.
4425 	 *
4426 	 * If ublk server error occurs during partition scan, the IO may
4427 	 * wait while holding ub->mutex, which can deadlock with other
4428 	 * operations that need the mutex. Defer partition scan to async
4429 	 * work.
4430 	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
4431 	 * permanently.
4432 	 */
4433 	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4434 
4435 	ublk_get_device(ub);
4436 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4437 
4438 	if (ublk_dev_is_zoned(ub)) {
4439 		ret = ublk_revalidate_disk_zones(ub);
4440 		if (ret)
4441 			goto out_put_cdev;
4442 	}
4443 
4444 	ret = add_disk(disk);
4445 	if (ret)
4446 		goto out_put_cdev;
4447 
4448 	set_bit(UB_STATE_USED, &ub->state);
4449 
4450 	/* Skip partition scan if disabled by user */
4451 	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
4452 		clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
4453 	} else {
4454 		/* Schedule async partition scan for trusted daemons */
4455 		if (!ub->unprivileged_daemons)
4456 			schedule_work(&ub->partition_scan_work);
4457 	}
4458 
4459 out_put_cdev:
4460 	if (ret) {
4461 		ublk_detach_disk(ub);
4462 		ublk_put_device(ub);
4463 	}
4464 	if (ret)
4465 		put_disk(disk);
4466 out_unlock:
4467 	mutex_unlock(&ub->mutex);
4468 	return ret;
4469 }
4470 
4471 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
4472 		const struct ublksrv_ctrl_cmd *header)
4473 {
4474 	void __user *argp = (void __user *)(unsigned long)header->addr;
4475 	cpumask_var_t cpumask;
4476 	unsigned long queue;
4477 	unsigned int retlen;
4478 	unsigned int i;
4479 	int ret;
4480 
4481 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
4482 		return -EINVAL;
4483 	if (header->len & (sizeof(unsigned long)-1))
4484 		return -EINVAL;
4485 	if (!header->addr)
4486 		return -EINVAL;
4487 
4488 	queue = header->data[0];
4489 	if (queue >= ub->dev_info.nr_hw_queues)
4490 		return -EINVAL;
4491 
4492 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
4493 		return -ENOMEM;
4494 
4495 	for_each_possible_cpu(i) {
4496 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
4497 			cpumask_set_cpu(i, cpumask);
4498 	}
4499 
4500 	ret = -EFAULT;
4501 	retlen = min_t(unsigned short, header->len, cpumask_size());
4502 	if (copy_to_user(argp, cpumask, retlen))
4503 		goto out_free_cpumask;
4504 	if (retlen != header->len &&
4505 	    clear_user(argp + retlen, header->len - retlen))
4506 		goto out_free_cpumask;
4507 
4508 	ret = 0;
4509 out_free_cpumask:
4510 	free_cpumask_var(cpumask);
4511 	return ret;
4512 }
4513 
4514 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
4515 {
4516 	pr_devel("%s: dev id %d flags %llx\n", __func__,
4517 			info->dev_id, info->flags);
4518 	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
4519 			info->nr_hw_queues, info->queue_depth);
4520 }
4521 
4522 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
4523 {
4524 	void __user *argp = (void __user *)(unsigned long)header->addr;
4525 	struct ublksrv_ctrl_dev_info info;
4526 	struct ublk_device *ub;
4527 	int ret = -EINVAL;
4528 
4529 	if (header->len < sizeof(info) || !header->addr)
4530 		return -EINVAL;
4531 	if (header->queue_id != (u16)-1) {
4532 		pr_warn("%s: queue_id is wrong %x\n",
4533 			__func__, header->queue_id);
4534 		return -EINVAL;
4535 	}
4536 
4537 	if (copy_from_user(&info, argp, sizeof(info)))
4538 		return -EFAULT;
4539 
4540 	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
4541 	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
4542 		return -EINVAL;
4543 
4544 	if (capable(CAP_SYS_ADMIN))
4545 		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
4546 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
4547 		return -EPERM;
4548 
4549 	/* forbid nonsense combinations of recovery flags */
4550 	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
4551 	case 0:
4552 	case UBLK_F_USER_RECOVERY:
4553 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
4554 	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
4555 		break;
4556 	default:
4557 		pr_warn("%s: invalid recovery flags %llx\n", __func__,
4558 			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
4559 		return -EINVAL;
4560 	}
4561 
4562 	if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
4563 		pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
4564 		return -EINVAL;
4565 	}
4566 
4567 	/*
4568 	 * unprivileged device can't be trusted, but RECOVERY and
4569 	 * RECOVERY_REISSUE still may hang error handling, so can't
4570 	 * support recovery features for unprivileged ublk now
4571 	 *
4572 	 * TODO: provide forward progress for RECOVERY handler, so that
4573 	 * unprivileged device can benefit from it
4574 	 */
4575 	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
4576 		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
4577 				UBLK_F_USER_RECOVERY);
4578 
4579 		/*
4580 		 * For USER_COPY, we depends on userspace to fill request
4581 		 * buffer by pwrite() to ublk char device, which can't be
4582 		 * used for unprivileged device
4583 		 *
4584 		 * Same with zero copy or auto buffer register.
4585 		 */
4586 		if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4587 					UBLK_F_AUTO_BUF_REG))
4588 			return -EINVAL;
4589 	}
4590 
4591 	/* User copy is required to access integrity buffer */
4592 	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
4593 		return -EINVAL;
4594 
4595 	/* the created device is always owned by current user */
4596 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
4597 
4598 	if (header->dev_id != info.dev_id) {
4599 		pr_warn("%s: dev id not match %u %u\n",
4600 			__func__, header->dev_id, info.dev_id);
4601 		return -EINVAL;
4602 	}
4603 
4604 	if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
4605 		pr_warn("%s: dev id is too large. Max supported is %d\n",
4606 			__func__, UBLK_MAX_UBLKS - 1);
4607 		return -EINVAL;
4608 	}
4609 
4610 	ublk_dump_dev_info(&info);
4611 
4612 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4613 	if (ret)
4614 		return ret;
4615 
4616 	ret = -EACCES;
4617 	if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
4618 	    unprivileged_ublks_added >= unprivileged_ublks_max)
4619 		goto out_unlock;
4620 
4621 	ret = -ENOMEM;
4622 	ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
4623 	if (!ub)
4624 		goto out_unlock;
4625 	mutex_init(&ub->mutex);
4626 	spin_lock_init(&ub->lock);
4627 	mutex_init(&ub->cancel_mutex);
4628 	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
4629 
4630 	ret = ublk_alloc_dev_number(ub, header->dev_id);
4631 	if (ret < 0)
4632 		goto out_free_ub;
4633 
4634 	memcpy(&ub->dev_info, &info, sizeof(info));
4635 
4636 	/* update device id */
4637 	ub->dev_info.dev_id = ub->ub_number;
4638 
4639 	/*
4640 	 * 64bit flags will be copied back to userspace as feature
4641 	 * negotiation result, so have to clear flags which driver
4642 	 * doesn't support yet, then userspace can get correct flags
4643 	 * (features) to handle.
4644 	 */
4645 	ub->dev_info.flags &= UBLK_F_ALL;
4646 
4647 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
4648 		UBLK_F_URING_CMD_COMP_IN_TASK |
4649 		UBLK_F_PER_IO_DAEMON |
4650 		UBLK_F_BUF_REG_OFF_DAEMON |
4651 		UBLK_F_SAFE_STOP_DEV;
4652 
4653 	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
4654 	if (ublk_dev_support_batch_io(ub))
4655 		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
4656 
4657 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
4658 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
4659 				UBLK_F_AUTO_BUF_REG))
4660 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4661 
4662 	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
4663 	if (ublk_dev_support_batch_io(ub))
4664 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
4665 
4666 	/*
4667 	 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
4668 	 * returning write_append_lba, which is only allowed in case of
4669 	 * user copy or zero copy
4670 	 */
4671 	if (ublk_dev_is_zoned(ub) &&
4672 	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
4673 	     (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
4674 		ret = -EINVAL;
4675 		goto out_free_dev_number;
4676 	}
4677 
4678 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
4679 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
4680 	ublk_align_max_io_size(ub);
4681 
4682 	ret = ublk_add_tag_set(ub);
4683 	if (ret)
4684 		goto out_free_dev_number;
4685 
4686 	ret = ublk_init_queues(ub);
4687 	if (ret)
4688 		goto out_free_tag_set;
4689 
4690 	ret = -EFAULT;
4691 	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
4692 		goto out_deinit_queues;
4693 
4694 	/*
4695 	 * Add the char dev so that ublksrv daemon can be setup.
4696 	 * ublk_add_chdev() will cleanup everything if it fails.
4697 	 */
4698 	ret = ublk_add_chdev(ub);
4699 	goto out_unlock;
4700 
4701 out_deinit_queues:
4702 	ublk_deinit_queues(ub);
4703 out_free_tag_set:
4704 	blk_mq_free_tag_set(&ub->tag_set);
4705 out_free_dev_number:
4706 	ublk_free_dev_number(ub);
4707 out_free_ub:
4708 	mutex_destroy(&ub->mutex);
4709 	mutex_destroy(&ub->cancel_mutex);
4710 	kfree(ub);
4711 out_unlock:
4712 	mutex_unlock(&ublk_ctl_mutex);
4713 	return ret;
4714 }
4715 
4716 static inline bool ublk_idr_freed(int id)
4717 {
4718 	void *ptr;
4719 
4720 	spin_lock(&ublk_idr_lock);
4721 	ptr = idr_find(&ublk_index_idr, id);
4722 	spin_unlock(&ublk_idr_lock);
4723 
4724 	return ptr == NULL;
4725 }
4726 
4727 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
4728 {
4729 	struct ublk_device *ub = *p_ub;
4730 	int idx = ub->ub_number;
4731 	int ret;
4732 
4733 	ret = mutex_lock_killable(&ublk_ctl_mutex);
4734 	if (ret)
4735 		return ret;
4736 
4737 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
4738 		ublk_remove(ub);
4739 		set_bit(UB_STATE_DELETED, &ub->state);
4740 	}
4741 
4742 	/* Mark the reference as consumed */
4743 	*p_ub = NULL;
4744 	ublk_put_device(ub);
4745 	mutex_unlock(&ublk_ctl_mutex);
4746 
4747 	/*
4748 	 * Wait until the idr is removed, then it can be reused after
4749 	 * DEL_DEV command is returned.
4750 	 *
4751 	 * If we returns because of user interrupt, future delete command
4752 	 * may come:
4753 	 *
4754 	 * - the device number isn't freed, this device won't or needn't
4755 	 *   be deleted again, since UB_STATE_DELETED is set, and device
4756 	 *   will be released after the last reference is dropped
4757 	 *
4758 	 * - the device number is freed already, we will not find this
4759 	 *   device via ublk_get_device_from_id()
4760 	 */
4761 	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
4762 		return -EINTR;
4763 	return 0;
4764 }
4765 
4766 static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
4767 				      const struct ublksrv_ctrl_cmd *header)
4768 {
4769 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
4770 			__func__, cmd_op, header->dev_id, header->queue_id,
4771 			header->data[0], header->addr, header->len);
4772 }
4773 
4774 static void ublk_ctrl_stop_dev(struct ublk_device *ub)
4775 {
4776 	ublk_stop_dev(ub);
4777 }
4778 
4779 static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
4780 {
4781 	struct gendisk *disk;
4782 	int ret = 0;
4783 
4784 	disk = ublk_get_disk(ub);
4785 	if (!disk)
4786 		return -ENODEV;
4787 
4788 	mutex_lock(&disk->open_mutex);
4789 	if (disk_openers(disk) > 0) {
4790 		ret = -EBUSY;
4791 		goto unlock;
4792 	}
4793 	ub->block_open = true;
4794 	/* release open_mutex as del_gendisk() will reacquire it */
4795 	mutex_unlock(&disk->open_mutex);
4796 
4797 	ublk_ctrl_stop_dev(ub);
4798 	goto out;
4799 
4800 unlock:
4801 	mutex_unlock(&disk->open_mutex);
4802 out:
4803 	ublk_put_disk(disk);
4804 	return ret;
4805 }
4806 
4807 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
4808 		const struct ublksrv_ctrl_cmd *header)
4809 {
4810 	struct task_struct *p;
4811 	struct pid *pid;
4812 	struct ublksrv_ctrl_dev_info dev_info;
4813 	pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
4814 	void __user *argp = (void __user *)(unsigned long)header->addr;
4815 
4816 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
4817 		return -EINVAL;
4818 
4819 	memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
4820 	dev_info.ublksrv_pid = -1;
4821 
4822 	if (init_ublksrv_tgid > 0) {
4823 		rcu_read_lock();
4824 		pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
4825 		p = pid_task(pid, PIDTYPE_TGID);
4826 		if (p) {
4827 			int vnr = task_tgid_vnr(p);
4828 
4829 			if (vnr)
4830 				dev_info.ublksrv_pid = vnr;
4831 		}
4832 		rcu_read_unlock();
4833 	}
4834 
4835 	if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
4836 		return -EFAULT;
4837 
4838 	return 0;
4839 }
4840 
4841 /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
4842 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
4843 {
4844 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
4845 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
4846 
4847 	if (ub->ub_disk) {
4848 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
4849 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
4850 	} else {
4851 		ub->params.devt.disk_major = 0;
4852 		ub->params.devt.disk_minor = 0;
4853 	}
4854 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
4855 }
4856 
4857 static int ublk_ctrl_get_params(struct ublk_device *ub,
4858 		const struct ublksrv_ctrl_cmd *header)
4859 {
4860 	void __user *argp = (void __user *)(unsigned long)header->addr;
4861 	struct ublk_params_header ph;
4862 	int ret;
4863 
4864 	if (header->len <= sizeof(ph) || !header->addr)
4865 		return -EINVAL;
4866 
4867 	if (copy_from_user(&ph, argp, sizeof(ph)))
4868 		return -EFAULT;
4869 
4870 	if (ph.len > header->len || !ph.len)
4871 		return -EINVAL;
4872 
4873 	if (ph.len > sizeof(struct ublk_params))
4874 		ph.len = sizeof(struct ublk_params);
4875 
4876 	mutex_lock(&ub->mutex);
4877 	ublk_ctrl_fill_params_devt(ub);
4878 	if (copy_to_user(argp, &ub->params, ph.len))
4879 		ret = -EFAULT;
4880 	else
4881 		ret = 0;
4882 	mutex_unlock(&ub->mutex);
4883 
4884 	return ret;
4885 }
4886 
4887 static int ublk_ctrl_set_params(struct ublk_device *ub,
4888 		const struct ublksrv_ctrl_cmd *header)
4889 {
4890 	void __user *argp = (void __user *)(unsigned long)header->addr;
4891 	struct ublk_params_header ph;
4892 	int ret = -EFAULT;
4893 
4894 	if (header->len <= sizeof(ph) || !header->addr)
4895 		return -EINVAL;
4896 
4897 	if (copy_from_user(&ph, argp, sizeof(ph)))
4898 		return -EFAULT;
4899 
4900 	if (ph.len > header->len || !ph.len || !ph.types)
4901 		return -EINVAL;
4902 
4903 	if (ph.len > sizeof(struct ublk_params))
4904 		ph.len = sizeof(struct ublk_params);
4905 
4906 	mutex_lock(&ub->mutex);
4907 	if (test_bit(UB_STATE_USED, &ub->state)) {
4908 		/*
4909 		 * Parameters can only be changed when device hasn't
4910 		 * been started yet
4911 		 */
4912 		ret = -EACCES;
4913 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
4914 		ret = -EFAULT;
4915 	} else {
4916 		/* clear all we don't support yet */
4917 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
4918 		ret = ublk_validate_params(ub);
4919 		if (ret)
4920 			ub->params.types = 0;
4921 	}
4922 	mutex_unlock(&ub->mutex);
4923 
4924 	return ret;
4925 }
4926 
4927 static int ublk_ctrl_start_recovery(struct ublk_device *ub)
4928 {
4929 	int ret = -EINVAL;
4930 
4931 	mutex_lock(&ub->mutex);
4932 	if (ublk_nosrv_should_stop_dev(ub))
4933 		goto out_unlock;
4934 	/*
4935 	 * START_RECOVERY is only allowd after:
4936 	 *
4937 	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
4938 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
4939 	 *     released.
4940 	 *
4941 	 * and one of the following holds
4942 	 *
4943 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
4944 	 *     (a)has quiesced request queue
4945 	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
4946 	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
4947 	 *     (d)has completed/camceled all ioucmds owned by ther dying process
4948 	 *
4949 	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
4950 	 *     quiesced, but all I/O is being immediately errored
4951 	 */
4952 	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
4953 		ret = -EBUSY;
4954 		goto out_unlock;
4955 	}
4956 	pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
4957 	init_completion(&ub->completion);
4958 	ret = 0;
4959  out_unlock:
4960 	mutex_unlock(&ub->mutex);
4961 	return ret;
4962 }
4963 
4964 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
4965 		const struct ublksrv_ctrl_cmd *header)
4966 {
4967 	int ublksrv_pid = (int)header->data[0];
4968 	int ret = -EINVAL;
4969 
4970 	pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
4971 		 header->dev_id);
4972 
4973 	if (wait_for_completion_interruptible(&ub->completion))
4974 		return -EINTR;
4975 
4976 	pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
4977 		 header->dev_id);
4978 
4979 	if (!ublk_validate_user_pid(ub, ublksrv_pid))
4980 		return -EINVAL;
4981 
4982 	mutex_lock(&ub->mutex);
4983 	if (ublk_nosrv_should_stop_dev(ub))
4984 		goto out_unlock;
4985 
4986 	if (!ublk_dev_in_recoverable_state(ub)) {
4987 		ret = -EBUSY;
4988 		goto out_unlock;
4989 	}
4990 	ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
4991 	ub->dev_info.state = UBLK_S_DEV_LIVE;
4992 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
4993 			__func__, ublksrv_pid, header->dev_id);
4994 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
4995 	ret = 0;
4996  out_unlock:
4997 	mutex_unlock(&ub->mutex);
4998 	return ret;
4999 }
5000 
5001 static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
5002 {
5003 	void __user *argp = (void __user *)(unsigned long)header->addr;
5004 	u64 features = UBLK_F_ALL;
5005 
5006 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
5007 		return -EINVAL;
5008 
5009 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
5010 		return -EFAULT;
5011 
5012 	return 0;
5013 }
5014 
5015 static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
5016 {
5017 	struct ublk_param_basic *p = &ub->params.basic;
5018 	u64 new_size = header->data[0];
5019 
5020 	mutex_lock(&ub->mutex);
5021 	p->dev_sectors = new_size;
5022 	set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
5023 	mutex_unlock(&ub->mutex);
5024 }
5025 
5026 struct count_busy {
5027 	const struct ublk_queue *ubq;
5028 	unsigned int nr_busy;
5029 };
5030 
5031 static bool ublk_count_busy_req(struct request *rq, void *data)
5032 {
5033 	struct count_busy *idle = data;
5034 
5035 	if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
5036 		idle->nr_busy += 1;
5037 	return true;
5038 }
5039 
5040 /* uring_cmd is guaranteed to be active if the associated request is idle */
5041 static bool ubq_has_idle_io(const struct ublk_queue *ubq)
5042 {
5043 	struct count_busy data = {
5044 		.ubq = ubq,
5045 	};
5046 
5047 	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
5048 	return data.nr_busy < ubq->q_depth;
5049 }
5050 
5051 /* Wait until each hw queue has at least one idle IO */
5052 static int ublk_wait_for_idle_io(struct ublk_device *ub,
5053 				 unsigned int timeout_ms)
5054 {
5055 	unsigned int elapsed = 0;
5056 	int ret;
5057 
5058 	/*
5059 	 * For UBLK_F_BATCH_IO ublk server can get notified with existing
5060 	 * or new fetch command, so needn't wait any more
5061 	 */
5062 	if (ublk_dev_support_batch_io(ub))
5063 		return 0;
5064 
5065 	while (elapsed < timeout_ms && !signal_pending(current)) {
5066 		unsigned int queues_cancelable = 0;
5067 		int i;
5068 
5069 		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
5070 			struct ublk_queue *ubq = ublk_get_queue(ub, i);
5071 
5072 			queues_cancelable += !!ubq_has_idle_io(ubq);
5073 		}
5074 
5075 		/*
5076 		 * Each queue needs at least one active command for
5077 		 * notifying ublk server
5078 		 */
5079 		if (queues_cancelable == ub->dev_info.nr_hw_queues)
5080 			break;
5081 
5082 		msleep(UBLK_REQUEUE_DELAY_MS);
5083 		elapsed += UBLK_REQUEUE_DELAY_MS;
5084 	}
5085 
5086 	if (signal_pending(current))
5087 		ret = -EINTR;
5088 	else if (elapsed >= timeout_ms)
5089 		ret = -EBUSY;
5090 	else
5091 		ret = 0;
5092 
5093 	return ret;
5094 }
5095 
5096 static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
5097 				 const struct ublksrv_ctrl_cmd *header)
5098 {
5099 	/* zero means wait forever */
5100 	u64 timeout_ms = header->data[0];
5101 	struct gendisk *disk;
5102 	int ret = -ENODEV;
5103 
5104 	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
5105 		return -EOPNOTSUPP;
5106 
5107 	mutex_lock(&ub->mutex);
5108 	disk = ublk_get_disk(ub);
5109 	if (!disk)
5110 		goto unlock;
5111 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
5112 		goto put_disk;
5113 
5114 	ret = 0;
5115 	/* already in expected state */
5116 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
5117 		goto put_disk;
5118 
5119 	/* Mark the device as canceling */
5120 	mutex_lock(&ub->cancel_mutex);
5121 	blk_mq_quiesce_queue(disk->queue);
5122 	ublk_set_canceling(ub, true);
5123 	blk_mq_unquiesce_queue(disk->queue);
5124 	mutex_unlock(&ub->cancel_mutex);
5125 
5126 	if (!timeout_ms)
5127 		timeout_ms = UINT_MAX;
5128 	ret = ublk_wait_for_idle_io(ub, timeout_ms);
5129 
5130 put_disk:
5131 	ublk_put_disk(disk);
5132 unlock:
5133 	mutex_unlock(&ub->mutex);
5134 
5135 	/* Cancel pending uring_cmd */
5136 	if (!ret)
5137 		ublk_cancel_dev(ub);
5138 	return ret;
5139 }
5140 
5141 /*
5142  * All control commands are sent via /dev/ublk-control, so we have to check
5143  * the destination device's permission
5144  */
5145 static int ublk_char_dev_permission(struct ublk_device *ub,
5146 		const char *dev_path, int mask)
5147 {
5148 	int err;
5149 	struct path path;
5150 	struct kstat stat;
5151 
5152 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
5153 	if (err)
5154 		return err;
5155 
5156 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
5157 	if (err)
5158 		goto exit;
5159 
5160 	err = -EPERM;
5161 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
5162 		goto exit;
5163 
5164 	err = inode_permission(&nop_mnt_idmap,
5165 			d_backing_inode(path.dentry), mask);
5166 exit:
5167 	path_put(&path);
5168 	return err;
5169 }
5170 
5171 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
5172 		u32 cmd_op, struct ublksrv_ctrl_cmd *header)
5173 {
5174 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
5175 	void __user *argp = (void __user *)(unsigned long)header->addr;
5176 	char *dev_path = NULL;
5177 	int ret = 0;
5178 	int mask;
5179 
5180 	if (!unprivileged) {
5181 		if (!capable(CAP_SYS_ADMIN))
5182 			return -EPERM;
5183 		/*
5184 		 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
5185 		 * char_dev_path in payload too, since userspace may not
5186 		 * know if the specified device is created as unprivileged
5187 		 * mode.
5188 		 */
5189 		if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
5190 			return 0;
5191 	}
5192 
5193 	/*
5194 	 * User has to provide the char device path for unprivileged ublk
5195 	 *
5196 	 * header->addr always points to the dev path buffer, and
5197 	 * header->dev_path_len records length of dev path buffer.
5198 	 */
5199 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
5200 		return -EINVAL;
5201 
5202 	if (header->len < header->dev_path_len)
5203 		return -EINVAL;
5204 
5205 	dev_path = memdup_user_nul(argp, header->dev_path_len);
5206 	if (IS_ERR(dev_path))
5207 		return PTR_ERR(dev_path);
5208 
5209 	ret = -EINVAL;
5210 	switch (_IOC_NR(cmd_op)) {
5211 	case UBLK_CMD_GET_DEV_INFO:
5212 	case UBLK_CMD_GET_DEV_INFO2:
5213 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5214 	case UBLK_CMD_GET_PARAMS:
5215 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
5216 		mask = MAY_READ;
5217 		break;
5218 	case UBLK_CMD_START_DEV:
5219 	case UBLK_CMD_STOP_DEV:
5220 	case UBLK_CMD_ADD_DEV:
5221 	case UBLK_CMD_DEL_DEV:
5222 	case UBLK_CMD_SET_PARAMS:
5223 	case UBLK_CMD_START_USER_RECOVERY:
5224 	case UBLK_CMD_END_USER_RECOVERY:
5225 	case UBLK_CMD_UPDATE_SIZE:
5226 	case UBLK_CMD_QUIESCE_DEV:
5227 	case UBLK_CMD_TRY_STOP_DEV:
5228 		mask = MAY_READ | MAY_WRITE;
5229 		break;
5230 	default:
5231 		goto exit;
5232 	}
5233 
5234 	ret = ublk_char_dev_permission(ub, dev_path, mask);
5235 	if (!ret) {
5236 		header->len -= header->dev_path_len;
5237 		header->addr += header->dev_path_len;
5238 	}
5239 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
5240 			__func__, ub->ub_number, cmd_op,
5241 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
5242 			dev_path, ret);
5243 exit:
5244 	kfree(dev_path);
5245 	return ret;
5246 }
5247 
5248 static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
5249 {
5250 	switch (_IOC_NR(cmd_op)) {
5251 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5252 	case UBLK_CMD_GET_DEV_INFO:
5253 	case UBLK_CMD_GET_DEV_INFO2:
5254 	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
5255 		return false;
5256 	default:
5257 		return true;
5258 	}
5259 }
5260 
5261 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
5262 		unsigned int issue_flags)
5263 {
5264 	/* May point to userspace-mapped memory */
5265 	const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
5266 								    struct ublksrv_ctrl_cmd);
5267 	struct ublksrv_ctrl_cmd header;
5268 	struct ublk_device *ub = NULL;
5269 	u32 cmd_op = cmd->cmd_op;
5270 	int ret = -EINVAL;
5271 
5272 	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
5273 	    issue_flags & IO_URING_F_NONBLOCK)
5274 		return -EAGAIN;
5275 
5276 	if (!(issue_flags & IO_URING_F_SQE128))
5277 		return -EINVAL;
5278 
5279 	header.dev_id = READ_ONCE(ub_src->dev_id);
5280 	header.queue_id = READ_ONCE(ub_src->queue_id);
5281 	header.len = READ_ONCE(ub_src->len);
5282 	header.addr = READ_ONCE(ub_src->addr);
5283 	header.data[0] = READ_ONCE(ub_src->data[0]);
5284 	header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
5285 	ublk_ctrl_cmd_dump(cmd_op, &header);
5286 
5287 	ret = ublk_check_cmd_op(cmd_op);
5288 	if (ret)
5289 		goto out;
5290 
5291 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
5292 		ret = ublk_ctrl_get_features(&header);
5293 		goto out;
5294 	}
5295 
5296 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
5297 		ret = -ENODEV;
5298 		ub = ublk_get_device_from_id(header.dev_id);
5299 		if (!ub)
5300 			goto out;
5301 
5302 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
5303 		if (ret)
5304 			goto put_dev;
5305 	}
5306 
5307 	switch (_IOC_NR(cmd_op)) {
5308 	case UBLK_CMD_START_DEV:
5309 		ret = ublk_ctrl_start_dev(ub, &header);
5310 		break;
5311 	case UBLK_CMD_STOP_DEV:
5312 		ublk_ctrl_stop_dev(ub);
5313 		ret = 0;
5314 		break;
5315 	case UBLK_CMD_GET_DEV_INFO:
5316 	case UBLK_CMD_GET_DEV_INFO2:
5317 		ret = ublk_ctrl_get_dev_info(ub, &header);
5318 		break;
5319 	case UBLK_CMD_ADD_DEV:
5320 		ret = ublk_ctrl_add_dev(&header);
5321 		break;
5322 	case UBLK_CMD_DEL_DEV:
5323 		ret = ublk_ctrl_del_dev(&ub, true);
5324 		break;
5325 	case UBLK_CMD_DEL_DEV_ASYNC:
5326 		ret = ublk_ctrl_del_dev(&ub, false);
5327 		break;
5328 	case UBLK_CMD_GET_QUEUE_AFFINITY:
5329 		ret = ublk_ctrl_get_queue_affinity(ub, &header);
5330 		break;
5331 	case UBLK_CMD_GET_PARAMS:
5332 		ret = ublk_ctrl_get_params(ub, &header);
5333 		break;
5334 	case UBLK_CMD_SET_PARAMS:
5335 		ret = ublk_ctrl_set_params(ub, &header);
5336 		break;
5337 	case UBLK_CMD_START_USER_RECOVERY:
5338 		ret = ublk_ctrl_start_recovery(ub);
5339 		break;
5340 	case UBLK_CMD_END_USER_RECOVERY:
5341 		ret = ublk_ctrl_end_recovery(ub, &header);
5342 		break;
5343 	case UBLK_CMD_UPDATE_SIZE:
5344 		ublk_ctrl_set_size(ub, &header);
5345 		ret = 0;
5346 		break;
5347 	case UBLK_CMD_QUIESCE_DEV:
5348 		ret = ublk_ctrl_quiesce_dev(ub, &header);
5349 		break;
5350 	case UBLK_CMD_TRY_STOP_DEV:
5351 		ret = ublk_ctrl_try_stop_dev(ub);
5352 		break;
5353 	default:
5354 		ret = -EOPNOTSUPP;
5355 		break;
5356 	}
5357 
5358  put_dev:
5359 	if (ub)
5360 		ublk_put_device(ub);
5361  out:
5362 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
5363 			__func__, ret, cmd_op, header.dev_id, header.queue_id);
5364 	return ret;
5365 }
5366 
5367 static const struct file_operations ublk_ctl_fops = {
5368 	.open		= nonseekable_open,
5369 	.uring_cmd      = ublk_ctrl_uring_cmd,
5370 	.owner		= THIS_MODULE,
5371 	.llseek		= noop_llseek,
5372 };
5373 
5374 static struct miscdevice ublk_misc = {
5375 	.minor		= MISC_DYNAMIC_MINOR,
5376 	.name		= "ublk-control",
5377 	.fops		= &ublk_ctl_fops,
5378 };
5379 
5380 static int __init ublk_init(void)
5381 {
5382 	int ret;
5383 
5384 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
5385 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
5386 	/*
5387 	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
5388 	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
5389 	 */
5390 	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
5391 		     UBLKSRV_IO_INTEGRITY_FLAG);
5392 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
5393 
5394 	init_waitqueue_head(&ublk_idr_wq);
5395 
5396 	ret = misc_register(&ublk_misc);
5397 	if (ret)
5398 		return ret;
5399 
5400 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
5401 	if (ret)
5402 		goto unregister_mis;
5403 
5404 	ret = class_register(&ublk_chr_class);
5405 	if (ret)
5406 		goto free_chrdev_region;
5407 
5408 	return 0;
5409 
5410 free_chrdev_region:
5411 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5412 unregister_mis:
5413 	misc_deregister(&ublk_misc);
5414 	return ret;
5415 }
5416 
5417 static void __exit ublk_exit(void)
5418 {
5419 	struct ublk_device *ub;
5420 	int id;
5421 
5422 	idr_for_each_entry(&ublk_index_idr, ub, id)
5423 		ublk_remove(ub);
5424 
5425 	class_unregister(&ublk_chr_class);
5426 	misc_deregister(&ublk_misc);
5427 
5428 	idr_destroy(&ublk_index_idr);
5429 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
5430 }
5431 
5432 module_init(ublk_init);
5433 module_exit(ublk_exit);
5434 
5435 static int ublk_set_max_unprivileged_ublks(const char *buf,
5436 					   const struct kernel_param *kp)
5437 {
5438 	return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
5439 }
5440 
5441 static int ublk_get_max_unprivileged_ublks(char *buf,
5442 					   const struct kernel_param *kp)
5443 {
5444 	return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
5445 }
5446 
5447 static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
5448 	.set = ublk_set_max_unprivileged_ublks,
5449 	.get = ublk_get_max_unprivileged_ublks,
5450 };
5451 
5452 module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
5453 		&unprivileged_ublks_max, 0644);
5454 MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");
5455 
5456 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
5457 MODULE_DESCRIPTION("Userspace block device");
5458 MODULE_LICENSE("GPL");
5459