// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2011-2014, Intel Corporation.
 * Copyright (c) 2017-2021 Christoph Hellwig.
 */
#include <linux/blk-integrity.h>
#include <linux/ptrace.h>	/* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
#include <linux/io_uring/cmd.h>
#include "nvme.h"

enum {
	NVME_IOCTL_VEC		= (1 << 0),
	NVME_IOCTL_PARTITION	= (1 << 1),
};

static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
		unsigned int flags, bool open_for_write)
{
	u32 effects;

	/*
	 * Do not allow unprivileged passthrough on partitions, as that allows an
	 * escape from the containment of the partition.
	 */
	if (flags & NVME_IOCTL_PARTITION)
		goto admin;

	/*
	 * Do not allow unprivileged processes to send vendor-specific or fabrics
	 * commands as we can't be sure about their effects.
	 */
	if (c->common.opcode >= nvme_cmd_vendor_start ||
	    c->common.opcode == nvme_fabrics_command)
		goto admin;

	/*
	 * Do not allow unprivileged passthrough of admin commands except
	 * for a subset of identify commands that contain information required
	 * to form proper I/O commands in userspace and do not expose any
	 * potentially sensitive information.
	 */
	if (!ns) {
		if (c->common.opcode == nvme_admin_identify) {
			switch (c->identify.cns) {
			case NVME_ID_CNS_NS:
			case NVME_ID_CNS_CS_NS:
			case NVME_ID_CNS_NS_CS_INDEP:
			case NVME_ID_CNS_CS_CTRL:
			case NVME_ID_CNS_CTRL:
				return true;
			}
		}
		goto admin;
	}

	/*
	 * Check if the controller provides a Commands Supported and Effects log
	 * and marks this command as supported.  If not, reject unprivileged
	 * passthrough.
	 */
	effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode);
	if (!(effects & NVME_CMD_EFFECTS_CSUPP))
		goto admin;

	/*
	 * Don't allow passthrough for commands that have intrusive (or unknown)
	 * effects.
	 */
	if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
			NVME_CMD_EFFECTS_UUID_SEL |
			NVME_CMD_EFFECTS_SCOPE_MASK))
		goto admin;

	/*
	 * Only allow I/O commands that transfer data to the controller or that
	 * change the logical block contents if the file descriptor is open for
	 * writing.
	 */
	if ((nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) &&
	    !open_for_write)
		goto admin;

	return true;
admin:
	return capable(CAP_SYS_ADMIN);
}
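
/*
 * Illustrative sketch (not part of this driver): under the policy above, an
 * unprivileged process that only has read access to a namespace node can
 * still discover the parameters it needs for I/O passthrough, e.g. by
 * issuing Identify Namespace (CNS 00h) through the admin passthrough ioctl,
 * while anything with wider or unknown effects falls through to the
 * CAP_SYS_ADMIN check.  Roughly, from userspace (fd and a 4 KiB buf are
 * assumed to exist):
 *
 *	struct nvme_admin_cmd cmd = {
 *		.opcode   = 0x06,		// Identify
 *		.nsid     = 1,
 *		.cdw10    = 0,			// CNS 00h: Identify Namespace
 *		.addr     = (__u64)(uintptr_t)buf,
 *		.data_len = 4096,
 *	};
 *	int err = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 */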

/*
 * Convert integer values from ioctl structures to user pointers, silently
 * ignoring the upper bits in the compat case to match the behaviour of
 * 32-bit kernels.
 */
static void __user *nvme_to_user_ptr(uintptr_t ptrval)
{
	if (in_compat_syscall())
		ptrval = (compat_uptr_t)ptrval;
	return (void __user *)ptrval;
}
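
/*
 * For example, a 32-bit task that stores 0x12345678 in a 64-bit addr field
 * but leaves stale upper bits (say 0xdead000012345678) ends up with plain
 * 0x12345678 here, which is what a native 32-bit kernel would have seen.
 */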

static struct request *nvme_alloc_user_request(struct request_queue *q,
		struct nvme_command *cmd, blk_opf_t rq_flags,
		blk_mq_req_flags_t blk_flags)
{
	struct request *req;

	req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags);
	if (IS_ERR(req))
		return req;
	nvme_init_request(req, cmd);
	nvme_req(req)->flags |= NVME_REQ_USERCMD;
	return req;
}

static int nvme_map_user_request(struct request *req, u64 ubuffer,
		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
		struct io_uring_cmd *ioucmd, unsigned int flags,
		unsigned int iou_issue_flags)
{
	struct request_queue *q = req->q;
	struct nvme_ns *ns = q->queuedata;
	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
	bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
	bool has_metadata = meta_buffer && meta_len;
	struct bio *bio = NULL;
	int ret;

	if (!nvme_ctrl_sgl_supported(ctrl))
		dev_warn_once(ctrl->device, "using unchecked data buffer\n");
	if (has_metadata) {
		if (!supports_metadata) {
			ret = -EINVAL;
			goto out;
		}
		if (!nvme_ctrl_meta_sgl_supported(ctrl))
			dev_warn_once(ctrl->device,
				      "using unchecked metadata buffer\n");
	}

	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
		struct iov_iter iter;

		/* fixedbufs is only for non-vectored io */
		if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) {
			ret = -EINVAL;
			goto out;
		}
		ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
				rq_data_dir(req), &iter, ioucmd,
				iou_issue_flags);
		if (ret < 0)
			goto out;
		ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL);
	} else {
		ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
				bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0,
				0, rq_data_dir(req));
	}

	if (ret)
		goto out;

	bio = req->bio;
	if (bdev)
		bio_set_dev(bio, bdev);

	if (has_metadata) {
		ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len);
		if (ret)
			goto out_unmap;
	}

	return ret;

out_unmap:
	if (bio)
		blk_rq_unmap_user(bio);
out:
	blk_mq_free_request(req);
	return ret;
}
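
/*
 * Illustrative sketch (not part of this driver): to take the fixed-buffer
 * branch above, userspace registers its buffers with the ring and flags the
 * SQE accordingly.  Roughly, with liburing (ring, sqe, buf and len are
 * assumed to exist):
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *	io_uring_register_buffers(&ring, &iov, 1);
 *	...
 *	sqe->uring_cmd_flags |= IORING_URING_CMD_FIXED;
 *	sqe->buf_index = 0;	// index into the registered buffer table
 */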

static int nvme_submit_user_cmd(struct request_queue *q,
		struct nvme_command *cmd, u64 ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len,
		u64 *result, unsigned timeout, unsigned int flags)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_ctrl *ctrl;
	struct request *req;
	struct bio *bio;
	u32 effects;
	int ret;

	req = nvme_alloc_user_request(q, cmd, 0, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout;
	if (ubuffer && bufflen) {
		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
				meta_len, NULL, flags, 0);
		if (ret)
			return ret;
	}

	bio = req->bio;
	ctrl = nvme_req(req)->ctrl;

	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
	ret = nvme_execute_rq(req, false);
	if (result)
		*result = le64_to_cpu(nvme_req(req)->result.u64);
	if (bio)
		blk_rq_unmap_user(bio);
	blk_mq_free_request(req);

	if (effects)
		nvme_passthru_end(ctrl, ns, effects, cmd, ret);

	return ret;
}

static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	if (io.flags)
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		break;
	default:
		return -EINVAL;
	}

	length = (io.nblocks + 1) << ns->head->lba_shift;

	if ((io.control & NVME_RW_PRINFO_PRACT) &&
	    (ns->head->ms == ns->head->pi_size)) {
		/*
		 * Protection information is stripped/inserted by the
		 * controller.
		 */
		if (nvme_to_user_ptr(io.metadata))
			return -EINVAL;
		meta_len = 0;
		metadata = NULL;
	} else {
		meta_len = (io.nblocks + 1) * ns->head->ms;
		metadata = nvme_to_user_ptr(io.metadata);
	}

	if (ns->head->features & NVME_NS_EXT_LBAS) {
		length += meta_len;
		meta_len = 0;
	} else if (meta_len) {
		if ((io.metadata & 3) || !io.metadata)
			return -EINVAL;
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->head->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.lbat = cpu_to_le16(io.apptag);
	c.rw.lbatm = cpu_to_le16(io.appmask);

	return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
			meta_len, NULL, 0, 0);
}
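
/*
 * Illustrative sketch (not part of this driver): the legacy submit-io ioctl
 * takes a 0's based block count, so reading eight LBAs (here assumed to be
 * 512 bytes each) starting at LBA 0 would look roughly like this in
 * userspace (fd and buf are assumed to exist):
 *
 *	struct nvme_user_io io = {
 *		.opcode  = 0x02,	// nvme_cmd_read
 *		.slba    = 0,
 *		.nblocks = 7,		// 8 blocks, 0's based
 *		.addr    = (__u64)(uintptr_t)buf,	// 8 * 512 bytes
 *	};
 *	int err = ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
 */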

static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
					struct nvme_ns *ns, __u32 nsid)
{
	if (ns && nsid != ns->head->ns_id) {
		dev_err(ctrl->device,
			"%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n",
			current->comm, nsid, ns->head->ns_id);
		return false;
	}

	return true;
}

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct nvme_passthru_cmd __user *ucmd, unsigned int flags,
		bool open_for_write)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	u64 result;
	int status;

	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;
	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
	c.common.cdw15 = cpu_to_le32(cmd.cdw15);

	if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
		return -EACCES;

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
			cmd.metadata_len, &result, timeout, 0);

	if (status >= 0) {
		if (put_user(result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}
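
/*
 * Illustrative sketch (not part of this driver): a typical admin passthrough
 * call through the ioctl above, here reading the 512-byte SMART / Health log
 * (Get Log Page, LID 02h), assuming an open controller or namespace fd:
 *
 *	__u8 log[512];
 *	struct nvme_admin_cmd cmd = {
 *		.opcode   = 0x02,		// Get Log Page
 *		.nsid     = 0xffffffff,		// controller-wide log
 *		.addr     = (__u64)(uintptr_t)log,
 *		.data_len = sizeof(log),
 *		.cdw10    = (127 << 16) | 0x02,	// NUMDL = 127 (128 dwords), LID = 02h
 *	};
 *	int err = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 */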

static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags,
		bool open_for_write)
{
	struct nvme_passthru_cmd64 cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;
	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
	c.common.cdw15 = cpu_to_le32(cmd.cdw15);

	if (!nvme_cmd_allowed(ns, &c, flags, open_for_write))
		return -EACCES;

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
			cmd.metadata_len, &cmd.result, timeout, flags);

	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

struct nvme_uring_data {
	__u64	metadata;
	__u64	addr;
	__u32	data_len;
	__u32	metadata_len;
	__u32	timeout_ms;
};

/*
 * This overlays struct io_uring_cmd pdu.
 * Expect build errors if this grows larger than that.
 */
struct nvme_uring_cmd_pdu {
	struct request *req;
	struct bio *bio;
	u64 result;
	int status;
};

static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu);
}

static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
			       unsigned issue_flags)
{
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);

	if (pdu->bio)
		blk_rq_unmap_user(pdu->bio);
	io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
}

static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
						blk_status_t err)
{
	struct io_uring_cmd *ioucmd = req->end_io_data;
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);

	if (nvme_req(req)->flags & NVME_REQ_CANCELLED) {
		pdu->status = -EINTR;
	} else {
		pdu->status = nvme_req(req)->status;
		if (!pdu->status)
			pdu->status = blk_status_to_errno(err);
	}
	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);

	/*
	 * For iopoll, complete it directly.  Note that using the uring_cmd
	 * helper for this is safe only because we check blk_rq_is_poll():
	 * since that returns false if we are not on a polled queue, it is
	 * safe to use the polled completion helper here.
	 *
	 * Otherwise, move the completion to task work.
	 */
	if (blk_rq_is_poll(req)) {
		if (pdu->bio)
			blk_rq_unmap_user(pdu->bio);
		io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
	} else {
		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
	}

	return RQ_END_IO_FREE;
}

static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
{
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
	const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
	struct nvme_uring_data d;
	struct nvme_command c;
	struct request *req;
	blk_opf_t rq_flags = REQ_ALLOC_CACHE;
	blk_mq_req_flags_t blk_flags = 0;
	int ret;

	c.common.opcode = READ_ONCE(cmd->opcode);
	c.common.flags = READ_ONCE(cmd->flags);
	if (c.common.flags)
		return -EINVAL;

	c.common.command_id = 0;
	c.common.nsid = cpu_to_le32(cmd->nsid);
	if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid)))
		return -EINVAL;

	c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2));
	c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3));
	c.common.metadata = 0;
	c.common.dptr.prp1 = c.common.dptr.prp2 = 0;
	c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10));
	c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11));
	c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12));
	c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13));
	c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
	c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));

	if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
		return -EACCES;

	d.metadata = READ_ONCE(cmd->metadata);
	d.addr = READ_ONCE(cmd->addr);
	d.data_len = READ_ONCE(cmd->data_len);
	d.metadata_len = READ_ONCE(cmd->metadata_len);
	d.timeout_ms = READ_ONCE(cmd->timeout_ms);

	if (issue_flags & IO_URING_F_NONBLOCK) {
		rq_flags |= REQ_NOWAIT;
		blk_flags = BLK_MQ_REQ_NOWAIT;
	}
	if (issue_flags & IO_URING_F_IOPOLL)
		rq_flags |= REQ_POLLED;

	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0;

	if (d.data_len) {
		ret = nvme_map_user_request(req, d.addr,
			d.data_len, nvme_to_user_ptr(d.metadata),
			d.metadata_len, ioucmd, vec, issue_flags);
		if (ret)
			return ret;
	}

	/* Stash the bio to free it on completion; req->bio will be NULL by then. */
	pdu->bio = req->bio;
	pdu->req = req;
	req->end_io_data = ioucmd;
	req->end_io = nvme_uring_cmd_end_io;
	blk_execute_rq_nowait(req, false);
	return -EIOCBQUEUED;
}
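
/*
 * Illustrative sketch (not part of this driver): driving this path from
 * userspace needs a ring created with big SQEs and CQEs; the NVMe
 * passthrough command is then carried in the SQE's command area.  Roughly,
 * with liburing (error handling and the nvme_uring_cmd fields elided, and a
 * real caller would zero the SQE's command area first):
 *
 *	struct io_uring ring;
 *	struct io_uring_params p = {
 *		.flags = IORING_SETUP_SQE128 | IORING_SETUP_CQE32,
 *	};
 *	io_uring_queue_init_params(8, &ring, &p);
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd = ns_chardev_fd;		// e.g. an open /dev/ng0n1
 *	sqe->cmd_op = NVME_URING_CMD_IO;
 *	struct nvme_uring_cmd *c = (struct nvme_uring_cmd *)sqe->cmd;
 *	// fill c->opcode, c->nsid, c->addr, c->data_len, c->cdw10.., ...
 *	io_uring_submit(&ring);
 */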

static bool is_ctrl_ioctl(unsigned int cmd)
{
	if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
		return true;
	if (is_sed_ioctl(cmd))
		return true;
	return false;
}

static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
		void __user *argp, bool open_for_write)
{
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write);
	case NVME_IOCTL_ADMIN64_CMD:
		return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write);
	default:
		return sed_ioctl(ctrl->opal_dev, cmd, argp);
	}
}

#ifdef COMPAT_FOR_U64_ALIGNMENT
struct nvme_user_io32 {
	__u8	opcode;
	__u8	flags;
	__u16	control;
	__u16	nblocks;
	__u16	rsvd;
	__u64	metadata;
	__u64	addr;
	__u64	slba;
	__u32	dsmgmt;
	__u32	reftag;
	__u16	apptag;
	__u16	appmask;
} __attribute__((__packed__));
#define NVME_IOCTL_SUBMIT_IO32	_IOW('N', 0x42, struct nvme_user_io32)
#endif /* COMPAT_FOR_U64_ALIGNMENT */

static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
		void __user *argp, unsigned int flags, bool open_for_write)
{
	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->head->ns_id;
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write);
	/*
	 * struct nvme_user_io can have different padding on some 32-bit ABIs.
	 * Just accept the compat version as all fields that are used are the
	 * same size and at the same offset.
	 */
#ifdef COMPAT_FOR_U64_ALIGNMENT
	case NVME_IOCTL_SUBMIT_IO32:
#endif
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, argp);
	case NVME_IOCTL_IO64_CMD_VEC:
		flags |= NVME_IOCTL_VEC;
		fallthrough;
	case NVME_IOCTL_IO64_CMD:
		return nvme_user_cmd64(ns->ctrl, ns, argp, flags,
				       open_for_write);
	default:
		return -ENOTTY;
	}
}

int nvme_ioctl(struct block_device *bdev, blk_mode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	bool open_for_write = mode & BLK_OPEN_WRITE;
	void __user *argp = (void __user *)arg;
	unsigned int flags = 0;

	if (bdev_is_partition(bdev))
		flags |= NVME_IOCTL_PARTITION;

	if (is_ctrl_ioctl(cmd))
		return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);
	return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write);
}

long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns =
		container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);
	bool open_for_write = file->f_mode & FMODE_WRITE;
	void __user *argp = (void __user *)arg;

	if (is_ctrl_ioctl(cmd))
		return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);
	return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write);
}

static int nvme_uring_cmd_checks(unsigned int issue_flags)
{
	/* NVMe passthrough requires big SQE/CQE support */
	if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
	    (IO_URING_F_SQE128|IO_URING_F_CQE32))
		return -EOPNOTSUPP;
	return 0;
}

static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
			     unsigned int issue_flags)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	int ret;

	ret = nvme_uring_cmd_checks(issue_flags);
	if (ret)
		return ret;

	switch (ioucmd->cmd_op) {
	case NVME_URING_CMD_IO:
		ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false);
		break;
	case NVME_URING_CMD_IO_VEC:
		ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
			struct nvme_ns, cdev);

	return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
}

int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
				 struct io_comp_batch *iob,
				 unsigned int poll_flags)
{
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
	struct request *req = pdu->req;

	if (req && blk_rq_is_poll(req))
		return blk_rq_poll(req, iob, poll_flags);
	return 0;
}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
		void __user *argp, struct nvme_ns_head *head, int srcu_idx,
		bool open_for_write)
	__releases(&head->srcu)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	int ret;

	nvme_get_ctrl(ns->ctrl);
	srcu_read_unlock(&head->srcu, srcu_idx);
	ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);

	nvme_put_ctrl(ctrl);
	return ret;
}

int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns_head *head = bdev->bd_disk->private_data;
	bool open_for_write = mode & BLK_OPEN_WRITE;
	void __user *argp = (void __user *)arg;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;
	unsigned int flags = 0;

	if (bdev_is_partition(bdev))
		flags |= NVME_IOCTL_PARTITION;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (!ns)
		goto out_unlock;

	/*
	 * Handle ioctls that apply to the controller instead of the namespace
	 * separately and drop the ns SRCU reference early.  This avoids a
	 * deadlock when deleting namespaces using the passthrough interface.
	 */
	if (is_ctrl_ioctl(cmd))
		return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
					       open_for_write);

	ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write);
out_unlock:
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	bool open_for_write = file->f_mode & FMODE_WRITE;
	struct cdev *cdev = file_inode(file)->i_cdev;
	struct nvme_ns_head *head =
		container_of(cdev, struct nvme_ns_head, cdev);
	void __user *argp = (void __user *)arg;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (!ns)
		goto out_unlock;

	if (is_ctrl_ioctl(cmd))
		return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
				open_for_write);

	ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write);
out_unlock:
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
		unsigned int issue_flags)
{
	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
	int srcu_idx = srcu_read_lock(&head->srcu);
	struct nvme_ns *ns = nvme_find_path(head);
	int ret = -EINVAL;

	if (ns)
		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#endif /* CONFIG_NVME_MULTIPATH */

int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	struct nvme_ctrl *ctrl = ioucmd->file->private_data;
	int ret;

	/* IOPOLL not supported yet */
	if (issue_flags & IO_URING_F_IOPOLL)
		return -EOPNOTSUPP;

	ret = nvme_uring_cmd_checks(issue_flags);
	if (ret)
		return ret;

	switch (ioucmd->cmd_op) {
	case NVME_URING_CMD_ADMIN:
		ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false);
		break;
	case NVME_URING_CMD_ADMIN_VEC:
		ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
		bool open_for_write)
{
	struct nvme_ns *ns;
	int ret, srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	if (list_empty(&ctrl->namespaces)) {
		ret = -ENOTTY;
		goto out_unlock;
	}

	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
		dev_warn(ctrl->device,
			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	dev_warn(ctrl->device,
		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
	if (!nvme_get_ns(ns)) {
		ret = -ENXIO;
		goto out_unlock;
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);

	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
	nvme_put_ns(ns);
	return ret;

out_unlock:
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return ret;
}

long nvme_dev_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	bool open_for_write = file->f_mode & FMODE_WRITE;
	struct nvme_ctrl *ctrl = file->private_data;
	void __user *argp = (void __user *)arg;

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write);
	case NVME_IOCTL_ADMIN64_CMD:
		return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write);
	case NVME_IOCTL_IO_CMD:
		return nvme_dev_user_cmd(ctrl, argp, open_for_write);
	case NVME_IOCTL_RESET:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		dev_warn(ctrl->device, "resetting controller\n");
		return nvme_reset_ctrl_sync(ctrl);
	case NVME_IOCTL_SUBSYS_RESET:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		return nvme_reset_subsystem(ctrl);
	case NVME_IOCTL_RESCAN:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		nvme_queue_scan(ctrl);
		return 0;
	default:
		return -ENOTTY;
	}
}
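
/*
 * Illustrative sketch (not part of this driver): the reset and rescan ioctls
 * handled above take no argument and are gated on CAP_SYS_ADMIN, e.g.:
 *
 *	int fd = open("/dev/nvme0", O_RDONLY);
 *	ioctl(fd, NVME_IOCTL_RESET);	// fails with EACCES without CAP_SYS_ADMIN
 */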