1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2011-2014, Intel Corporation. 4 * Copyright (c) 2017-2021 Christoph Hellwig. 5 */ 6 #include <linux/ptrace.h> /* for force_successful_syscall_return */ 7 #include <linux/nvme_ioctl.h> 8 #include <linux/io_uring.h> 9 #include "nvme.h" 10 11 enum { 12 NVME_IOCTL_VEC = (1 << 0), 13 NVME_IOCTL_PARTITION = (1 << 1), 14 }; 15 16 static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, 17 unsigned int flags, bool open_for_write) 18 { 19 u32 effects; 20 21 if (capable(CAP_SYS_ADMIN)) 22 return true; 23 24 /* 25 * Do not allow unprivileged passthrough on partitions, as that allows an 26 * escape from the containment of the partition. 27 */ 28 if (flags & NVME_IOCTL_PARTITION) 29 return false; 30 31 /* 32 * Do not allow unprivileged processes to send vendor specific or fabrics 33 * commands as we can't be sure about their effects. 34 */ 35 if (c->common.opcode >= nvme_cmd_vendor_start || 36 c->common.opcode == nvme_fabrics_command) 37 return false; 38 39 /* 40 * Do not allow unprivileged passthrough of admin commands except 41 * for a subset of identify commands that contain information required 42 * to form proper I/O commands in userspace and do not expose any 43 * potentially sensitive information. 44 */ 45 if (!ns) { 46 if (c->common.opcode == nvme_admin_identify) { 47 switch (c->identify.cns) { 48 case NVME_ID_CNS_NS: 49 case NVME_ID_CNS_CS_NS: 50 case NVME_ID_CNS_NS_CS_INDEP: 51 case NVME_ID_CNS_CS_CTRL: 52 case NVME_ID_CNS_CTRL: 53 return true; 54 } 55 } 56 return false; 57 } 58 59 /* 60 * Check if the controller provides a Commands Supported and Effects log 61 * and marks this command as supported. If not reject unprivileged 62 * passthrough. 63 */ 64 effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); 65 if (!(effects & NVME_CMD_EFFECTS_CSUPP)) 66 return false; 67 68 /* 69 * Don't allow passthrough for command that have intrusive (or unknown) 70 * effects. 71 */ 72 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | 73 NVME_CMD_EFFECTS_UUID_SEL | 74 NVME_CMD_EFFECTS_SCOPE_MASK)) 75 return false; 76 77 /* 78 * Only allow I/O commands that transfer data to the controller or that 79 * change the logical block contents if the file descriptor is open for 80 * writing. 81 */ 82 if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) 83 return open_for_write; 84 return true; 85 } 86 87 /* 88 * Convert integer values from ioctl structures to user pointers, silently 89 * ignoring the upper bits in the compat case to match behaviour of 32-bit 90 * kernels. 91 */ 92 static void __user *nvme_to_user_ptr(uintptr_t ptrval) 93 { 94 if (in_compat_syscall()) 95 ptrval = (compat_uptr_t)ptrval; 96 return (void __user *)ptrval; 97 } 98 99 static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, 100 unsigned len, u32 seed) 101 { 102 struct bio_integrity_payload *bip; 103 int ret = -ENOMEM; 104 void *buf; 105 struct bio *bio = req->bio; 106 107 buf = kmalloc(len, GFP_KERNEL); 108 if (!buf) 109 goto out; 110 111 ret = -EFAULT; 112 if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len)) 113 goto out_free_meta; 114 115 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 116 if (IS_ERR(bip)) { 117 ret = PTR_ERR(bip); 118 goto out_free_meta; 119 } 120 121 bip->bip_iter.bi_sector = seed; 122 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 123 offset_in_page(buf)); 124 if (ret != len) { 125 ret = -ENOMEM; 126 goto out_free_meta; 127 } 128 129 req->cmd_flags |= REQ_INTEGRITY; 130 return buf; 131 out_free_meta: 132 kfree(buf); 133 out: 134 return ERR_PTR(ret); 135 } 136 137 static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, 138 void *meta, unsigned len, int ret) 139 { 140 if (!ret && req_op(req) == REQ_OP_DRV_IN && 141 copy_to_user(ubuf, meta, len)) 142 ret = -EFAULT; 143 kfree(meta); 144 return ret; 145 } 146 147 static struct request *nvme_alloc_user_request(struct request_queue *q, 148 struct nvme_command *cmd, blk_opf_t rq_flags, 149 blk_mq_req_flags_t blk_flags) 150 { 151 struct request *req; 152 153 req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); 154 if (IS_ERR(req)) 155 return req; 156 nvme_init_request(req, cmd); 157 nvme_req(req)->flags |= NVME_REQ_USERCMD; 158 return req; 159 } 160 161 static int nvme_map_user_request(struct request *req, u64 ubuffer, 162 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 163 u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, 164 unsigned int flags) 165 { 166 struct request_queue *q = req->q; 167 struct nvme_ns *ns = q->queuedata; 168 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 169 struct bio *bio = NULL; 170 void *meta = NULL; 171 int ret; 172 173 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 174 struct iov_iter iter; 175 176 /* fixedbufs is only for non-vectored io */ 177 if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) 178 return -EINVAL; 179 ret = io_uring_cmd_import_fixed(ubuffer, bufflen, 180 rq_data_dir(req), &iter, ioucmd); 181 if (ret < 0) 182 goto out; 183 ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); 184 } else { 185 ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), 186 bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 187 0, rq_data_dir(req)); 188 } 189 190 if (ret) 191 goto out; 192 bio = req->bio; 193 if (bdev) 194 bio_set_dev(bio, bdev); 195 196 if (bdev && meta_buffer && meta_len) { 197 meta = nvme_add_user_metadata(req, meta_buffer, meta_len, 198 meta_seed); 199 if (IS_ERR(meta)) { 200 ret = PTR_ERR(meta); 201 goto out_unmap; 202 } 203 *metap = meta; 204 } 205 206 return ret; 207 208 out_unmap: 209 if (bio) 210 blk_rq_unmap_user(bio); 211 out: 212 blk_mq_free_request(req); 213 return ret; 214 } 215 216 static int nvme_submit_user_cmd(struct request_queue *q, 217 struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, 218 void __user *meta_buffer, unsigned meta_len, u32 meta_seed, 219 u64 *result, unsigned timeout, unsigned int flags) 220 { 221 struct nvme_ns *ns = q->queuedata; 222 struct nvme_ctrl *ctrl; 223 struct request *req; 224 void *meta = NULL; 225 struct bio *bio; 226 u32 effects; 227 int ret; 228 229 req = nvme_alloc_user_request(q, cmd, 0, 0); 230 if (IS_ERR(req)) 231 return PTR_ERR(req); 232 233 req->timeout = timeout; 234 if (ubuffer && bufflen) { 235 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 236 meta_len, meta_seed, &meta, NULL, flags); 237 if (ret) 238 return ret; 239 } 240 241 bio = req->bio; 242 ctrl = nvme_req(req)->ctrl; 243 244 effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 245 ret = nvme_execute_rq(req, false); 246 if (result) 247 *result = le64_to_cpu(nvme_req(req)->result.u64); 248 if (meta) 249 ret = nvme_finish_user_metadata(req, meta_buffer, meta, 250 meta_len, ret); 251 if (bio) 252 blk_rq_unmap_user(bio); 253 blk_mq_free_request(req); 254 255 if (effects) 256 nvme_passthru_end(ctrl, ns, effects, cmd, ret); 257 258 return ret; 259 } 260 261 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 262 { 263 struct nvme_user_io io; 264 struct nvme_command c; 265 unsigned length, meta_len; 266 void __user *metadata; 267 268 if (copy_from_user(&io, uio, sizeof(io))) 269 return -EFAULT; 270 if (io.flags) 271 return -EINVAL; 272 273 switch (io.opcode) { 274 case nvme_cmd_write: 275 case nvme_cmd_read: 276 case nvme_cmd_compare: 277 break; 278 default: 279 return -EINVAL; 280 } 281 282 length = (io.nblocks + 1) << ns->lba_shift; 283 284 if ((io.control & NVME_RW_PRINFO_PRACT) && 285 ns->ms == sizeof(struct t10_pi_tuple)) { 286 /* 287 * Protection information is stripped/inserted by the 288 * controller. 289 */ 290 if (nvme_to_user_ptr(io.metadata)) 291 return -EINVAL; 292 meta_len = 0; 293 metadata = NULL; 294 } else { 295 meta_len = (io.nblocks + 1) * ns->ms; 296 metadata = nvme_to_user_ptr(io.metadata); 297 } 298 299 if (ns->features & NVME_NS_EXT_LBAS) { 300 length += meta_len; 301 meta_len = 0; 302 } else if (meta_len) { 303 if ((io.metadata & 3) || !io.metadata) 304 return -EINVAL; 305 } 306 307 memset(&c, 0, sizeof(c)); 308 c.rw.opcode = io.opcode; 309 c.rw.flags = io.flags; 310 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 311 c.rw.slba = cpu_to_le64(io.slba); 312 c.rw.length = cpu_to_le16(io.nblocks); 313 c.rw.control = cpu_to_le16(io.control); 314 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 315 c.rw.reftag = cpu_to_le32(io.reftag); 316 c.rw.apptag = cpu_to_le16(io.apptag); 317 c.rw.appmask = cpu_to_le16(io.appmask); 318 319 return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, 320 meta_len, lower_32_bits(io.slba), NULL, 0, 0); 321 } 322 323 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, 324 struct nvme_ns *ns, __u32 nsid) 325 { 326 if (ns && nsid != ns->head->ns_id) { 327 dev_err(ctrl->device, 328 "%s: nsid (%u) in cmd does not match nsid (%u)" 329 "of namespace\n", 330 current->comm, nsid, ns->head->ns_id); 331 return false; 332 } 333 334 return true; 335 } 336 337 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 338 struct nvme_passthru_cmd __user *ucmd, unsigned int flags, 339 bool open_for_write) 340 { 341 struct nvme_passthru_cmd cmd; 342 struct nvme_command c; 343 unsigned timeout = 0; 344 u64 result; 345 int status; 346 347 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 348 return -EFAULT; 349 if (cmd.flags) 350 return -EINVAL; 351 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 352 return -EINVAL; 353 354 memset(&c, 0, sizeof(c)); 355 c.common.opcode = cmd.opcode; 356 c.common.flags = cmd.flags; 357 c.common.nsid = cpu_to_le32(cmd.nsid); 358 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 359 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 360 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 361 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 362 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 363 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 364 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 365 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 366 367 if (!nvme_cmd_allowed(ns, &c, 0, open_for_write)) 368 return -EACCES; 369 370 if (cmd.timeout_ms) 371 timeout = msecs_to_jiffies(cmd.timeout_ms); 372 373 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 374 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 375 cmd.metadata_len, 0, &result, timeout, 0); 376 377 if (status >= 0) { 378 if (put_user(result, &ucmd->result)) 379 return -EFAULT; 380 } 381 382 return status; 383 } 384 385 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 386 struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags, 387 bool open_for_write) 388 { 389 struct nvme_passthru_cmd64 cmd; 390 struct nvme_command c; 391 unsigned timeout = 0; 392 int status; 393 394 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 395 return -EFAULT; 396 if (cmd.flags) 397 return -EINVAL; 398 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 399 return -EINVAL; 400 401 memset(&c, 0, sizeof(c)); 402 c.common.opcode = cmd.opcode; 403 c.common.flags = cmd.flags; 404 c.common.nsid = cpu_to_le32(cmd.nsid); 405 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 406 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 407 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 408 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 409 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 410 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 411 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 412 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 413 414 if (!nvme_cmd_allowed(ns, &c, flags, open_for_write)) 415 return -EACCES; 416 417 if (cmd.timeout_ms) 418 timeout = msecs_to_jiffies(cmd.timeout_ms); 419 420 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 421 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 422 cmd.metadata_len, 0, &cmd.result, timeout, flags); 423 424 if (status >= 0) { 425 if (put_user(cmd.result, &ucmd->result)) 426 return -EFAULT; 427 } 428 429 return status; 430 } 431 432 struct nvme_uring_data { 433 __u64 metadata; 434 __u64 addr; 435 __u32 data_len; 436 __u32 metadata_len; 437 __u32 timeout_ms; 438 }; 439 440 /* 441 * This overlays struct io_uring_cmd pdu. 442 * Expect build errors if this grows larger than that. 443 */ 444 struct nvme_uring_cmd_pdu { 445 union { 446 struct bio *bio; 447 struct request *req; 448 }; 449 u32 meta_len; 450 u32 nvme_status; 451 union { 452 struct { 453 void *meta; /* kernel-resident buffer */ 454 void __user *meta_buffer; 455 }; 456 u64 result; 457 } u; 458 }; 459 460 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( 461 struct io_uring_cmd *ioucmd) 462 { 463 return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; 464 } 465 466 static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd, 467 unsigned issue_flags) 468 { 469 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 470 struct request *req = pdu->req; 471 int status; 472 u64 result; 473 474 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 475 status = -EINTR; 476 else 477 status = nvme_req(req)->status; 478 479 result = le64_to_cpu(nvme_req(req)->result.u64); 480 481 if (pdu->meta_len) 482 status = nvme_finish_user_metadata(req, pdu->u.meta_buffer, 483 pdu->u.meta, pdu->meta_len, status); 484 if (req->bio) 485 blk_rq_unmap_user(req->bio); 486 blk_mq_free_request(req); 487 488 io_uring_cmd_done(ioucmd, status, result, issue_flags); 489 } 490 491 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, 492 unsigned issue_flags) 493 { 494 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 495 496 if (pdu->bio) 497 blk_rq_unmap_user(pdu->bio); 498 499 io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags); 500 } 501 502 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, 503 blk_status_t err) 504 { 505 struct io_uring_cmd *ioucmd = req->end_io_data; 506 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 507 508 req->bio = pdu->bio; 509 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 510 pdu->nvme_status = -EINTR; 511 else 512 pdu->nvme_status = nvme_req(req)->status; 513 pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64); 514 515 /* 516 * For iopoll, complete it directly. 517 * Otherwise, move the completion to task work. 518 */ 519 if (blk_rq_is_poll(req)) { 520 WRITE_ONCE(ioucmd->cookie, NULL); 521 nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); 522 } else { 523 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); 524 } 525 526 return RQ_END_IO_FREE; 527 } 528 529 static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, 530 blk_status_t err) 531 { 532 struct io_uring_cmd *ioucmd = req->end_io_data; 533 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 534 535 req->bio = pdu->bio; 536 pdu->req = req; 537 538 /* 539 * For iopoll, complete it directly. 540 * Otherwise, move the completion to task work. 541 */ 542 if (blk_rq_is_poll(req)) { 543 WRITE_ONCE(ioucmd->cookie, NULL); 544 nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); 545 } else { 546 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); 547 } 548 549 return RQ_END_IO_NONE; 550 } 551 552 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 553 struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) 554 { 555 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 556 const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); 557 struct request_queue *q = ns ? ns->queue : ctrl->admin_q; 558 struct nvme_uring_data d; 559 struct nvme_command c; 560 struct request *req; 561 blk_opf_t rq_flags = REQ_ALLOC_CACHE; 562 blk_mq_req_flags_t blk_flags = 0; 563 void *meta = NULL; 564 int ret; 565 566 c.common.opcode = READ_ONCE(cmd->opcode); 567 c.common.flags = READ_ONCE(cmd->flags); 568 if (c.common.flags) 569 return -EINVAL; 570 571 c.common.command_id = 0; 572 c.common.nsid = cpu_to_le32(cmd->nsid); 573 if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) 574 return -EINVAL; 575 576 c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); 577 c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); 578 c.common.metadata = 0; 579 c.common.dptr.prp1 = c.common.dptr.prp2 = 0; 580 c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); 581 c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); 582 c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); 583 c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); 584 c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); 585 c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); 586 587 if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE)) 588 return -EACCES; 589 590 d.metadata = READ_ONCE(cmd->metadata); 591 d.addr = READ_ONCE(cmd->addr); 592 d.data_len = READ_ONCE(cmd->data_len); 593 d.metadata_len = READ_ONCE(cmd->metadata_len); 594 d.timeout_ms = READ_ONCE(cmd->timeout_ms); 595 596 if (issue_flags & IO_URING_F_NONBLOCK) { 597 rq_flags |= REQ_NOWAIT; 598 blk_flags = BLK_MQ_REQ_NOWAIT; 599 } 600 if (issue_flags & IO_URING_F_IOPOLL) 601 rq_flags |= REQ_POLLED; 602 603 req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); 604 if (IS_ERR(req)) 605 return PTR_ERR(req); 606 req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; 607 608 if (d.addr && d.data_len) { 609 ret = nvme_map_user_request(req, d.addr, 610 d.data_len, nvme_to_user_ptr(d.metadata), 611 d.metadata_len, 0, &meta, ioucmd, vec); 612 if (ret) 613 return ret; 614 } 615 616 if (blk_rq_is_poll(req)) { 617 ioucmd->flags |= IORING_URING_CMD_POLLED; 618 WRITE_ONCE(ioucmd->cookie, req); 619 } 620 621 /* to free bio on completion, as req->bio will be null at that time */ 622 pdu->bio = req->bio; 623 pdu->meta_len = d.metadata_len; 624 req->end_io_data = ioucmd; 625 if (pdu->meta_len) { 626 pdu->u.meta = meta; 627 pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata); 628 req->end_io = nvme_uring_cmd_end_io_meta; 629 } else { 630 req->end_io = nvme_uring_cmd_end_io; 631 } 632 blk_execute_rq_nowait(req, false); 633 return -EIOCBQUEUED; 634 } 635 636 static bool is_ctrl_ioctl(unsigned int cmd) 637 { 638 if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) 639 return true; 640 if (is_sed_ioctl(cmd)) 641 return true; 642 return false; 643 } 644 645 static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, 646 void __user *argp, bool open_for_write) 647 { 648 switch (cmd) { 649 case NVME_IOCTL_ADMIN_CMD: 650 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 651 case NVME_IOCTL_ADMIN64_CMD: 652 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 653 default: 654 return sed_ioctl(ctrl->opal_dev, cmd, argp); 655 } 656 } 657 658 #ifdef COMPAT_FOR_U64_ALIGNMENT 659 struct nvme_user_io32 { 660 __u8 opcode; 661 __u8 flags; 662 __u16 control; 663 __u16 nblocks; 664 __u16 rsvd; 665 __u64 metadata; 666 __u64 addr; 667 __u64 slba; 668 __u32 dsmgmt; 669 __u32 reftag; 670 __u16 apptag; 671 __u16 appmask; 672 } __attribute__((__packed__)); 673 #define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) 674 #endif /* COMPAT_FOR_U64_ALIGNMENT */ 675 676 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, 677 void __user *argp, unsigned int flags, bool open_for_write) 678 { 679 switch (cmd) { 680 case NVME_IOCTL_ID: 681 force_successful_syscall_return(); 682 return ns->head->ns_id; 683 case NVME_IOCTL_IO_CMD: 684 return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write); 685 /* 686 * struct nvme_user_io can have different padding on some 32-bit ABIs. 687 * Just accept the compat version as all fields that are used are the 688 * same size and at the same offset. 689 */ 690 #ifdef COMPAT_FOR_U64_ALIGNMENT 691 case NVME_IOCTL_SUBMIT_IO32: 692 #endif 693 case NVME_IOCTL_SUBMIT_IO: 694 return nvme_submit_io(ns, argp); 695 case NVME_IOCTL_IO64_CMD_VEC: 696 flags |= NVME_IOCTL_VEC; 697 fallthrough; 698 case NVME_IOCTL_IO64_CMD: 699 return nvme_user_cmd64(ns->ctrl, ns, argp, flags, 700 open_for_write); 701 default: 702 return -ENOTTY; 703 } 704 } 705 706 int nvme_ioctl(struct block_device *bdev, blk_mode_t mode, 707 unsigned int cmd, unsigned long arg) 708 { 709 struct nvme_ns *ns = bdev->bd_disk->private_data; 710 bool open_for_write = mode & BLK_OPEN_WRITE; 711 void __user *argp = (void __user *)arg; 712 unsigned int flags = 0; 713 714 if (bdev_is_partition(bdev)) 715 flags |= NVME_IOCTL_PARTITION; 716 717 if (is_ctrl_ioctl(cmd)) 718 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 719 return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 720 } 721 722 long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 723 { 724 struct nvme_ns *ns = 725 container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); 726 bool open_for_write = file->f_mode & FMODE_WRITE; 727 void __user *argp = (void __user *)arg; 728 729 if (is_ctrl_ioctl(cmd)) 730 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 731 return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 732 } 733 734 static int nvme_uring_cmd_checks(unsigned int issue_flags) 735 { 736 737 /* NVMe passthrough requires big SQE/CQE support */ 738 if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != 739 (IO_URING_F_SQE128|IO_URING_F_CQE32)) 740 return -EOPNOTSUPP; 741 return 0; 742 } 743 744 static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, 745 unsigned int issue_flags) 746 { 747 struct nvme_ctrl *ctrl = ns->ctrl; 748 int ret; 749 750 BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); 751 752 ret = nvme_uring_cmd_checks(issue_flags); 753 if (ret) 754 return ret; 755 756 switch (ioucmd->cmd_op) { 757 case NVME_URING_CMD_IO: 758 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); 759 break; 760 case NVME_URING_CMD_IO_VEC: 761 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); 762 break; 763 default: 764 ret = -ENOTTY; 765 } 766 767 return ret; 768 } 769 770 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 771 { 772 struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, 773 struct nvme_ns, cdev); 774 775 return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 776 } 777 778 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 779 struct io_comp_batch *iob, 780 unsigned int poll_flags) 781 { 782 struct request *req; 783 int ret = 0; 784 785 if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) 786 return 0; 787 788 req = READ_ONCE(ioucmd->cookie); 789 if (req && blk_rq_is_poll(req)) 790 ret = blk_rq_poll(req, iob, poll_flags); 791 return ret; 792 } 793 #ifdef CONFIG_NVME_MULTIPATH 794 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 795 void __user *argp, struct nvme_ns_head *head, int srcu_idx, 796 bool open_for_write) 797 __releases(&head->srcu) 798 { 799 struct nvme_ctrl *ctrl = ns->ctrl; 800 int ret; 801 802 nvme_get_ctrl(ns->ctrl); 803 srcu_read_unlock(&head->srcu, srcu_idx); 804 ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 805 806 nvme_put_ctrl(ctrl); 807 return ret; 808 } 809 810 int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, 811 unsigned int cmd, unsigned long arg) 812 { 813 struct nvme_ns_head *head = bdev->bd_disk->private_data; 814 bool open_for_write = mode & BLK_OPEN_WRITE; 815 void __user *argp = (void __user *)arg; 816 struct nvme_ns *ns; 817 int srcu_idx, ret = -EWOULDBLOCK; 818 unsigned int flags = 0; 819 820 if (bdev_is_partition(bdev)) 821 flags |= NVME_IOCTL_PARTITION; 822 823 srcu_idx = srcu_read_lock(&head->srcu); 824 ns = nvme_find_path(head); 825 if (!ns) 826 goto out_unlock; 827 828 /* 829 * Handle ioctls that apply to the controller instead of the namespace 830 * seperately and drop the ns SRCU reference early. This avoids a 831 * deadlock when deleting namespaces using the passthrough interface. 832 */ 833 if (is_ctrl_ioctl(cmd)) 834 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 835 open_for_write); 836 837 ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 838 out_unlock: 839 srcu_read_unlock(&head->srcu, srcu_idx); 840 return ret; 841 } 842 843 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, 844 unsigned long arg) 845 { 846 bool open_for_write = file->f_mode & FMODE_WRITE; 847 struct cdev *cdev = file_inode(file)->i_cdev; 848 struct nvme_ns_head *head = 849 container_of(cdev, struct nvme_ns_head, cdev); 850 void __user *argp = (void __user *)arg; 851 struct nvme_ns *ns; 852 int srcu_idx, ret = -EWOULDBLOCK; 853 854 srcu_idx = srcu_read_lock(&head->srcu); 855 ns = nvme_find_path(head); 856 if (!ns) 857 goto out_unlock; 858 859 if (is_ctrl_ioctl(cmd)) 860 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 861 open_for_write); 862 863 ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 864 out_unlock: 865 srcu_read_unlock(&head->srcu, srcu_idx); 866 return ret; 867 } 868 869 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, 870 unsigned int issue_flags) 871 { 872 struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; 873 struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); 874 int srcu_idx = srcu_read_lock(&head->srcu); 875 struct nvme_ns *ns = nvme_find_path(head); 876 int ret = -EINVAL; 877 878 if (ns) 879 ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 880 srcu_read_unlock(&head->srcu, srcu_idx); 881 return ret; 882 } 883 #endif /* CONFIG_NVME_MULTIPATH */ 884 885 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 886 { 887 struct nvme_ctrl *ctrl = ioucmd->file->private_data; 888 int ret; 889 890 /* IOPOLL not supported yet */ 891 if (issue_flags & IO_URING_F_IOPOLL) 892 return -EOPNOTSUPP; 893 894 ret = nvme_uring_cmd_checks(issue_flags); 895 if (ret) 896 return ret; 897 898 switch (ioucmd->cmd_op) { 899 case NVME_URING_CMD_ADMIN: 900 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); 901 break; 902 case NVME_URING_CMD_ADMIN_VEC: 903 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); 904 break; 905 default: 906 ret = -ENOTTY; 907 } 908 909 return ret; 910 } 911 912 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, 913 bool open_for_write) 914 { 915 struct nvme_ns *ns; 916 int ret; 917 918 down_read(&ctrl->namespaces_rwsem); 919 if (list_empty(&ctrl->namespaces)) { 920 ret = -ENOTTY; 921 goto out_unlock; 922 } 923 924 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); 925 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 926 dev_warn(ctrl->device, 927 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 928 ret = -EINVAL; 929 goto out_unlock; 930 } 931 932 dev_warn(ctrl->device, 933 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 934 kref_get(&ns->kref); 935 up_read(&ctrl->namespaces_rwsem); 936 937 ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); 938 nvme_put_ns(ns); 939 return ret; 940 941 out_unlock: 942 up_read(&ctrl->namespaces_rwsem); 943 return ret; 944 } 945 946 long nvme_dev_ioctl(struct file *file, unsigned int cmd, 947 unsigned long arg) 948 { 949 bool open_for_write = file->f_mode & FMODE_WRITE; 950 struct nvme_ctrl *ctrl = file->private_data; 951 void __user *argp = (void __user *)arg; 952 953 switch (cmd) { 954 case NVME_IOCTL_ADMIN_CMD: 955 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 956 case NVME_IOCTL_ADMIN64_CMD: 957 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 958 case NVME_IOCTL_IO_CMD: 959 return nvme_dev_user_cmd(ctrl, argp, open_for_write); 960 case NVME_IOCTL_RESET: 961 if (!capable(CAP_SYS_ADMIN)) 962 return -EACCES; 963 dev_warn(ctrl->device, "resetting controller\n"); 964 return nvme_reset_ctrl_sync(ctrl); 965 case NVME_IOCTL_SUBSYS_RESET: 966 if (!capable(CAP_SYS_ADMIN)) 967 return -EACCES; 968 return nvme_reset_subsystem(ctrl); 969 case NVME_IOCTL_RESCAN: 970 if (!capable(CAP_SYS_ADMIN)) 971 return -EACCES; 972 nvme_queue_scan(ctrl); 973 return 0; 974 default: 975 return -ENOTTY; 976 } 977 } 978