// SPDX-License-Identifier: GPL-2.0
/*
 * Common code for the NVMe target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/pci-p2pdma.h>
#include <linux/scatterlist.h>

#include <generated/utsrelease.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#include "nvmet.h"
#include "debugfs.h"

struct kmem_cache *nvmet_bvec_cache;
struct workqueue_struct *buffered_io_wq;
struct workqueue_struct *zbd_wq;
static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
static DEFINE_IDA(cntlid_ida);

struct workqueue_struct *nvmet_wq;
EXPORT_SYMBOL_GPL(nvmet_wq);

/*
 * This read/write semaphore is used to synchronize access to configuration
 * information on a target system that will result in discovery log page
 * information change for at least one host.
 * The full list of resources protected by this semaphore is:
 *
 *  - subsystems list
 *  - per-subsystem allowed hosts list
 *  - allow_any_host subsystem attribute
 *  - nvmet_genctr
 *  - the nvmet_transports array
 *
 * When updating any of those lists/structures write lock should be obtained,
 * while when reading (populating discovery log page or checking host-subsystem
 * link) read lock is obtained to allow concurrent reads.
 */
DECLARE_RWSEM(nvmet_config_sem);

u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
u64 nvmet_ana_chgcnt;
DECLARE_RWSEM(nvmet_ana_sem);

inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
{
	switch (errno) {
	case 0:
		return NVME_SC_SUCCESS;
	case -ENOSPC:
		req->error_loc = offsetof(struct nvme_rw_command, length);
		return NVME_SC_CAP_EXCEEDED | NVME_STATUS_DNR;
	case -EREMOTEIO:
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		return NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
	case -EOPNOTSUPP:
		req->error_loc = offsetof(struct nvme_common_command, opcode);
		switch (req->cmd->common.opcode) {
		case nvme_cmd_dsm:
		case nvme_cmd_write_zeroes:
			return NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR;
		default:
			return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
		}
		break;
	case -ENODATA:
		req->error_loc = offsetof(struct nvme_rw_command, nsid);
		return NVME_SC_ACCESS_DENIED;
	case -EIO:
		fallthrough;
	default:
		req->error_loc = offsetof(struct nvme_common_command, opcode);
		return NVME_SC_INTERNAL | NVME_STATUS_DNR;
	}
}

u16 nvmet_report_invalid_opcode(struct nvmet_req *req)
{
	pr_debug("unhandled cmd %d on qid %d\n", req->cmd->common.opcode,
		 req->sq->qid);

	req->error_loc = offsetof(struct nvme_common_command, opcode);
	return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
}

static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
		const char *subsysnqn);

u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
		size_t len)
{
	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
		req->error_loc = offsetof(struct nvme_common_command, dptr);
		return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
	}
	return 0;
}

u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
{
	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
		req->error_loc = offsetof(struct nvme_common_command, dptr);
		return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
	}
	return 0;
}

u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
{
	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
		req->error_loc = offsetof(struct nvme_common_command, dptr);
		return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
	}
	return 0;
}

static u32 nvmet_max_nsid(struct nvmet_subsys *subsys)
{
	struct nvmet_ns *cur;
	unsigned long idx;
	u32 nsid = 0;

	xa_for_each(&subsys->namespaces, idx, cur)
		nsid = cur->nsid;

	return nsid;
}

static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
{
	return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
}

static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
{
	struct nvmet_req *req;

	mutex_lock(&ctrl->lock);
	while (ctrl->nr_async_event_cmds) {
		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
		mutex_unlock(&ctrl->lock);
		nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_STATUS_DNR);
		mutex_lock(&ctrl->lock);
	}
	mutex_unlock(&ctrl->lock);
}

static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
{
	struct nvmet_async_event *aen;
	struct nvmet_req *req;

	mutex_lock(&ctrl->lock);
	while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) {
		aen = list_first_entry(&ctrl->async_events,
				       struct nvmet_async_event, entry);
		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
		nvmet_set_result(req, nvmet_async_event_result(aen));

		list_del(&aen->entry);
		kfree(aen);

		mutex_unlock(&ctrl->lock);
		trace_nvmet_async_event(ctrl, req->cqe->result.u32);
		nvmet_req_complete(req, 0);
		mutex_lock(&ctrl->lock);
	}
	mutex_unlock(&ctrl->lock);
}

static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
{
	struct nvmet_async_event *aen, *tmp;

	mutex_lock(&ctrl->lock);
	list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) {
		list_del(&aen->entry);
		kfree(aen);
	}
	mutex_unlock(&ctrl->lock);
}

static void nvmet_async_event_work(struct work_struct *work)
{
	struct nvmet_ctrl *ctrl =
		container_of(work, struct nvmet_ctrl, async_event_work);

	nvmet_async_events_process(ctrl);
}

void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
		u8 event_info, u8 log_page)
{
	struct nvmet_async_event *aen;

	aen = kmalloc(sizeof(*aen), GFP_KERNEL);
	if (!aen)
		return;

	aen->event_type = event_type;
	aen->event_info = event_info;
	aen->log_page = log_page;

	mutex_lock(&ctrl->lock);
	list_add_tail(&aen->entry, &ctrl->async_events);
	mutex_unlock(&ctrl->lock);

	queue_work(nvmet_wq, &ctrl->async_event_work);
}

static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
{
	u32 i;

	mutex_lock(&ctrl->lock);
	if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
		goto out_unlock;

	for (i = 0; i < ctrl->nr_changed_ns; i++) {
		if (ctrl->changed_ns_list[i] == nsid)
			goto out_unlock;
	}

	if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
		ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
		ctrl->nr_changed_ns = U32_MAX;
		goto out_unlock;
	}

	ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
out_unlock:
	mutex_unlock(&ctrl->lock);
}
void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
{
	struct nvmet_ctrl *ctrl;

	lockdep_assert_held(&subsys->lock);

	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
			continue;
		nvmet_add_async_event(ctrl, NVME_AER_NOTICE,
				NVME_AER_NOTICE_NS_CHANGED,
				NVME_LOG_CHANGED_NS);
	}
}

void nvmet_send_ana_event(struct nvmet_subsys *subsys,
		struct nvmet_port *port)
{
	struct nvmet_ctrl *ctrl;

	mutex_lock(&subsys->lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
		if (port && ctrl->port != port)
			continue;
		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
			continue;
		nvmet_add_async_event(ctrl, NVME_AER_NOTICE,
				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
	}
	mutex_unlock(&subsys->lock);
}

void nvmet_port_send_ana_event(struct nvmet_port *port)
{
	struct nvmet_subsys_link *p;

	down_read(&nvmet_config_sem);
	list_for_each_entry(p, &port->subsystems, entry)
		nvmet_send_ana_event(p->subsys, port);
	up_read(&nvmet_config_sem);
}

int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
{
	int ret = 0;

	down_write(&nvmet_config_sem);
	if (nvmet_transports[ops->type])
		ret = -EINVAL;
	else
		nvmet_transports[ops->type] = ops;
	up_write(&nvmet_config_sem);

	return ret;
}
EXPORT_SYMBOL_GPL(nvmet_register_transport);

void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
{
	down_write(&nvmet_config_sem);
	nvmet_transports[ops->type] = NULL;
	up_write(&nvmet_config_sem);
}
EXPORT_SYMBOL_GPL(nvmet_unregister_transport);

void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
{
	struct nvmet_ctrl *ctrl;

	mutex_lock(&subsys->lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
		if (ctrl->port == port)
			ctrl->ops->delete_ctrl(ctrl);
	}
	mutex_unlock(&subsys->lock);
}

int nvmet_enable_port(struct nvmet_port *port)
{
	const struct nvmet_fabrics_ops *ops;
	int ret;

	lockdep_assert_held(&nvmet_config_sem);

	ops = nvmet_transports[port->disc_addr.trtype];
	if (!ops) {
		up_write(&nvmet_config_sem);
		request_module("nvmet-transport-%d", port->disc_addr.trtype);
		down_write(&nvmet_config_sem);
		ops = nvmet_transports[port->disc_addr.trtype];
		if (!ops) {
			pr_err("transport type %d not supported\n",
				port->disc_addr.trtype);
			return -EINVAL;
		}
	}

	if (!try_module_get(ops->owner))
		return -EINVAL;

	/*
	 * If the user requested PI support and the transport isn't pi capable,
	 * don't enable the port.
	 */
	if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
		pr_err("T10-PI is not supported by transport type %d\n",
		       port->disc_addr.trtype);
		ret = -EINVAL;
		goto out_put;
	}

	ret = ops->add_port(port);
	if (ret)
		goto out_put;

	/* If the transport didn't set inline_data_size, then disable it. */
	if (port->inline_data_size < 0)
		port->inline_data_size = 0;

	/*
	 * If the transport didn't set the max_queue_size properly, then clamp
	 * it to the target limits. Also set default values in case the
	 * transport didn't set it at all.
	 */
	if (port->max_queue_size < 0)
		port->max_queue_size = NVMET_MAX_QUEUE_SIZE;
	else
		port->max_queue_size = clamp_t(int, port->max_queue_size,
					       NVMET_MIN_QUEUE_SIZE,
					       NVMET_MAX_QUEUE_SIZE);

	port->enabled = true;
	port->tr_ops = ops;
	return 0;

out_put:
	module_put(ops->owner);
	return ret;
}

void nvmet_disable_port(struct nvmet_port *port)
{
	const struct nvmet_fabrics_ops *ops;

	lockdep_assert_held(&nvmet_config_sem);

	port->enabled = false;
	port->tr_ops = NULL;

	ops = nvmet_transports[port->disc_addr.trtype];
	ops->remove_port(port);
	module_put(ops->owner);
}

static void nvmet_keep_alive_timer(struct work_struct *work)
{
	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
			struct nvmet_ctrl, ka_work);
	bool reset_tbkas = ctrl->reset_tbkas;

	ctrl->reset_tbkas = false;
	if (reset_tbkas) {
		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
			ctrl->cntlid);
		queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
		return;
	}

	pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
		ctrl->cntlid, ctrl->kato);

	nvmet_ctrl_fatal_error(ctrl);
}

void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
		ctrl->cntlid, ctrl->kato);

	queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
}

void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);

	cancel_delayed_work_sync(&ctrl->ka_work);
}

u16 nvmet_req_find_ns(struct nvmet_req *req)
{
	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
	struct nvmet_subsys *subsys = nvmet_req_subsys(req);

	req->ns = xa_load(&subsys->namespaces, nsid);
	if (unlikely(!req->ns)) {
		req->error_loc = offsetof(struct nvme_common_command, nsid);
		if (nvmet_subsys_nsid_exists(subsys, nsid))
			return NVME_SC_INTERNAL_PATH_ERROR;
		return NVME_SC_INVALID_NS | NVME_STATUS_DNR;
	}

	percpu_ref_get(&req->ns->ref);
	return NVME_SC_SUCCESS;
}

static void nvmet_destroy_namespace(struct percpu_ref *ref)
{
	struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);

	complete(&ns->disable_done);
}

void nvmet_put_namespace(struct nvmet_ns *ns)
{
	percpu_ref_put(&ns->ref);
}

static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
{
	nvmet_bdev_ns_disable(ns);
	nvmet_file_ns_disable(ns);
}

static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
{
	int ret;
	struct pci_dev *p2p_dev;

	if (!ns->use_p2pmem)
		return 0;

	if (!ns->bdev) {
		pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
		return -EINVAL;
	}

	if (!blk_queue_pci_p2pdma(ns->bdev->bd_disk->queue)) {
		pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
		       ns->device_path);
		return -EINVAL;
	}

	if (ns->p2p_dev) {
		ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
		if (ret < 0)
			return -EINVAL;
	} else {
		/*
		 * Right now we just check that there is p2pmem available so
		 * we can report an error to the user right away if there
		 * is not.
		 * We'll find the actual device to use once we setup the
		 * controller when the port's device is available.
		 */

		p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
		if (!p2p_dev) {
			pr_err("no peer-to-peer memory is available for %s\n",
			       ns->device_path);
			return -EINVAL;
		}

		pci_dev_put(p2p_dev);
	}

	return 0;
}

/*
 * Note: ctrl->subsys->lock should be held when calling this function
 */
static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
				    struct nvmet_ns *ns)
{
	struct device *clients[2];
	struct pci_dev *p2p_dev;
	int ret;

	if (!ctrl->p2p_client || !ns->use_p2pmem)
		return;

	if (ns->p2p_dev) {
		ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
		if (ret < 0)
			return;

		p2p_dev = pci_dev_get(ns->p2p_dev);
	} else {
		clients[0] = ctrl->p2p_client;
		clients[1] = nvmet_ns_dev(ns);

		p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
		if (!p2p_dev) {
			pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
			       dev_name(ctrl->p2p_client), ns->device_path);
			return;
		}
	}

	ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
	if (ret < 0)
		pci_dev_put(p2p_dev);

	pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
		ns->nsid);
}

bool nvmet_ns_revalidate(struct nvmet_ns *ns)
{
	loff_t oldsize = ns->size;

	if (ns->bdev)
		nvmet_bdev_ns_revalidate(ns);
	else
		nvmet_file_ns_revalidate(ns);

	return oldsize != ns->size;
}

int nvmet_ns_enable(struct nvmet_ns *ns)
{
	struct nvmet_subsys *subsys = ns->subsys;
	struct nvmet_ctrl *ctrl;
	int ret;

	mutex_lock(&subsys->lock);
	ret = 0;

	if (nvmet_is_passthru_subsys(subsys)) {
		pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
		goto out_unlock;
	}

	if (ns->enabled)
		goto out_unlock;

	ret = -EMFILE;
	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
		goto out_unlock;

	ret = nvmet_bdev_ns_enable(ns);
	if (ret == -ENOTBLK)
		ret = nvmet_file_ns_enable(ns);
	if (ret)
		goto out_unlock;

	ret = nvmet_p2pmem_ns_enable(ns);
	if (ret)
		goto out_dev_disable;

	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		nvmet_p2pmem_ns_add_p2p(ctrl, ns);

	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
			      0, GFP_KERNEL);
	if (ret)
		goto out_dev_put;

	if (ns->nsid > subsys->max_nsid)
		subsys->max_nsid = ns->nsid;

	ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
	if (ret)
		goto out_restore_subsys_maxnsid;

	subsys->nr_namespaces++;

	nvmet_ns_changed(subsys, ns->nsid);
	ns->enabled = true;
	ret = 0;
out_unlock:
	mutex_unlock(&subsys->lock);
	return ret;

out_restore_subsys_maxnsid:
	subsys->max_nsid = nvmet_max_nsid(subsys);
	percpu_ref_exit(&ns->ref);
out_dev_put:
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
out_dev_disable:
	nvmet_ns_dev_disable(ns);
	goto out_unlock;
}

void nvmet_ns_disable(struct nvmet_ns *ns)
{
	struct nvmet_subsys *subsys = ns->subsys;
	struct nvmet_ctrl *ctrl;

	mutex_lock(&subsys->lock);
	if (!ns->enabled)
		goto out_unlock;

	ns->enabled = false;
	xa_erase(&ns->subsys->namespaces, ns->nsid);
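	/*
	 * If we just removed the namespace with the highest NSID, recompute
	 * subsys->max_nsid from the namespaces that remain.
	 */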
	if (ns->nsid == subsys->max_nsid)
		subsys->max_nsid = nvmet_max_nsid(subsys);

	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));

	mutex_unlock(&subsys->lock);

	/*
	 * Now that we removed the namespaces from the lookup list, we
	 * can kill the per_cpu ref and wait for any remaining references
	 * to be dropped, as well as an RCU grace period for anyone only
	 * using the namespace under rcu_read_lock().  Note that we can't
	 * use call_rcu here as we need to ensure the namespaces have
	 * been fully destroyed before unloading the module.
	 */
	percpu_ref_kill(&ns->ref);
	synchronize_rcu();
	wait_for_completion(&ns->disable_done);
	percpu_ref_exit(&ns->ref);

	mutex_lock(&subsys->lock);

	subsys->nr_namespaces--;
	nvmet_ns_changed(subsys, ns->nsid);
	nvmet_ns_dev_disable(ns);
out_unlock:
	mutex_unlock(&subsys->lock);
}

void nvmet_ns_free(struct nvmet_ns *ns)
{
	nvmet_ns_disable(ns);

	down_write(&nvmet_ana_sem);
	nvmet_ana_group_enabled[ns->anagrpid]--;
	up_write(&nvmet_ana_sem);

	kfree(ns->device_path);
	kfree(ns);
}

struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
{
	struct nvmet_ns *ns;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;

	init_completion(&ns->disable_done);

	ns->nsid = nsid;
	ns->subsys = subsys;

	down_write(&nvmet_ana_sem);
	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
	nvmet_ana_group_enabled[ns->anagrpid]++;
	up_write(&nvmet_ana_sem);

	uuid_gen(&ns->uuid);
	ns->buffered_io = false;
	ns->csi = NVME_CSI_NVM;

	return ns;
}

static void nvmet_update_sq_head(struct nvmet_req *req)
{
	if (req->sq->size) {
		u32 old_sqhd, new_sqhd;

		old_sqhd = READ_ONCE(req->sq->sqhd);
		do {
			new_sqhd = (old_sqhd + 1) % req->sq->size;
		} while (!try_cmpxchg(&req->sq->sqhd, &old_sqhd, new_sqhd));
	}
	req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
}

static void nvmet_set_error(struct nvmet_req *req, u16 status)
{
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	struct nvme_error_slot *new_error_slot;
	unsigned long flags;

	req->cqe->status = cpu_to_le16(status << 1);

	if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
		return;

	spin_lock_irqsave(&ctrl->error_lock, flags);
	ctrl->err_counter++;
	new_error_slot =
		&ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];

	new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
	new_error_slot->sqid = cpu_to_le16(req->sq->qid);
	new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
	new_error_slot->status_field = cpu_to_le16(status << 1);
	new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
	new_error_slot->lba = cpu_to_le64(req->error_slba);
	new_error_slot->nsid = req->cmd->common.nsid;
	spin_unlock_irqrestore(&ctrl->error_lock, flags);

	/* set the more bit for this request */
	req->cqe->status |= cpu_to_le16(1 << 14);
}

static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
{
	struct nvmet_ns *ns = req->ns;

	if (!req->sq->sqhd_disabled)
		nvmet_update_sq_head(req);
	req->cqe->sq_id = cpu_to_le16(req->sq->qid);
	req->cqe->command_id = req->cmd->common.command_id;

	if (unlikely(status))
		nvmet_set_error(req, status);

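	/*
	 * Trace the completion, hand the response to the transport, and drop
	 * the namespace reference taken in nvmet_req_find_ns(), if any.
	 */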
	trace_nvmet_req_complete(req);

	req->ops->queue_response(req);
	if (ns)
		nvmet_put_namespace(ns);
}

void nvmet_req_complete(struct nvmet_req *req, u16 status)
{
	struct nvmet_sq *sq = req->sq;

	__nvmet_req_complete(req, status);
	percpu_ref_put(&sq->ref);
}
EXPORT_SYMBOL_GPL(nvmet_req_complete);

void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
		u16 qid, u16 size)
{
	cq->qid = qid;
	cq->size = size;
}

void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
		u16 qid, u16 size)
{
	sq->sqhd = 0;
	sq->qid = qid;
	sq->size = size;

	ctrl->sqs[qid] = sq;
}

static void nvmet_confirm_sq(struct percpu_ref *ref)
{
	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);

	complete(&sq->confirm_done);
}

void nvmet_sq_destroy(struct nvmet_sq *sq)
{
	struct nvmet_ctrl *ctrl = sq->ctrl;

	/*
	 * If this is the admin queue, complete all AERs so that our
	 * queue doesn't have outstanding requests on it.
	 */
	if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
		nvmet_async_events_failall(ctrl);
	percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
	wait_for_completion(&sq->confirm_done);
	wait_for_completion(&sq->free_done);
	percpu_ref_exit(&sq->ref);
	nvmet_auth_sq_free(sq);

	/*
	 * We must reference the ctrl again after waiting for inflight IO
	 * to complete, because admin connect may have sneaked in after we
	 * stored sq->ctrl locally, but before we killed the percpu_ref.  The
	 * admin connect allocates and assigns sq->ctrl, which now needs a
	 * final ref put, as this ctrl is going away.
	 */
	ctrl = sq->ctrl;

	if (ctrl) {
		/*
		 * The teardown flow may take some time, and the host may not
		 * send us keep-alive during this period, hence reset the
		 * traffic based keep-alive timer so we don't trigger a
		 * controller teardown as a result of a keep-alive expiration.
		 */
		ctrl->reset_tbkas = true;
		sq->ctrl->sqs[sq->qid] = NULL;
		nvmet_ctrl_put(ctrl);
		sq->ctrl = NULL; /* allows reusing the queue later */
	}
}
EXPORT_SYMBOL_GPL(nvmet_sq_destroy);

static void nvmet_sq_free(struct percpu_ref *ref)
{
	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);

	complete(&sq->free_done);
}

int nvmet_sq_init(struct nvmet_sq *sq)
{
	int ret;

	ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
	if (ret) {
		pr_err("percpu_ref init failed!\n");
		return ret;
	}
	init_completion(&sq->free_done);
	init_completion(&sq->confirm_done);
	nvmet_auth_sq_init(sq);

	return 0;
}
EXPORT_SYMBOL_GPL(nvmet_sq_init);

static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
		struct nvmet_ns *ns)
{
	enum nvme_ana_state state = port->ana_state[ns->anagrpid];

	if (unlikely(state == NVME_ANA_INACCESSIBLE))
		return NVME_SC_ANA_INACCESSIBLE;
	if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
		return NVME_SC_ANA_PERSISTENT_LOSS;
	if (unlikely(state == NVME_ANA_CHANGE))
		return NVME_SC_ANA_TRANSITION;
	return 0;
}

static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
{
	if (unlikely(req->ns->readonly)) {
		switch (req->cmd->common.opcode) {
		case nvme_cmd_read:
		case nvme_cmd_flush:
			break;
		default:
			return NVME_SC_NS_WRITE_PROTECTED;
		}
	}

	return 0;
}

static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
	struct nvme_command *cmd = req->cmd;
	u16 ret;

	if (nvme_is_fabrics(cmd))
		return nvmet_parse_fabrics_io_cmd(req);

	if (unlikely(!nvmet_check_auth_status(req)))
		return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR;

	ret = nvmet_check_ctrl_status(req);
	if (unlikely(ret))
		return ret;

	if (nvmet_is_passthru_req(req))
		return nvmet_parse_passthru_io_cmd(req);

	ret = nvmet_req_find_ns(req);
	if (unlikely(ret))
		return ret;

	ret = nvmet_check_ana_state(req->port, req->ns);
	if (unlikely(ret)) {
		req->error_loc = offsetof(struct nvme_common_command, nsid);
		return ret;
	}
	ret = nvmet_io_cmd_check_access(req);
	if (unlikely(ret)) {
		req->error_loc = offsetof(struct nvme_common_command, nsid);
		return ret;
	}

	switch (req->ns->csi) {
	case NVME_CSI_NVM:
		if (req->ns->file)
			return nvmet_file_parse_io_cmd(req);
		return nvmet_bdev_parse_io_cmd(req);
	case NVME_CSI_ZNS:
		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
			return nvmet_bdev_zns_parse_io_cmd(req);
		return NVME_SC_INVALID_IO_CMD_SET;
	default:
		return NVME_SC_INVALID_IO_CMD_SET;
	}
}

bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
{
	u8 flags = req->cmd->common.flags;
	u16 status;

	req->cq = cq;
	req->sq = sq;
	req->ops = ops;
	req->sg = NULL;
	req->metadata_sg = NULL;
	req->sg_cnt = 0;
	req->metadata_sg_cnt = 0;
	req->transfer_len = 0;
	req->metadata_len = 0;
	req->cqe->result.u64 = 0;
	req->cqe->status = 0;
	req->cqe->sq_head = 0;
	req->ns = NULL;
	req->error_loc = NVMET_NO_ERROR_LOC;
	req->error_slba = 0;

	/* no support for fused commands yet */
	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
		req->error_loc = offsetof(struct nvme_common_command, flags);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto fail;
	}

	/*
	 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
	 * contains an address of a single contiguous physical buffer that is
	 * byte aligned.
	 */
	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
		req->error_loc = offsetof(struct nvme_common_command, flags);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto fail;
	}

	if (unlikely(!req->sq->ctrl))
		/* will return an error for any non-connect command: */
		status = nvmet_parse_connect_cmd(req);
	else if (likely(req->sq->qid != 0))
		status = nvmet_parse_io_cmd(req);
	else
		status = nvmet_parse_admin_cmd(req);

	if (status)
		goto fail;

	trace_nvmet_req_init(req, req->cmd);

	if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto fail;
	}

	if (sq->ctrl)
		sq->ctrl->reset_tbkas = true;

	return true;

fail:
	__nvmet_req_complete(req, status);
	return false;
}
EXPORT_SYMBOL_GPL(nvmet_req_init);

void nvmet_req_uninit(struct nvmet_req *req)
{
	percpu_ref_put(&req->sq->ref);
	if (req->ns)
		nvmet_put_namespace(req->ns);
}
EXPORT_SYMBOL_GPL(nvmet_req_uninit);

bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
{
	if (unlikely(len != req->transfer_len)) {
		req->error_loc = offsetof(struct nvme_common_command, dptr);
		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR);
		return false;
	}

	return true;
}
EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);

bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
{
	if (unlikely(data_len > req->transfer_len)) {
		req->error_loc = offsetof(struct nvme_common_command, dptr);
		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR);
		return false;
	}

	return true;
}

static unsigned int nvmet_data_transfer_len(struct nvmet_req *req)
{
	return req->transfer_len - req->metadata_len;
}

static int nvmet_req_alloc_p2pmem_sgls(struct pci_dev *p2p_dev,
		struct nvmet_req *req)
{
	req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
			nvmet_data_transfer_len(req));
	if (!req->sg)
		goto out_err;

	if (req->metadata_len) {
		req->metadata_sg = pci_p2pmem_alloc_sgl(p2p_dev,
				&req->metadata_sg_cnt, req->metadata_len);
		if (!req->metadata_sg)
			goto out_free_sg;
	}

	req->p2p_dev = p2p_dev;

	return 0;
out_free_sg:
	pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
out_err:
	return -ENOMEM;
}

static struct pci_dev *nvmet_req_find_p2p_dev(struct nvmet_req *req)
{
	if (!IS_ENABLED(CONFIG_PCI_P2PDMA) ||
	    !req->sq->ctrl || !req->sq->qid || !req->ns)
		return NULL;
	return radix_tree_lookup(&req->sq->ctrl->p2p_ns_map, req->ns->nsid);
}

int nvmet_req_alloc_sgls(struct nvmet_req *req)
{
	struct pci_dev *p2p_dev = nvmet_req_find_p2p_dev(req);

	if (p2p_dev && !nvmet_req_alloc_p2pmem_sgls(p2p_dev, req))
		return 0;

	req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL,
			    &req->sg_cnt);
	if (unlikely(!req->sg))
		goto out;

	if (req->metadata_len) {
		req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL,
					     &req->metadata_sg_cnt);
		if (unlikely(!req->metadata_sg))
			goto out_free;
	}

	return 0;
out_free:
	sgl_free(req->sg);
out:
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls);

void nvmet_req_free_sgls(struct nvmet_req *req)
{
	if (req->p2p_dev) {
		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
		if (req->metadata_sg)
			pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg);
		req->p2p_dev = NULL;
	} else {
		sgl_free(req->sg);
		if (req->metadata_sg)
			sgl_free(req->metadata_sg);
	}

	req->sg = NULL;
	req->metadata_sg = NULL;
	req->sg_cnt = 0;
	req->metadata_sg_cnt = 0;
}
EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);

static inline bool nvmet_cc_en(u32 cc)
{
	return (cc >> NVME_CC_EN_SHIFT) & 0x1;
}

static inline u8 nvmet_cc_css(u32 cc)
{
	return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
}

static inline u8 nvmet_cc_mps(u32 cc)
{
	return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
}

static inline u8 nvmet_cc_ams(u32 cc)
{
	return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
}

static inline u8 nvmet_cc_shn(u32 cc)
{
	return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
}

static inline u8 nvmet_cc_iosqes(u32 cc)
{
	return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
}

static inline u8 nvmet_cc_iocqes(u32 cc)
{
	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
}

static inline bool nvmet_css_supported(u8 cc_css)
{
	switch (cc_css << NVME_CC_CSS_SHIFT) {
	case NVME_CC_CSS_NVM:
	case NVME_CC_CSS_CSI:
		return true;
	default:
		return false;
	}
}

static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
{
	lockdep_assert_held(&ctrl->lock);

	/*
	 * Only I/O controllers should verify iosqes,iocqes.
	 * Strictly speaking, the spec says a discovery controller
	 * should verify iosqes,iocqes are zeroed, however that
	 * would break backwards compatibility, so don't enforce it.
	 */
	if (!nvmet_is_disc_subsys(ctrl->subsys) &&
	    (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
	     nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
		ctrl->csts = NVME_CSTS_CFS;
		return;
	}

	if (nvmet_cc_mps(ctrl->cc) != 0 ||
	    nvmet_cc_ams(ctrl->cc) != 0 ||
	    !nvmet_css_supported(nvmet_cc_css(ctrl->cc))) {
		ctrl->csts = NVME_CSTS_CFS;
		return;
	}

	ctrl->csts = NVME_CSTS_RDY;

	/*
	 * Controllers that are not yet enabled should not really enforce the
	 * keep alive timeout, but we still want to track a timeout and cleanup
	 * in case a host died before it enabled the controller.  Hence, simply
	 * reset the keep alive timer when the controller is enabled.
	 */
	if (ctrl->kato)
		mod_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
}

static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
{
	lockdep_assert_held(&ctrl->lock);

	/* XXX: tear down queues? */
	ctrl->csts &= ~NVME_CSTS_RDY;
	ctrl->cc = 0;
}

void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
{
	u32 old;

	mutex_lock(&ctrl->lock);
	old = ctrl->cc;
	ctrl->cc = new;

	if (nvmet_cc_en(new) && !nvmet_cc_en(old))
		nvmet_start_ctrl(ctrl);
	if (!nvmet_cc_en(new) && nvmet_cc_en(old))
		nvmet_clear_ctrl(ctrl);
	if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
		nvmet_clear_ctrl(ctrl);
		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
	}
	if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
	mutex_unlock(&ctrl->lock);
}

static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
{
	/* command sets supported: NVMe command set: */
	ctrl->cap = (1ULL << 37);
	/* Controller supports one or more I/O Command Sets */
	ctrl->cap |= (1ULL << 43);
	/* CC.EN timeout in 500msec units: */
	ctrl->cap |= (15ULL << 24);
	/* maximum queue entries supported: */
	if (ctrl->ops->get_max_queue_size)
		ctrl->cap |= min_t(u16, ctrl->ops->get_max_queue_size(ctrl),
				   ctrl->port->max_queue_size) - 1;
	else
		ctrl->cap |= ctrl->port->max_queue_size - 1;

	if (nvmet_is_passthru_subsys(ctrl->subsys))
		nvmet_passthrough_override_cap(ctrl);
}

struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
				       const char *hostnqn, u16 cntlid,
				       struct nvmet_req *req)
{
	struct nvmet_ctrl *ctrl = NULL;
	struct nvmet_subsys *subsys;

	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
	if (!subsys) {
		pr_warn("connect request for invalid subsystem %s!\n",
			subsysnqn);
		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
		goto out;
	}

	mutex_lock(&subsys->lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
		if (ctrl->cntlid == cntlid) {
			if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
				pr_warn("hostnqn mismatch.\n");
				continue;
			}
			if (!kref_get_unless_zero(&ctrl->ref))
				continue;

			/* ctrl found */
			goto found;
		}
	}

	ctrl = NULL; /* ctrl not found */
	pr_warn("could not find controller %d for subsys %s / host %s\n",
		cntlid, subsysnqn, hostnqn);
	req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);

found:
	mutex_unlock(&subsys->lock);
	nvmet_subsys_put(subsys);
out:
	return ctrl;
}

u16 nvmet_check_ctrl_status(struct nvmet_req *req)
{
	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
		       req->cmd->common.opcode, req->sq->qid);
		return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
	}

	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
		       req->cmd->common.opcode, req->sq->qid);
		return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
	}

	if (unlikely(!nvmet_check_auth_status(req))) {
		pr_warn("qid %d not authenticated\n", req->sq->qid);
		return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR;
	}
	return 0;
}

bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
{
	struct nvmet_host_link *p;

	lockdep_assert_held(&nvmet_config_sem);

	if (subsys->allow_any_host)
		return true;

	if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */
		return true;

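	/* Otherwise the host must be on this subsystem's allowed hosts list. */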
	list_for_each_entry(p, &subsys->hosts, entry) {
		if (!strcmp(nvmet_host_name(p->host), hostnqn))
			return true;
	}

	return false;
}

/*
 * Note: ctrl->subsys->lock should be held when calling this function
 */
static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
		struct nvmet_req *req)
{
	struct nvmet_ns *ns;
	unsigned long idx;

	if (!req->p2p_client)
		return;

	ctrl->p2p_client = get_device(req->p2p_client);

	xa_for_each(&ctrl->subsys->namespaces, idx, ns)
		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
}

/*
 * Note: ctrl->subsys->lock should be held when calling this function
 */
static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
{
	struct radix_tree_iter iter;
	void __rcu **slot;

	radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
		pci_dev_put(radix_tree_deref_slot(slot));

	put_device(ctrl->p2p_client);
}

static void nvmet_fatal_error_handler(struct work_struct *work)
{
	struct nvmet_ctrl *ctrl =
			container_of(work, struct nvmet_ctrl, fatal_err_work);

	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
	ctrl->ops->delete_ctrl(ctrl);
}

u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
{
	struct nvmet_subsys *subsys;
	struct nvmet_ctrl *ctrl;
	int ret;
	u16 status;

	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR;
	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
	if (!subsys) {
		pr_warn("connect request for invalid subsystem %s!\n",
			subsysnqn);
		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
		req->error_loc = offsetof(struct nvme_common_command, dptr);
		goto out;
	}

	down_read(&nvmet_config_sem);
	if (!nvmet_host_allowed(subsys, hostnqn)) {
		pr_info("connect by host %s for subsystem %s not allowed\n",
			hostnqn, subsysnqn);
		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
		up_read(&nvmet_config_sem);
		status = NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR;
		req->error_loc = offsetof(struct nvme_common_command, dptr);
		goto out_put_subsystem;
	}
	up_read(&nvmet_config_sem);

	status = NVME_SC_INTERNAL;
	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
	if (!ctrl)
		goto out_put_subsystem;
	mutex_init(&ctrl->lock);

	ctrl->port = req->port;
	ctrl->ops = req->ops;

#ifdef CONFIG_NVME_TARGET_PASSTHRU
	/* By default, set loop targets to clear IDs */
	if (ctrl->port->disc_addr.trtype == NVMF_TRTYPE_LOOP)
		subsys->clear_ids = 1;
#endif

	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
	INIT_LIST_HEAD(&ctrl->async_events);
	INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
	INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);

	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);

	kref_init(&ctrl->ref);
	ctrl->subsys = subsys;
	ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support;
	nvmet_init_cap(ctrl);
	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);

	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
			sizeof(__le32), GFP_KERNEL);
	if (!ctrl->changed_ns_list)
		goto out_free_ctrl;

	ctrl->sqs = kcalloc(subsys->max_qid + 1,
			    sizeof(struct nvmet_sq *),
			    GFP_KERNEL);
	if (!ctrl->sqs)
		goto out_free_changed_ns_list;

	ret = ida_alloc_range(&cntlid_ida,
			      subsys->cntlid_min, subsys->cntlid_max,
			      GFP_KERNEL);
	if (ret < 0) {
		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
		goto out_free_sqs;
	}
	ctrl->cntlid = ret;

	/*
	 * Discovery controllers may use some arbitrary high value
	 * in order to cleanup stale discovery sessions
	 */
	if (nvmet_is_disc_subsys(ctrl->subsys) && !kato)
		kato = NVMET_DISC_KATO_MS;

	/* keep-alive timeout in seconds */
	ctrl->kato = DIV_ROUND_UP(kato, 1000);

	ctrl->err_counter = 0;
	spin_lock_init(&ctrl->error_lock);

	nvmet_start_keep_alive_timer(ctrl);

	mutex_lock(&subsys->lock);
	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
	nvmet_setup_p2p_ns_map(ctrl, req);
	nvmet_debugfs_ctrl_setup(ctrl);
	mutex_unlock(&subsys->lock);

	*ctrlp = ctrl;
	return 0;

out_free_sqs:
	kfree(ctrl->sqs);
out_free_changed_ns_list:
	kfree(ctrl->changed_ns_list);
out_free_ctrl:
	kfree(ctrl);
out_put_subsystem:
	nvmet_subsys_put(subsys);
out:
	return status;
}

static void nvmet_ctrl_free(struct kref *ref)
{
	struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
	struct nvmet_subsys *subsys = ctrl->subsys;

	mutex_lock(&subsys->lock);
	nvmet_release_p2p_ns_map(ctrl);
	list_del(&ctrl->subsys_entry);
	mutex_unlock(&subsys->lock);

	nvmet_stop_keep_alive_timer(ctrl);

	flush_work(&ctrl->async_event_work);
	cancel_work_sync(&ctrl->fatal_err_work);

	nvmet_destroy_auth(ctrl);

	nvmet_debugfs_ctrl_free(ctrl);

	ida_free(&cntlid_ida, ctrl->cntlid);

	nvmet_async_events_free(ctrl);
	kfree(ctrl->sqs);
	kfree(ctrl->changed_ns_list);
	kfree(ctrl);

	nvmet_subsys_put(subsys);
}

void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
{
	kref_put(&ctrl->ref, nvmet_ctrl_free);
}

void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
{
	mutex_lock(&ctrl->lock);
	if (!(ctrl->csts & NVME_CSTS_CFS)) {
		ctrl->csts |= NVME_CSTS_CFS;
		queue_work(nvmet_wq, &ctrl->fatal_err_work);
	}
	mutex_unlock(&ctrl->lock);
}
EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);

ssize_t nvmet_ctrl_host_traddr(struct nvmet_ctrl *ctrl,
		char *traddr, size_t traddr_len)
{
	if (!ctrl->ops->host_traddr)
		return -EOPNOTSUPP;
	return ctrl->ops->host_traddr(ctrl, traddr, traddr_len);
}

static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
		const char *subsysnqn)
{
	struct nvmet_subsys_link *p;

	if (!port)
		return NULL;

	if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
		if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
			return NULL;
		return nvmet_disc_subsys;
	}

	down_read(&nvmet_config_sem);
	if (!strncmp(nvmet_disc_subsys->subsysnqn, subsysnqn,
				NVMF_NQN_SIZE)) {
		if (kref_get_unless_zero(&nvmet_disc_subsys->ref)) {
			up_read(&nvmet_config_sem);
			return nvmet_disc_subsys;
		}
	}
	list_for_each_entry(p, &port->subsystems, entry) {
		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
				NVMF_NQN_SIZE)) {
			if (!kref_get_unless_zero(&p->subsys->ref))
				break;
			up_read(&nvmet_config_sem);
			return p->subsys;
		}
	}
	up_read(&nvmet_config_sem);
	return NULL;
}

struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
		enum nvme_subsys_type type)
{
	struct nvmet_subsys *subsys;
	char serial[NVMET_SN_MAX_SIZE / 2];
	int ret;

	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
	if (!subsys)
		return ERR_PTR(-ENOMEM);

	subsys->ver = NVMET_DEFAULT_VS;
	/* generate a random serial number as our controllers are ephemeral: */
	get_random_bytes(&serial, sizeof(serial));
	bin2hex(subsys->serial, &serial, sizeof(serial));

	subsys->model_number = kstrdup(NVMET_DEFAULT_CTRL_MODEL, GFP_KERNEL);
	if (!subsys->model_number) {
		ret = -ENOMEM;
		goto free_subsys;
	}

	subsys->ieee_oui = 0;

	subsys->firmware_rev = kstrndup(UTS_RELEASE, NVMET_FR_MAX_SIZE, GFP_KERNEL);
	if (!subsys->firmware_rev) {
		ret = -ENOMEM;
		goto free_mn;
	}

	switch (type) {
	case NVME_NQN_NVME:
		subsys->max_qid = NVMET_NR_QUEUES;
		break;
	case NVME_NQN_DISC:
	case NVME_NQN_CURR:
		subsys->max_qid = 0;
		break;
	default:
		pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
		ret = -EINVAL;
		goto free_fr;
	}
	subsys->type = type;
	subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
			GFP_KERNEL);
	if (!subsys->subsysnqn) {
		ret = -ENOMEM;
		goto free_fr;
	}
	subsys->cntlid_min = NVME_CNTLID_MIN;
	subsys->cntlid_max = NVME_CNTLID_MAX;
	kref_init(&subsys->ref);

	mutex_init(&subsys->lock);
	xa_init(&subsys->namespaces);
	INIT_LIST_HEAD(&subsys->ctrls);
	INIT_LIST_HEAD(&subsys->hosts);

	ret = nvmet_debugfs_subsys_setup(subsys);
	if (ret)
		goto free_subsysnqn;

	return subsys;

free_subsysnqn:
	kfree(subsys->subsysnqn);
free_fr:
	kfree(subsys->firmware_rev);
free_mn:
	kfree(subsys->model_number);
free_subsys:
	kfree(subsys);
	return ERR_PTR(ret);
}

static void nvmet_subsys_free(struct kref *ref)
{
	struct nvmet_subsys *subsys =
		container_of(ref, struct nvmet_subsys, ref);

	WARN_ON_ONCE(!xa_empty(&subsys->namespaces));

	nvmet_debugfs_subsys_free(subsys);

	xa_destroy(&subsys->namespaces);
	nvmet_passthru_subsys_free(subsys);

	kfree(subsys->subsysnqn);
	kfree(subsys->model_number);
	kfree(subsys->firmware_rev);
	kfree(subsys);
}

void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
{
	struct nvmet_ctrl *ctrl;

	mutex_lock(&subsys->lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		ctrl->ops->delete_ctrl(ctrl);
	mutex_unlock(&subsys->lock);
}

void nvmet_subsys_put(struct nvmet_subsys *subsys)
{
	kref_put(&subsys->ref, nvmet_subsys_free);
}

static int __init nvmet_init(void)
{
	int error = -ENOMEM;

	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;

	nvmet_bvec_cache = kmem_cache_create("nvmet-bvec",
			NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), 0,
			SLAB_HWCACHE_ALIGN, NULL);
	if (!nvmet_bvec_cache)
		return -ENOMEM;

	zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
	if (!zbd_wq)
		goto out_destroy_bvec_cache;

	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
alloc_workqueue("nvmet-buffered-io-wq", 1715 WQ_MEM_RECLAIM, 0); 1716 if (!buffered_io_wq) 1717 goto out_free_zbd_work_queue; 1718 1719 nvmet_wq = alloc_workqueue("nvmet-wq", 1720 WQ_MEM_RECLAIM | WQ_UNBOUND, 0); 1721 if (!nvmet_wq) 1722 goto out_free_buffered_work_queue; 1723 1724 error = nvmet_init_discovery(); 1725 if (error) 1726 goto out_free_nvmet_work_queue; 1727 1728 error = nvmet_init_debugfs(); 1729 if (error) 1730 goto out_exit_discovery; 1731 1732 error = nvmet_init_configfs(); 1733 if (error) 1734 goto out_exit_debugfs; 1735 1736 return 0; 1737 1738 out_exit_debugfs: 1739 nvmet_exit_debugfs(); 1740 out_exit_discovery: 1741 nvmet_exit_discovery(); 1742 out_free_nvmet_work_queue: 1743 destroy_workqueue(nvmet_wq); 1744 out_free_buffered_work_queue: 1745 destroy_workqueue(buffered_io_wq); 1746 out_free_zbd_work_queue: 1747 destroy_workqueue(zbd_wq); 1748 out_destroy_bvec_cache: 1749 kmem_cache_destroy(nvmet_bvec_cache); 1750 return error; 1751 } 1752 1753 static void __exit nvmet_exit(void) 1754 { 1755 nvmet_exit_configfs(); 1756 nvmet_exit_debugfs(); 1757 nvmet_exit_discovery(); 1758 ida_destroy(&cntlid_ida); 1759 destroy_workqueue(nvmet_wq); 1760 destroy_workqueue(buffered_io_wq); 1761 destroy_workqueue(zbd_wq); 1762 kmem_cache_destroy(nvmet_bvec_cache); 1763 1764 BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); 1765 BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); 1766 } 1767 1768 module_init(nvmet_init); 1769 module_exit(nvmet_exit); 1770 1771 MODULE_DESCRIPTION("NVMe target core framework"); 1772 MODULE_LICENSE("GPL v2"); 1773