1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * NVM Express device driver 4 * Copyright (c) 2011-2014, Intel Corporation. 5 */ 6 7 #include <linux/blkdev.h> 8 #include <linux/blk-mq.h> 9 #include <linux/compat.h> 10 #include <linux/delay.h> 11 #include <linux/errno.h> 12 #include <linux/hdreg.h> 13 #include <linux/kernel.h> 14 #include <linux/module.h> 15 #include <linux/backing-dev.h> 16 #include <linux/list_sort.h> 17 #include <linux/slab.h> 18 #include <linux/types.h> 19 #include <linux/pr.h> 20 #include <linux/ptrace.h> 21 #include <linux/nvme_ioctl.h> 22 #include <linux/pm_qos.h> 23 #include <asm/unaligned.h> 24 25 #include "nvme.h" 26 #include "fabrics.h" 27 28 #define CREATE_TRACE_POINTS 29 #include "trace.h" 30 31 #define NVME_MINORS (1U << MINORBITS) 32 33 unsigned int admin_timeout = 60; 34 module_param(admin_timeout, uint, 0644); 35 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); 36 EXPORT_SYMBOL_GPL(admin_timeout); 37 38 unsigned int nvme_io_timeout = 30; 39 module_param_named(io_timeout, nvme_io_timeout, uint, 0644); 40 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); 41 EXPORT_SYMBOL_GPL(nvme_io_timeout); 42 43 static unsigned char shutdown_timeout = 5; 44 module_param(shutdown_timeout, byte, 0644); 45 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); 46 47 static u8 nvme_max_retries = 5; 48 module_param_named(max_retries, nvme_max_retries, byte, 0644); 49 MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); 50 51 static unsigned long default_ps_max_latency_us = 100000; 52 module_param(default_ps_max_latency_us, ulong, 0644); 53 MODULE_PARM_DESC(default_ps_max_latency_us, 54 "max power saving latency for new devices; use PM QOS to change per device"); 55 56 static bool force_apst; 57 module_param(force_apst, bool, 0644); 58 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); 59 60 static bool streams; 61 module_param(streams, bool, 0644); 62 MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); 63 64 /* 65 * nvme_wq - hosts nvme related works that are not reset or delete 66 * nvme_reset_wq - hosts nvme reset works 67 * nvme_delete_wq - hosts nvme delete works 68 * 69 * nvme_wq will host works such as scan, aen handling, fw activation, 70 * keep-alive, periodic reconnects etc. nvme_reset_wq 71 * runs reset works which also flush works hosted on nvme_wq for 72 * serialization purposes. nvme_delete_wq host controller deletion 73 * works which flush reset works for serialization. 74 */ 75 struct workqueue_struct *nvme_wq; 76 EXPORT_SYMBOL_GPL(nvme_wq); 77 78 struct workqueue_struct *nvme_reset_wq; 79 EXPORT_SYMBOL_GPL(nvme_reset_wq); 80 81 struct workqueue_struct *nvme_delete_wq; 82 EXPORT_SYMBOL_GPL(nvme_delete_wq); 83 84 static LIST_HEAD(nvme_subsystems); 85 static DEFINE_MUTEX(nvme_subsystems_lock); 86 87 static DEFINE_IDA(nvme_instance_ida); 88 static dev_t nvme_ctrl_base_chr_devt; 89 static struct class *nvme_class; 90 static struct class *nvme_subsys_class; 91 92 static void nvme_put_subsystem(struct nvme_subsystem *subsys); 93 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 94 unsigned nsid); 95 96 /* 97 * Prepare a queue for teardown. 98 * 99 * This must forcibly unquiesce queues to avoid blocking dispatch, and only set 100 * the capacity to 0 after that to avoid blocking dispatchers that may be 101 * holding bd_butex. This will end buffered writers dirtying pages that can't 102 * be synced. 
103 */ 104 static void nvme_set_queue_dying(struct nvme_ns *ns) 105 { 106 if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 107 return; 108 109 blk_set_queue_dying(ns->queue); 110 blk_mq_unquiesce_queue(ns->queue); 111 112 set_capacity_and_notify(ns->disk, 0); 113 } 114 115 static void nvme_queue_scan(struct nvme_ctrl *ctrl) 116 { 117 /* 118 * Only new queue scan work when admin and IO queues are both alive 119 */ 120 if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) 121 queue_work(nvme_wq, &ctrl->scan_work); 122 } 123 124 /* 125 * Use this function to proceed with scheduling reset_work for a controller 126 * that had previously been set to the resetting state. This is intended for 127 * code paths that can't be interrupted by other reset attempts. A hot removal 128 * may prevent this from succeeding. 129 */ 130 int nvme_try_sched_reset(struct nvme_ctrl *ctrl) 131 { 132 if (ctrl->state != NVME_CTRL_RESETTING) 133 return -EBUSY; 134 if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) 135 return -EBUSY; 136 return 0; 137 } 138 EXPORT_SYMBOL_GPL(nvme_try_sched_reset); 139 140 static void nvme_failfast_work(struct work_struct *work) 141 { 142 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 143 struct nvme_ctrl, failfast_work); 144 145 if (ctrl->state != NVME_CTRL_CONNECTING) 146 return; 147 148 set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); 149 dev_info(ctrl->device, "failfast expired\n"); 150 nvme_kick_requeue_lists(ctrl); 151 } 152 153 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl) 154 { 155 if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1) 156 return; 157 158 schedule_delayed_work(&ctrl->failfast_work, 159 ctrl->opts->fast_io_fail_tmo * HZ); 160 } 161 162 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl) 163 { 164 if (!ctrl->opts) 165 return; 166 167 cancel_delayed_work_sync(&ctrl->failfast_work); 168 clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); 169 } 170 171 172 int nvme_reset_ctrl(struct nvme_ctrl *ctrl) 173 { 174 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 175 return -EBUSY; 176 if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) 177 return -EBUSY; 178 return 0; 179 } 180 EXPORT_SYMBOL_GPL(nvme_reset_ctrl); 181 182 static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) 183 { 184 int ret; 185 186 ret = nvme_reset_ctrl(ctrl); 187 if (!ret) { 188 flush_work(&ctrl->reset_work); 189 if (ctrl->state != NVME_CTRL_LIVE) 190 ret = -ENETRESET; 191 } 192 193 return ret; 194 } 195 196 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) 197 { 198 dev_info(ctrl->device, 199 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); 200 201 flush_work(&ctrl->reset_work); 202 nvme_stop_ctrl(ctrl); 203 nvme_remove_namespaces(ctrl); 204 ctrl->ops->delete_ctrl(ctrl); 205 nvme_uninit_ctrl(ctrl); 206 } 207 208 static void nvme_delete_ctrl_work(struct work_struct *work) 209 { 210 struct nvme_ctrl *ctrl = 211 container_of(work, struct nvme_ctrl, delete_work); 212 213 nvme_do_delete_ctrl(ctrl); 214 } 215 216 int nvme_delete_ctrl(struct nvme_ctrl *ctrl) 217 { 218 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 219 return -EBUSY; 220 if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) 221 return -EBUSY; 222 return 0; 223 } 224 EXPORT_SYMBOL_GPL(nvme_delete_ctrl); 225 226 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) 227 { 228 /* 229 * Keep a reference until nvme_do_delete_ctrl() complete, 230 * since ->delete_ctrl can free the controller. 
231 */ 232 nvme_get_ctrl(ctrl); 233 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 234 nvme_do_delete_ctrl(ctrl); 235 nvme_put_ctrl(ctrl); 236 } 237 238 static blk_status_t nvme_error_status(u16 status) 239 { 240 switch (status & 0x7ff) { 241 case NVME_SC_SUCCESS: 242 return BLK_STS_OK; 243 case NVME_SC_CAP_EXCEEDED: 244 return BLK_STS_NOSPC; 245 case NVME_SC_LBA_RANGE: 246 case NVME_SC_CMD_INTERRUPTED: 247 case NVME_SC_NS_NOT_READY: 248 return BLK_STS_TARGET; 249 case NVME_SC_BAD_ATTRIBUTES: 250 case NVME_SC_ONCS_NOT_SUPPORTED: 251 case NVME_SC_INVALID_OPCODE: 252 case NVME_SC_INVALID_FIELD: 253 case NVME_SC_INVALID_NS: 254 return BLK_STS_NOTSUPP; 255 case NVME_SC_WRITE_FAULT: 256 case NVME_SC_READ_ERROR: 257 case NVME_SC_UNWRITTEN_BLOCK: 258 case NVME_SC_ACCESS_DENIED: 259 case NVME_SC_READ_ONLY: 260 case NVME_SC_COMPARE_FAILED: 261 return BLK_STS_MEDIUM; 262 case NVME_SC_GUARD_CHECK: 263 case NVME_SC_APPTAG_CHECK: 264 case NVME_SC_REFTAG_CHECK: 265 case NVME_SC_INVALID_PI: 266 return BLK_STS_PROTECTION; 267 case NVME_SC_RESERVATION_CONFLICT: 268 return BLK_STS_NEXUS; 269 case NVME_SC_HOST_PATH_ERROR: 270 return BLK_STS_TRANSPORT; 271 case NVME_SC_ZONE_TOO_MANY_ACTIVE: 272 return BLK_STS_ZONE_ACTIVE_RESOURCE; 273 case NVME_SC_ZONE_TOO_MANY_OPEN: 274 return BLK_STS_ZONE_OPEN_RESOURCE; 275 default: 276 return BLK_STS_IOERR; 277 } 278 } 279 280 static void nvme_retry_req(struct request *req) 281 { 282 struct nvme_ns *ns = req->q->queuedata; 283 unsigned long delay = 0; 284 u16 crd; 285 286 /* The mask and shift result must be <= 3 */ 287 crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; 288 if (ns && crd) 289 delay = ns->ctrl->crdt[crd - 1] * 100; 290 291 nvme_req(req)->retries++; 292 blk_mq_requeue_request(req, false); 293 blk_mq_delay_kick_requeue_list(req->q, delay); 294 } 295 296 enum nvme_disposition { 297 COMPLETE, 298 RETRY, 299 FAILOVER, 300 }; 301 302 static inline enum nvme_disposition nvme_decide_disposition(struct request *req) 303 { 304 if (likely(nvme_req(req)->status == 0)) 305 return COMPLETE; 306 307 if (blk_noretry_request(req) || 308 (nvme_req(req)->status & NVME_SC_DNR) || 309 nvme_req(req)->retries >= nvme_max_retries) 310 return COMPLETE; 311 312 if (req->cmd_flags & REQ_NVME_MPATH) { 313 if (nvme_is_path_error(nvme_req(req)->status) || 314 blk_queue_dying(req->q)) 315 return FAILOVER; 316 } else { 317 if (blk_queue_dying(req->q)) 318 return COMPLETE; 319 } 320 321 return RETRY; 322 } 323 324 static inline void nvme_end_req(struct request *req) 325 { 326 blk_status_t status = nvme_error_status(nvme_req(req)->status); 327 328 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 329 req_op(req) == REQ_OP_ZONE_APPEND) 330 req->__sector = nvme_lba_to_sect(req->q->queuedata, 331 le64_to_cpu(nvme_req(req)->result.u64)); 332 333 nvme_trace_bio_complete(req); 334 blk_mq_end_request(req, status); 335 } 336 337 void nvme_complete_rq(struct request *req) 338 { 339 trace_nvme_complete_rq(req); 340 nvme_cleanup_cmd(req); 341 342 if (nvme_req(req)->ctrl->kas) 343 nvme_req(req)->ctrl->comp_seen = true; 344 345 switch (nvme_decide_disposition(req)) { 346 case COMPLETE: 347 nvme_end_req(req); 348 return; 349 case RETRY: 350 nvme_retry_req(req); 351 return; 352 case FAILOVER: 353 nvme_failover_req(req); 354 return; 355 } 356 } 357 EXPORT_SYMBOL_GPL(nvme_complete_rq); 358 359 bool nvme_cancel_request(struct request *req, void *data, bool reserved) 360 { 361 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, 362 "Cancelling I/O %d", req->tag); 363 364 /* don't abort one completed request 
*/ 365 if (blk_mq_request_completed(req)) 366 return true; 367 368 nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; 369 blk_mq_complete_request(req); 370 return true; 371 } 372 EXPORT_SYMBOL_GPL(nvme_cancel_request); 373 374 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 375 enum nvme_ctrl_state new_state) 376 { 377 enum nvme_ctrl_state old_state; 378 unsigned long flags; 379 bool changed = false; 380 381 spin_lock_irqsave(&ctrl->lock, flags); 382 383 old_state = ctrl->state; 384 switch (new_state) { 385 case NVME_CTRL_LIVE: 386 switch (old_state) { 387 case NVME_CTRL_NEW: 388 case NVME_CTRL_RESETTING: 389 case NVME_CTRL_CONNECTING: 390 changed = true; 391 fallthrough; 392 default: 393 break; 394 } 395 break; 396 case NVME_CTRL_RESETTING: 397 switch (old_state) { 398 case NVME_CTRL_NEW: 399 case NVME_CTRL_LIVE: 400 changed = true; 401 fallthrough; 402 default: 403 break; 404 } 405 break; 406 case NVME_CTRL_CONNECTING: 407 switch (old_state) { 408 case NVME_CTRL_NEW: 409 case NVME_CTRL_RESETTING: 410 changed = true; 411 fallthrough; 412 default: 413 break; 414 } 415 break; 416 case NVME_CTRL_DELETING: 417 switch (old_state) { 418 case NVME_CTRL_LIVE: 419 case NVME_CTRL_RESETTING: 420 case NVME_CTRL_CONNECTING: 421 changed = true; 422 fallthrough; 423 default: 424 break; 425 } 426 break; 427 case NVME_CTRL_DELETING_NOIO: 428 switch (old_state) { 429 case NVME_CTRL_DELETING: 430 case NVME_CTRL_DEAD: 431 changed = true; 432 fallthrough; 433 default: 434 break; 435 } 436 break; 437 case NVME_CTRL_DEAD: 438 switch (old_state) { 439 case NVME_CTRL_DELETING: 440 changed = true; 441 fallthrough; 442 default: 443 break; 444 } 445 break; 446 default: 447 break; 448 } 449 450 if (changed) { 451 ctrl->state = new_state; 452 wake_up_all(&ctrl->state_wq); 453 } 454 455 spin_unlock_irqrestore(&ctrl->lock, flags); 456 if (!changed) 457 return false; 458 459 if (ctrl->state == NVME_CTRL_LIVE) { 460 if (old_state == NVME_CTRL_CONNECTING) 461 nvme_stop_failfast_work(ctrl); 462 nvme_kick_requeue_lists(ctrl); 463 } else if (ctrl->state == NVME_CTRL_CONNECTING && 464 old_state == NVME_CTRL_RESETTING) { 465 nvme_start_failfast_work(ctrl); 466 } 467 return changed; 468 } 469 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); 470 471 /* 472 * Returns true for sink states that can't ever transition back to live. 473 */ 474 static bool nvme_state_terminal(struct nvme_ctrl *ctrl) 475 { 476 switch (ctrl->state) { 477 case NVME_CTRL_NEW: 478 case NVME_CTRL_LIVE: 479 case NVME_CTRL_RESETTING: 480 case NVME_CTRL_CONNECTING: 481 return false; 482 case NVME_CTRL_DELETING: 483 case NVME_CTRL_DELETING_NOIO: 484 case NVME_CTRL_DEAD: 485 return true; 486 default: 487 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state); 488 return true; 489 } 490 } 491 492 /* 493 * Waits for the controller state to be resetting, or returns false if it is 494 * not possible to ever transition to that state. 
495 */ 496 bool nvme_wait_reset(struct nvme_ctrl *ctrl) 497 { 498 wait_event(ctrl->state_wq, 499 nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || 500 nvme_state_terminal(ctrl)); 501 return ctrl->state == NVME_CTRL_RESETTING; 502 } 503 EXPORT_SYMBOL_GPL(nvme_wait_reset); 504 505 static void nvme_free_ns_head(struct kref *ref) 506 { 507 struct nvme_ns_head *head = 508 container_of(ref, struct nvme_ns_head, ref); 509 510 nvme_mpath_remove_disk(head); 511 ida_simple_remove(&head->subsys->ns_ida, head->instance); 512 cleanup_srcu_struct(&head->srcu); 513 nvme_put_subsystem(head->subsys); 514 kfree(head); 515 } 516 517 static void nvme_put_ns_head(struct nvme_ns_head *head) 518 { 519 kref_put(&head->ref, nvme_free_ns_head); 520 } 521 522 static void nvme_free_ns(struct kref *kref) 523 { 524 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); 525 526 if (ns->ndev) 527 nvme_nvm_unregister(ns); 528 529 put_disk(ns->disk); 530 nvme_put_ns_head(ns->head); 531 nvme_put_ctrl(ns->ctrl); 532 kfree(ns); 533 } 534 535 void nvme_put_ns(struct nvme_ns *ns) 536 { 537 kref_put(&ns->kref, nvme_free_ns); 538 } 539 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); 540 541 static inline void nvme_clear_nvme_request(struct request *req) 542 { 543 if (!(req->rq_flags & RQF_DONTPREP)) { 544 nvme_req(req)->retries = 0; 545 nvme_req(req)->flags = 0; 546 req->rq_flags |= RQF_DONTPREP; 547 } 548 } 549 550 static inline unsigned int nvme_req_op(struct nvme_command *cmd) 551 { 552 return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; 553 } 554 555 static inline void nvme_init_request(struct request *req, 556 struct nvme_command *cmd) 557 { 558 if (req->q->queuedata) 559 req->timeout = NVME_IO_TIMEOUT; 560 else /* no queuedata implies admin queue */ 561 req->timeout = NVME_ADMIN_TIMEOUT; 562 563 req->cmd_flags |= REQ_FAILFAST_DRIVER; 564 nvme_clear_nvme_request(req); 565 nvme_req(req)->cmd = cmd; 566 } 567 568 struct request *nvme_alloc_request(struct request_queue *q, 569 struct nvme_command *cmd, blk_mq_req_flags_t flags) 570 { 571 struct request *req; 572 573 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); 574 if (!IS_ERR(req)) 575 nvme_init_request(req, cmd); 576 return req; 577 } 578 EXPORT_SYMBOL_GPL(nvme_alloc_request); 579 580 static struct request *nvme_alloc_request_qid(struct request_queue *q, 581 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) 582 { 583 struct request *req; 584 585 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, 586 qid ? qid - 1 : 0); 587 if (!IS_ERR(req)) 588 nvme_init_request(req, cmd); 589 return req; 590 } 591 592 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) 593 { 594 struct nvme_command c; 595 596 memset(&c, 0, sizeof(c)); 597 598 c.directive.opcode = nvme_admin_directive_send; 599 c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); 600 c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; 601 c.directive.dtype = NVME_DIR_IDENTIFY; 602 c.directive.tdtype = NVME_DIR_STREAMS; 603 c.directive.endir = enable ? 
NVME_DIR_ENDIR : 0; 604 605 return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); 606 } 607 608 static int nvme_disable_streams(struct nvme_ctrl *ctrl) 609 { 610 return nvme_toggle_streams(ctrl, false); 611 } 612 613 static int nvme_enable_streams(struct nvme_ctrl *ctrl) 614 { 615 return nvme_toggle_streams(ctrl, true); 616 } 617 618 static int nvme_get_stream_params(struct nvme_ctrl *ctrl, 619 struct streams_directive_params *s, u32 nsid) 620 { 621 struct nvme_command c; 622 623 memset(&c, 0, sizeof(c)); 624 memset(s, 0, sizeof(*s)); 625 626 c.directive.opcode = nvme_admin_directive_recv; 627 c.directive.nsid = cpu_to_le32(nsid); 628 c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s))); 629 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; 630 c.directive.dtype = NVME_DIR_STREAMS; 631 632 return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); 633 } 634 635 static int nvme_configure_directives(struct nvme_ctrl *ctrl) 636 { 637 struct streams_directive_params s; 638 int ret; 639 640 if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) 641 return 0; 642 if (!streams) 643 return 0; 644 645 ret = nvme_enable_streams(ctrl); 646 if (ret) 647 return ret; 648 649 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); 650 if (ret) 651 goto out_disable_stream; 652 653 ctrl->nssa = le16_to_cpu(s.nssa); 654 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { 655 dev_info(ctrl->device, "too few streams (%u) available\n", 656 ctrl->nssa); 657 goto out_disable_stream; 658 } 659 660 ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); 661 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); 662 return 0; 663 664 out_disable_stream: 665 nvme_disable_streams(ctrl); 666 return ret; 667 } 668 669 /* 670 * Check if 'req' has a write hint associated with it. If it does, assign 671 * a valid namespace stream to the write. 672 */ 673 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, 674 struct request *req, u16 *control, 675 u32 *dsmgmt) 676 { 677 enum rw_hint streamid = req->write_hint; 678 679 if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) 680 streamid = 0; 681 else { 682 streamid--; 683 if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) 684 return; 685 686 *control |= NVME_RW_DTYPE_STREAMS; 687 *dsmgmt |= streamid << 16; 688 } 689 690 if (streamid < ARRAY_SIZE(req->q->write_hints)) 691 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; 692 } 693 694 static void nvme_setup_passthrough(struct request *req, 695 struct nvme_command *cmd) 696 { 697 memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); 698 /* passthru commands should let the driver set the SGL flags */ 699 cmd->common.flags &= ~NVME_CMD_SGL_ALL; 700 } 701 702 static inline void nvme_setup_flush(struct nvme_ns *ns, 703 struct nvme_command *cmnd) 704 { 705 cmnd->common.opcode = nvme_cmd_flush; 706 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); 707 } 708 709 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, 710 struct nvme_command *cmnd) 711 { 712 unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; 713 struct nvme_dsm_range *range; 714 struct bio *bio; 715 716 /* 717 * Some devices do not consider the DSM 'Number of Ranges' field when 718 * determining how much data to DMA. Always allocate memory for maximum 719 * number of segments to prevent device reading beyond end of buffer. 
720 */ 721 static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES; 722 723 range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN); 724 if (!range) { 725 /* 726 * If we fail allocation our range, fallback to the controller 727 * discard page. If that's also busy, it's safe to return 728 * busy, as we know we can make progress once that's freed. 729 */ 730 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) 731 return BLK_STS_RESOURCE; 732 733 range = page_address(ns->ctrl->discard_page); 734 } 735 736 __rq_for_each_bio(bio, req) { 737 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); 738 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; 739 740 if (n < segments) { 741 range[n].cattr = cpu_to_le32(0); 742 range[n].nlb = cpu_to_le32(nlb); 743 range[n].slba = cpu_to_le64(slba); 744 } 745 n++; 746 } 747 748 if (WARN_ON_ONCE(n != segments)) { 749 if (virt_to_page(range) == ns->ctrl->discard_page) 750 clear_bit_unlock(0, &ns->ctrl->discard_page_busy); 751 else 752 kfree(range); 753 return BLK_STS_IOERR; 754 } 755 756 cmnd->dsm.opcode = nvme_cmd_dsm; 757 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); 758 cmnd->dsm.nr = cpu_to_le32(segments - 1); 759 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 760 761 req->special_vec.bv_page = virt_to_page(range); 762 req->special_vec.bv_offset = offset_in_page(range); 763 req->special_vec.bv_len = alloc_size; 764 req->rq_flags |= RQF_SPECIAL_PAYLOAD; 765 766 return BLK_STS_OK; 767 } 768 769 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, 770 struct request *req, struct nvme_command *cmnd) 771 { 772 if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 773 return nvme_setup_discard(ns, req, cmnd); 774 775 cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; 776 cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); 777 cmnd->write_zeroes.slba = 778 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); 779 cmnd->write_zeroes.length = 780 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 781 cmnd->write_zeroes.control = 0; 782 return BLK_STS_OK; 783 } 784 785 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, 786 struct request *req, struct nvme_command *cmnd, 787 enum nvme_opcode op) 788 { 789 struct nvme_ctrl *ctrl = ns->ctrl; 790 u16 control = 0; 791 u32 dsmgmt = 0; 792 793 if (req->cmd_flags & REQ_FUA) 794 control |= NVME_RW_FUA; 795 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 796 control |= NVME_RW_LR; 797 798 if (req->cmd_flags & REQ_RAHEAD) 799 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 800 801 cmnd->rw.opcode = op; 802 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); 803 cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); 804 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 805 806 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) 807 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); 808 809 if (ns->ms) { 810 /* 811 * If formated with metadata, the block layer always provides a 812 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else 813 * we enable the PRACT bit for protection information or set the 814 * namespace capacity to zero to prevent any I/O. 
815 */ 816 if (!blk_integrity_rq(req)) { 817 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) 818 return BLK_STS_NOTSUPP; 819 control |= NVME_RW_PRINFO_PRACT; 820 } 821 822 switch (ns->pi_type) { 823 case NVME_NS_DPS_PI_TYPE3: 824 control |= NVME_RW_PRINFO_PRCHK_GUARD; 825 break; 826 case NVME_NS_DPS_PI_TYPE1: 827 case NVME_NS_DPS_PI_TYPE2: 828 control |= NVME_RW_PRINFO_PRCHK_GUARD | 829 NVME_RW_PRINFO_PRCHK_REF; 830 if (op == nvme_cmd_zone_append) 831 control |= NVME_RW_APPEND_PIREMAP; 832 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); 833 break; 834 } 835 } 836 837 cmnd->rw.control = cpu_to_le16(control); 838 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 839 return 0; 840 } 841 842 void nvme_cleanup_cmd(struct request *req) 843 { 844 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { 845 struct nvme_ns *ns = req->rq_disk->private_data; 846 struct page *page = req->special_vec.bv_page; 847 848 if (page == ns->ctrl->discard_page) 849 clear_bit_unlock(0, &ns->ctrl->discard_page_busy); 850 else 851 kfree(page_address(page) + req->special_vec.bv_offset); 852 } 853 } 854 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); 855 856 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 857 struct nvme_command *cmd) 858 { 859 blk_status_t ret = BLK_STS_OK; 860 861 nvme_clear_nvme_request(req); 862 863 memset(cmd, 0, sizeof(*cmd)); 864 switch (req_op(req)) { 865 case REQ_OP_DRV_IN: 866 case REQ_OP_DRV_OUT: 867 nvme_setup_passthrough(req, cmd); 868 break; 869 case REQ_OP_FLUSH: 870 nvme_setup_flush(ns, cmd); 871 break; 872 case REQ_OP_ZONE_RESET_ALL: 873 case REQ_OP_ZONE_RESET: 874 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); 875 break; 876 case REQ_OP_ZONE_OPEN: 877 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); 878 break; 879 case REQ_OP_ZONE_CLOSE: 880 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); 881 break; 882 case REQ_OP_ZONE_FINISH: 883 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); 884 break; 885 case REQ_OP_WRITE_ZEROES: 886 ret = nvme_setup_write_zeroes(ns, req, cmd); 887 break; 888 case REQ_OP_DISCARD: 889 ret = nvme_setup_discard(ns, req, cmd); 890 break; 891 case REQ_OP_READ: 892 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); 893 break; 894 case REQ_OP_WRITE: 895 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); 896 break; 897 case REQ_OP_ZONE_APPEND: 898 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); 899 break; 900 default: 901 WARN_ON_ONCE(1); 902 return BLK_STS_IOERR; 903 } 904 905 cmd->common.command_id = req->tag; 906 trace_nvme_setup_cmd(req, cmd); 907 return ret; 908 } 909 EXPORT_SYMBOL_GPL(nvme_setup_cmd); 910 911 static void nvme_end_sync_rq(struct request *rq, blk_status_t error) 912 { 913 struct completion *waiting = rq->end_io_data; 914 915 rq->end_io_data = NULL; 916 complete(waiting); 917 } 918 919 static void nvme_execute_rq_polled(struct request_queue *q, 920 struct gendisk *bd_disk, struct request *rq, int at_head) 921 { 922 DECLARE_COMPLETION_ONSTACK(wait); 923 924 WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); 925 926 rq->cmd_flags |= REQ_HIPRI; 927 rq->end_io_data = &wait; 928 blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); 929 930 while (!completion_done(&wait)) { 931 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); 932 cond_resched(); 933 } 934 } 935 936 /* 937 * Returns 0 on success. 
If the result is negative, it's a Linux error code; 938 * if the result is positive, it's an NVM Express status code 939 */ 940 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 941 union nvme_result *result, void *buffer, unsigned bufflen, 942 unsigned timeout, int qid, int at_head, 943 blk_mq_req_flags_t flags, bool poll) 944 { 945 struct request *req; 946 int ret; 947 948 if (qid == NVME_QID_ANY) 949 req = nvme_alloc_request(q, cmd, flags); 950 else 951 req = nvme_alloc_request_qid(q, cmd, flags, qid); 952 if (IS_ERR(req)) 953 return PTR_ERR(req); 954 955 if (timeout) 956 req->timeout = timeout; 957 958 if (buffer && bufflen) { 959 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); 960 if (ret) 961 goto out; 962 } 963 964 if (poll) 965 nvme_execute_rq_polled(req->q, NULL, req, at_head); 966 else 967 blk_execute_rq(req->q, NULL, req, at_head); 968 if (result) 969 *result = nvme_req(req)->result; 970 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 971 ret = -EINTR; 972 else 973 ret = nvme_req(req)->status; 974 out: 975 blk_mq_free_request(req); 976 return ret; 977 } 978 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); 979 980 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 981 void *buffer, unsigned bufflen) 982 { 983 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, 984 NVME_QID_ANY, 0, 0, false); 985 } 986 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); 987 988 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, 989 unsigned len, u32 seed, bool write) 990 { 991 struct bio_integrity_payload *bip; 992 int ret = -ENOMEM; 993 void *buf; 994 995 buf = kmalloc(len, GFP_KERNEL); 996 if (!buf) 997 goto out; 998 999 ret = -EFAULT; 1000 if (write && copy_from_user(buf, ubuf, len)) 1001 goto out_free_meta; 1002 1003 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 1004 if (IS_ERR(bip)) { 1005 ret = PTR_ERR(bip); 1006 goto out_free_meta; 1007 } 1008 1009 bip->bip_iter.bi_size = len; 1010 bip->bip_iter.bi_sector = seed; 1011 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 1012 offset_in_page(buf)); 1013 if (ret == len) 1014 return buf; 1015 ret = -ENOMEM; 1016 out_free_meta: 1017 kfree(buf); 1018 out: 1019 return ERR_PTR(ret); 1020 } 1021 1022 static u32 nvme_known_admin_effects(u8 opcode) 1023 { 1024 switch (opcode) { 1025 case nvme_admin_format_nvm: 1026 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | 1027 NVME_CMD_EFFECTS_CSE_MASK; 1028 case nvme_admin_sanitize_nvm: 1029 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; 1030 default: 1031 break; 1032 } 1033 return 0; 1034 } 1035 1036 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) 1037 { 1038 u32 effects = 0; 1039 1040 if (ns) { 1041 if (ns->head->effects) 1042 effects = le32_to_cpu(ns->head->effects->iocs[opcode]); 1043 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) 1044 dev_warn(ctrl->device, 1045 "IO command:%02x has unhandled effects:%08x\n", 1046 opcode, effects); 1047 return 0; 1048 } 1049 1050 if (ctrl->effects) 1051 effects = le32_to_cpu(ctrl->effects->acs[opcode]); 1052 effects |= nvme_known_admin_effects(opcode); 1053 1054 return effects; 1055 } 1056 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); 1057 1058 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1059 u8 opcode) 1060 { 1061 u32 effects = nvme_command_effects(ctrl, ns, opcode); 1062 1063 /* 1064 * For simplicity, IO to all namespaces is quiesced even if the command 1065 * effects 
say only one namespace is affected. 1066 */ 1067 if (effects & NVME_CMD_EFFECTS_CSE_MASK) { 1068 mutex_lock(&ctrl->scan_lock); 1069 mutex_lock(&ctrl->subsys->lock); 1070 nvme_mpath_start_freeze(ctrl->subsys); 1071 nvme_mpath_wait_freeze(ctrl->subsys); 1072 nvme_start_freeze(ctrl); 1073 nvme_wait_freeze(ctrl); 1074 } 1075 return effects; 1076 } 1077 1078 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) 1079 { 1080 if (effects & NVME_CMD_EFFECTS_CSE_MASK) { 1081 nvme_unfreeze(ctrl); 1082 nvme_mpath_unfreeze(ctrl->subsys); 1083 mutex_unlock(&ctrl->subsys->lock); 1084 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); 1085 mutex_unlock(&ctrl->scan_lock); 1086 } 1087 if (effects & NVME_CMD_EFFECTS_CCC) 1088 nvme_init_identify(ctrl); 1089 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { 1090 nvme_queue_scan(ctrl); 1091 flush_work(&ctrl->scan_work); 1092 } 1093 } 1094 1095 void nvme_execute_passthru_rq(struct request *rq) 1096 { 1097 struct nvme_command *cmd = nvme_req(rq)->cmd; 1098 struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; 1099 struct nvme_ns *ns = rq->q->queuedata; 1100 struct gendisk *disk = ns ? ns->disk : NULL; 1101 u32 effects; 1102 1103 effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 1104 blk_execute_rq(rq->q, disk, rq, 0); 1105 nvme_passthru_end(ctrl, effects); 1106 } 1107 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); 1108 1109 static int nvme_submit_user_cmd(struct request_queue *q, 1110 struct nvme_command *cmd, void __user *ubuffer, 1111 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 1112 u32 meta_seed, u64 *result, unsigned timeout) 1113 { 1114 bool write = nvme_is_write(cmd); 1115 struct nvme_ns *ns = q->queuedata; 1116 struct gendisk *disk = ns ? ns->disk : NULL; 1117 struct request *req; 1118 struct bio *bio = NULL; 1119 void *meta = NULL; 1120 int ret; 1121 1122 req = nvme_alloc_request(q, cmd, 0); 1123 if (IS_ERR(req)) 1124 return PTR_ERR(req); 1125 1126 if (timeout) 1127 req->timeout = timeout; 1128 nvme_req(req)->flags |= NVME_REQ_USERCMD; 1129 1130 if (ubuffer && bufflen) { 1131 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, 1132 GFP_KERNEL); 1133 if (ret) 1134 goto out; 1135 bio = req->bio; 1136 bio->bi_disk = disk; 1137 if (disk && meta_buffer && meta_len) { 1138 meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, 1139 meta_seed, write); 1140 if (IS_ERR(meta)) { 1141 ret = PTR_ERR(meta); 1142 goto out_unmap; 1143 } 1144 req->cmd_flags |= REQ_INTEGRITY; 1145 } 1146 } 1147 1148 nvme_execute_passthru_rq(req); 1149 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 1150 ret = -EINTR; 1151 else 1152 ret = nvme_req(req)->status; 1153 if (result) 1154 *result = le64_to_cpu(nvme_req(req)->result.u64); 1155 if (meta && !ret && !write) { 1156 if (copy_to_user(meta_buffer, meta, meta_len)) 1157 ret = -EFAULT; 1158 } 1159 kfree(meta); 1160 out_unmap: 1161 if (bio) 1162 blk_rq_unmap_user(bio); 1163 out: 1164 blk_mq_free_request(req); 1165 return ret; 1166 } 1167 1168 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) 1169 { 1170 struct nvme_ctrl *ctrl = rq->end_io_data; 1171 unsigned long flags; 1172 bool startka = false; 1173 1174 blk_mq_free_request(rq); 1175 1176 if (status) { 1177 dev_err(ctrl->device, 1178 "failed nvme_keep_alive_end_io error=%d\n", 1179 status); 1180 return; 1181 } 1182 1183 ctrl->comp_seen = false; 1184 spin_lock_irqsave(&ctrl->lock, flags); 1185 if (ctrl->state == NVME_CTRL_LIVE || 1186 ctrl->state == NVME_CTRL_CONNECTING) 1187 startka = 
true; 1188 spin_unlock_irqrestore(&ctrl->lock, flags); 1189 if (startka) 1190 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); 1191 } 1192 1193 static int nvme_keep_alive(struct nvme_ctrl *ctrl) 1194 { 1195 struct request *rq; 1196 1197 rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, 1198 BLK_MQ_REQ_RESERVED); 1199 if (IS_ERR(rq)) 1200 return PTR_ERR(rq); 1201 1202 rq->timeout = ctrl->kato * HZ; 1203 rq->end_io_data = ctrl; 1204 1205 blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); 1206 1207 return 0; 1208 } 1209 1210 static void nvme_keep_alive_work(struct work_struct *work) 1211 { 1212 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 1213 struct nvme_ctrl, ka_work); 1214 bool comp_seen = ctrl->comp_seen; 1215 1216 if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { 1217 dev_dbg(ctrl->device, 1218 "reschedule traffic based keep-alive timer\n"); 1219 ctrl->comp_seen = false; 1220 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); 1221 return; 1222 } 1223 1224 if (nvme_keep_alive(ctrl)) { 1225 /* allocation failure, reset the controller */ 1226 dev_err(ctrl->device, "keep-alive failed\n"); 1227 nvme_reset_ctrl(ctrl); 1228 return; 1229 } 1230 } 1231 1232 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) 1233 { 1234 if (unlikely(ctrl->kato == 0)) 1235 return; 1236 1237 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); 1238 } 1239 1240 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) 1241 { 1242 if (unlikely(ctrl->kato == 0)) 1243 return; 1244 1245 cancel_delayed_work_sync(&ctrl->ka_work); 1246 } 1247 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); 1248 1249 /* 1250 * In NVMe 1.0 the CNS field was just a binary controller or namespace 1251 * flag, thus sending any new CNS opcodes has a big chance of not working. 1252 * Qemu unfortunately had that bug after reporting a 1.1 version compliance 1253 * (but not for any later version). 
1254 */ 1255 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl) 1256 { 1257 if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) 1258 return ctrl->vs < NVME_VS(1, 2, 0); 1259 return ctrl->vs < NVME_VS(1, 1, 0); 1260 } 1261 1262 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) 1263 { 1264 struct nvme_command c = { }; 1265 int error; 1266 1267 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1268 c.identify.opcode = nvme_admin_identify; 1269 c.identify.cns = NVME_ID_CNS_CTRL; 1270 1271 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); 1272 if (!*id) 1273 return -ENOMEM; 1274 1275 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 1276 sizeof(struct nvme_id_ctrl)); 1277 if (error) 1278 kfree(*id); 1279 return error; 1280 } 1281 1282 static bool nvme_multi_css(struct nvme_ctrl *ctrl) 1283 { 1284 return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; 1285 } 1286 1287 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, 1288 struct nvme_ns_id_desc *cur, bool *csi_seen) 1289 { 1290 const char *warn_str = "ctrl returned bogus length:"; 1291 void *data = cur; 1292 1293 switch (cur->nidt) { 1294 case NVME_NIDT_EUI64: 1295 if (cur->nidl != NVME_NIDT_EUI64_LEN) { 1296 dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n", 1297 warn_str, cur->nidl); 1298 return -1; 1299 } 1300 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); 1301 return NVME_NIDT_EUI64_LEN; 1302 case NVME_NIDT_NGUID: 1303 if (cur->nidl != NVME_NIDT_NGUID_LEN) { 1304 dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n", 1305 warn_str, cur->nidl); 1306 return -1; 1307 } 1308 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); 1309 return NVME_NIDT_NGUID_LEN; 1310 case NVME_NIDT_UUID: 1311 if (cur->nidl != NVME_NIDT_UUID_LEN) { 1312 dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n", 1313 warn_str, cur->nidl); 1314 return -1; 1315 } 1316 uuid_copy(&ids->uuid, data + sizeof(*cur)); 1317 return NVME_NIDT_UUID_LEN; 1318 case NVME_NIDT_CSI: 1319 if (cur->nidl != NVME_NIDT_CSI_LEN) { 1320 dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", 1321 warn_str, cur->nidl); 1322 return -1; 1323 } 1324 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); 1325 *csi_seen = true; 1326 return NVME_NIDT_CSI_LEN; 1327 default: 1328 /* Skip unknown types */ 1329 return cur->nidl; 1330 } 1331 } 1332 1333 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, 1334 struct nvme_ns_ids *ids) 1335 { 1336 struct nvme_command c = { }; 1337 bool csi_seen = false; 1338 int status, pos, len; 1339 void *data; 1340 1341 if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl)) 1342 return 0; 1343 if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) 1344 return 0; 1345 1346 c.identify.opcode = nvme_admin_identify; 1347 c.identify.nsid = cpu_to_le32(nsid); 1348 c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; 1349 1350 data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 1351 if (!data) 1352 return -ENOMEM; 1353 1354 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, 1355 NVME_IDENTIFY_DATA_SIZE); 1356 if (status) { 1357 dev_warn(ctrl->device, 1358 "Identify Descriptors failed (nsid=%u, status=0x%x)\n", 1359 nsid, status); 1360 goto free_data; 1361 } 1362 1363 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { 1364 struct nvme_ns_id_desc *cur = data + pos; 1365 1366 if (cur->nidl == 0) 1367 break; 1368 1369 len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); 1370 if (len < 0) 1371 break; 1372 1373 len += sizeof(*cur); 1374 } 1375 
1376 if (nvme_multi_css(ctrl) && !csi_seen) { 1377 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", 1378 nsid); 1379 status = -EINVAL; 1380 } 1381 1382 free_data: 1383 kfree(data); 1384 return status; 1385 } 1386 1387 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, 1388 struct nvme_ns_ids *ids, struct nvme_id_ns **id) 1389 { 1390 struct nvme_command c = { }; 1391 int error; 1392 1393 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1394 c.identify.opcode = nvme_admin_identify; 1395 c.identify.nsid = cpu_to_le32(nsid); 1396 c.identify.cns = NVME_ID_CNS_NS; 1397 1398 *id = kmalloc(sizeof(**id), GFP_KERNEL); 1399 if (!*id) 1400 return -ENOMEM; 1401 1402 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); 1403 if (error) { 1404 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); 1405 goto out_free_id; 1406 } 1407 1408 error = -ENODEV; 1409 if ((*id)->ncap == 0) /* namespace not allocated or attached */ 1410 goto out_free_id; 1411 1412 if (ctrl->vs >= NVME_VS(1, 1, 0) && 1413 !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 1414 memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); 1415 if (ctrl->vs >= NVME_VS(1, 2, 0) && 1416 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 1417 memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); 1418 1419 return 0; 1420 1421 out_free_id: 1422 kfree(*id); 1423 return error; 1424 } 1425 1426 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, 1427 unsigned int dword11, void *buffer, size_t buflen, u32 *result) 1428 { 1429 union nvme_result res = { 0 }; 1430 struct nvme_command c; 1431 int ret; 1432 1433 memset(&c, 0, sizeof(c)); 1434 c.features.opcode = op; 1435 c.features.fid = cpu_to_le32(fid); 1436 c.features.dword11 = cpu_to_le32(dword11); 1437 1438 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, 1439 buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); 1440 if (ret >= 0 && result) 1441 *result = le32_to_cpu(res.u32); 1442 return ret; 1443 } 1444 1445 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, 1446 unsigned int dword11, void *buffer, size_t buflen, 1447 u32 *result) 1448 { 1449 return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, 1450 buflen, result); 1451 } 1452 EXPORT_SYMBOL_GPL(nvme_set_features); 1453 1454 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, 1455 unsigned int dword11, void *buffer, size_t buflen, 1456 u32 *result) 1457 { 1458 return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer, 1459 buflen, result); 1460 } 1461 EXPORT_SYMBOL_GPL(nvme_get_features); 1462 1463 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) 1464 { 1465 u32 q_count = (*count - 1) | ((*count - 1) << 16); 1466 u32 result; 1467 int status, nr_io_queues; 1468 1469 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, 1470 &result); 1471 if (status < 0) 1472 return status; 1473 1474 /* 1475 * Degraded controllers might return an error when setting the queue 1476 * count. We still want to be able to bring them online and offer 1477 * access to the admin queue, as that might be only way to fix them up. 
1478 */ 1479 if (status > 0) { 1480 dev_err(ctrl->device, "Could not set queue count (%d)\n", status); 1481 *count = 0; 1482 } else { 1483 nr_io_queues = min(result & 0xffff, result >> 16) + 1; 1484 *count = min(*count, nr_io_queues); 1485 } 1486 1487 return 0; 1488 } 1489 EXPORT_SYMBOL_GPL(nvme_set_queue_count); 1490 1491 #define NVME_AEN_SUPPORTED \ 1492 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ 1493 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) 1494 1495 static void nvme_enable_aen(struct nvme_ctrl *ctrl) 1496 { 1497 u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED; 1498 int status; 1499 1500 if (!supported_aens) 1501 return; 1502 1503 status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens, 1504 NULL, 0, &result); 1505 if (status) 1506 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", 1507 supported_aens); 1508 1509 queue_work(nvme_wq, &ctrl->async_event_work); 1510 } 1511 1512 /* 1513 * Convert integer values from ioctl structures to user pointers, silently 1514 * ignoring the upper bits in the compat case to match behaviour of 32-bit 1515 * kernels. 1516 */ 1517 static void __user *nvme_to_user_ptr(uintptr_t ptrval) 1518 { 1519 if (in_compat_syscall()) 1520 ptrval = (compat_uptr_t)ptrval; 1521 return (void __user *)ptrval; 1522 } 1523 1524 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1525 { 1526 struct nvme_user_io io; 1527 struct nvme_command c; 1528 unsigned length, meta_len; 1529 void __user *metadata; 1530 1531 if (copy_from_user(&io, uio, sizeof(io))) 1532 return -EFAULT; 1533 if (io.flags) 1534 return -EINVAL; 1535 1536 switch (io.opcode) { 1537 case nvme_cmd_write: 1538 case nvme_cmd_read: 1539 case nvme_cmd_compare: 1540 break; 1541 default: 1542 return -EINVAL; 1543 } 1544 1545 length = (io.nblocks + 1) << ns->lba_shift; 1546 meta_len = (io.nblocks + 1) * ns->ms; 1547 metadata = nvme_to_user_ptr(io.metadata); 1548 1549 if (ns->features & NVME_NS_EXT_LBAS) { 1550 length += meta_len; 1551 meta_len = 0; 1552 } else if (meta_len) { 1553 if ((io.metadata & 3) || !io.metadata) 1554 return -EINVAL; 1555 } 1556 1557 memset(&c, 0, sizeof(c)); 1558 c.rw.opcode = io.opcode; 1559 c.rw.flags = io.flags; 1560 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 1561 c.rw.slba = cpu_to_le64(io.slba); 1562 c.rw.length = cpu_to_le16(io.nblocks); 1563 c.rw.control = cpu_to_le16(io.control); 1564 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1565 c.rw.reftag = cpu_to_le32(io.reftag); 1566 c.rw.apptag = cpu_to_le16(io.apptag); 1567 c.rw.appmask = cpu_to_le16(io.appmask); 1568 1569 return nvme_submit_user_cmd(ns->queue, &c, 1570 nvme_to_user_ptr(io.addr), length, 1571 metadata, meta_len, lower_32_bits(io.slba), NULL, 0); 1572 } 1573 1574 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1575 struct nvme_passthru_cmd __user *ucmd) 1576 { 1577 struct nvme_passthru_cmd cmd; 1578 struct nvme_command c; 1579 unsigned timeout = 0; 1580 u64 result; 1581 int status; 1582 1583 if (!capable(CAP_SYS_ADMIN)) 1584 return -EACCES; 1585 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1586 return -EFAULT; 1587 if (cmd.flags) 1588 return -EINVAL; 1589 1590 memset(&c, 0, sizeof(c)); 1591 c.common.opcode = cmd.opcode; 1592 c.common.flags = cmd.flags; 1593 c.common.nsid = cpu_to_le32(cmd.nsid); 1594 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1595 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1596 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 1597 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 1598 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 1599 
c.common.cdw13 = cpu_to_le32(cmd.cdw13); 1600 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 1601 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 1602 1603 if (cmd.timeout_ms) 1604 timeout = msecs_to_jiffies(cmd.timeout_ms); 1605 1606 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 1607 nvme_to_user_ptr(cmd.addr), cmd.data_len, 1608 nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 1609 0, &result, timeout); 1610 1611 if (status >= 0) { 1612 if (put_user(result, &ucmd->result)) 1613 return -EFAULT; 1614 } 1615 1616 return status; 1617 } 1618 1619 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1620 struct nvme_passthru_cmd64 __user *ucmd) 1621 { 1622 struct nvme_passthru_cmd64 cmd; 1623 struct nvme_command c; 1624 unsigned timeout = 0; 1625 int status; 1626 1627 if (!capable(CAP_SYS_ADMIN)) 1628 return -EACCES; 1629 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1630 return -EFAULT; 1631 if (cmd.flags) 1632 return -EINVAL; 1633 1634 memset(&c, 0, sizeof(c)); 1635 c.common.opcode = cmd.opcode; 1636 c.common.flags = cmd.flags; 1637 c.common.nsid = cpu_to_le32(cmd.nsid); 1638 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1639 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1640 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 1641 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 1642 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 1643 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 1644 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 1645 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 1646 1647 if (cmd.timeout_ms) 1648 timeout = msecs_to_jiffies(cmd.timeout_ms); 1649 1650 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 1651 nvme_to_user_ptr(cmd.addr), cmd.data_len, 1652 nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 1653 0, &cmd.result, timeout); 1654 1655 if (status >= 0) { 1656 if (put_user(cmd.result, &ucmd->result)) 1657 return -EFAULT; 1658 } 1659 1660 return status; 1661 } 1662 1663 /* 1664 * Issue ioctl requests on the first available path. Note that unlike normal 1665 * block layer requests we will not retry failed request on another controller. 
1666 */ 1667 struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, 1668 struct nvme_ns_head **head, int *srcu_idx) 1669 { 1670 #ifdef CONFIG_NVME_MULTIPATH 1671 if (disk->fops == &nvme_ns_head_ops) { 1672 struct nvme_ns *ns; 1673 1674 *head = disk->private_data; 1675 *srcu_idx = srcu_read_lock(&(*head)->srcu); 1676 ns = nvme_find_path(*head); 1677 if (!ns) 1678 srcu_read_unlock(&(*head)->srcu, *srcu_idx); 1679 return ns; 1680 } 1681 #endif 1682 *head = NULL; 1683 *srcu_idx = -1; 1684 return disk->private_data; 1685 } 1686 1687 void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) 1688 { 1689 if (head) 1690 srcu_read_unlock(&head->srcu, idx); 1691 } 1692 1693 static bool is_ctrl_ioctl(unsigned int cmd) 1694 { 1695 if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) 1696 return true; 1697 if (is_sed_ioctl(cmd)) 1698 return true; 1699 return false; 1700 } 1701 1702 static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 1703 void __user *argp, 1704 struct nvme_ns_head *head, 1705 int srcu_idx) 1706 { 1707 struct nvme_ctrl *ctrl = ns->ctrl; 1708 int ret; 1709 1710 nvme_get_ctrl(ns->ctrl); 1711 nvme_put_ns_from_disk(head, srcu_idx); 1712 1713 switch (cmd) { 1714 case NVME_IOCTL_ADMIN_CMD: 1715 ret = nvme_user_cmd(ctrl, NULL, argp); 1716 break; 1717 case NVME_IOCTL_ADMIN64_CMD: 1718 ret = nvme_user_cmd64(ctrl, NULL, argp); 1719 break; 1720 default: 1721 ret = sed_ioctl(ctrl->opal_dev, cmd, argp); 1722 break; 1723 } 1724 nvme_put_ctrl(ctrl); 1725 return ret; 1726 } 1727 1728 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, 1729 unsigned int cmd, unsigned long arg) 1730 { 1731 struct nvme_ns_head *head = NULL; 1732 void __user *argp = (void __user *)arg; 1733 struct nvme_ns *ns; 1734 int srcu_idx, ret; 1735 1736 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 1737 if (unlikely(!ns)) 1738 return -EWOULDBLOCK; 1739 1740 /* 1741 * Handle ioctls that apply to the controller instead of the namespace 1742 * seperately and drop the ns SRCU reference early. This avoids a 1743 * deadlock when deleting namespaces using the passthrough interface. 1744 */ 1745 if (is_ctrl_ioctl(cmd)) 1746 return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); 1747 1748 switch (cmd) { 1749 case NVME_IOCTL_ID: 1750 force_successful_syscall_return(); 1751 ret = ns->head->ns_id; 1752 break; 1753 case NVME_IOCTL_IO_CMD: 1754 ret = nvme_user_cmd(ns->ctrl, ns, argp); 1755 break; 1756 case NVME_IOCTL_SUBMIT_IO: 1757 ret = nvme_submit_io(ns, argp); 1758 break; 1759 case NVME_IOCTL_IO64_CMD: 1760 ret = nvme_user_cmd64(ns->ctrl, ns, argp); 1761 break; 1762 default: 1763 if (ns->ndev) 1764 ret = nvme_nvm_ioctl(ns, cmd, arg); 1765 else 1766 ret = -ENOTTY; 1767 } 1768 1769 nvme_put_ns_from_disk(head, srcu_idx); 1770 return ret; 1771 } 1772 1773 #ifdef CONFIG_COMPAT 1774 struct nvme_user_io32 { 1775 __u8 opcode; 1776 __u8 flags; 1777 __u16 control; 1778 __u16 nblocks; 1779 __u16 rsvd; 1780 __u64 metadata; 1781 __u64 addr; 1782 __u64 slba; 1783 __u32 dsmgmt; 1784 __u32 reftag; 1785 __u16 apptag; 1786 __u16 appmask; 1787 } __attribute__((__packed__)); 1788 1789 #define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) 1790 1791 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, 1792 unsigned int cmd, unsigned long arg) 1793 { 1794 /* 1795 * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO 1796 * between 32 bit programs and 64 bit kernel. 
1797 * The cause is that the results of sizeof(struct nvme_user_io), 1798 * which is used to define NVME_IOCTL_SUBMIT_IO, 1799 * are not same between 32 bit compiler and 64 bit compiler. 1800 * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling 1801 * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. 1802 * Other IOCTL numbers are same between 32 bit and 64 bit. 1803 * So there is nothing to do regarding to other IOCTL numbers. 1804 */ 1805 if (cmd == NVME_IOCTL_SUBMIT_IO32) 1806 return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); 1807 1808 return nvme_ioctl(bdev, mode, cmd, arg); 1809 } 1810 #else 1811 #define nvme_compat_ioctl NULL 1812 #endif /* CONFIG_COMPAT */ 1813 1814 static int nvme_open(struct block_device *bdev, fmode_t mode) 1815 { 1816 struct nvme_ns *ns = bdev->bd_disk->private_data; 1817 1818 #ifdef CONFIG_NVME_MULTIPATH 1819 /* should never be called due to GENHD_FL_HIDDEN */ 1820 if (WARN_ON_ONCE(ns->head->disk)) 1821 goto fail; 1822 #endif 1823 if (!kref_get_unless_zero(&ns->kref)) 1824 goto fail; 1825 if (!try_module_get(ns->ctrl->ops->module)) 1826 goto fail_put_ns; 1827 1828 return 0; 1829 1830 fail_put_ns: 1831 nvme_put_ns(ns); 1832 fail: 1833 return -ENXIO; 1834 } 1835 1836 static void nvme_release(struct gendisk *disk, fmode_t mode) 1837 { 1838 struct nvme_ns *ns = disk->private_data; 1839 1840 module_put(ns->ctrl->ops->module); 1841 nvme_put_ns(ns); 1842 } 1843 1844 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1845 { 1846 /* some standard values */ 1847 geo->heads = 1 << 6; 1848 geo->sectors = 1 << 5; 1849 geo->cylinders = get_capacity(bdev->bd_disk) >> 11; 1850 return 0; 1851 } 1852 1853 #ifdef CONFIG_BLK_DEV_INTEGRITY 1854 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, 1855 u32 max_integrity_segments) 1856 { 1857 struct blk_integrity integrity; 1858 1859 memset(&integrity, 0, sizeof(integrity)); 1860 switch (pi_type) { 1861 case NVME_NS_DPS_PI_TYPE3: 1862 integrity.profile = &t10_pi_type3_crc; 1863 integrity.tag_size = sizeof(u16) + sizeof(u32); 1864 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1865 break; 1866 case NVME_NS_DPS_PI_TYPE1: 1867 case NVME_NS_DPS_PI_TYPE2: 1868 integrity.profile = &t10_pi_type1_crc; 1869 integrity.tag_size = sizeof(u16); 1870 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1871 break; 1872 default: 1873 integrity.profile = NULL; 1874 break; 1875 } 1876 integrity.tuple_size = ms; 1877 blk_integrity_register(disk, &integrity); 1878 blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); 1879 } 1880 #else 1881 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, 1882 u32 max_integrity_segments) 1883 { 1884 } 1885 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1886 1887 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) 1888 { 1889 struct nvme_ctrl *ctrl = ns->ctrl; 1890 struct request_queue *queue = disk->queue; 1891 u32 size = queue_logical_block_size(queue); 1892 1893 if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { 1894 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); 1895 return; 1896 } 1897 1898 if (ctrl->nr_streams && ns->sws && ns->sgs) 1899 size *= ns->sws * ns->sgs; 1900 1901 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < 1902 NVME_DSM_MAX_RANGES); 1903 1904 queue->limits.discard_alignment = 0; 1905 queue->limits.discard_granularity = size; 1906 1907 /* If discard is already enabled, don't reset queue limits */ 1908 if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) 1909 return; 1910 1911 
blk_queue_max_discard_sectors(queue, UINT_MAX); 1912 blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); 1913 1914 if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 1915 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); 1916 } 1917 1918 static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) 1919 { 1920 u64 max_blocks; 1921 1922 if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || 1923 (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) 1924 return; 1925 /* 1926 * Even though NVMe spec explicitly states that MDTS is not 1927 * applicable to the write-zeroes:- "The restriction does not apply to 1928 * commands that do not transfer data between the host and the 1929 * controller (e.g., Write Uncorrectable ro Write Zeroes command).". 1930 * In order to be more cautious use controller's max_hw_sectors value 1931 * to configure the maximum sectors for the write-zeroes which is 1932 * configured based on the controller's MDTS field in the 1933 * nvme_init_identify() if available. 1934 */ 1935 if (ns->ctrl->max_hw_sectors == UINT_MAX) 1936 max_blocks = (u64)USHRT_MAX + 1; 1937 else 1938 max_blocks = ns->ctrl->max_hw_sectors + 1; 1939 1940 blk_queue_max_write_zeroes_sectors(disk->queue, 1941 nvme_lba_to_sect(ns, max_blocks)); 1942 } 1943 1944 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) 1945 { 1946 return !uuid_is_null(&ids->uuid) || 1947 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || 1948 memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); 1949 } 1950 1951 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) 1952 { 1953 return uuid_equal(&a->uuid, &b->uuid) && 1954 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && 1955 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && 1956 a->csi == b->csi; 1957 } 1958 1959 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1960 u32 *phys_bs, u32 *io_opt) 1961 { 1962 struct streams_directive_params s; 1963 int ret; 1964 1965 if (!ctrl->nr_streams) 1966 return 0; 1967 1968 ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); 1969 if (ret) 1970 return ret; 1971 1972 ns->sws = le32_to_cpu(s.sws); 1973 ns->sgs = le16_to_cpu(s.sgs); 1974 1975 if (ns->sws) { 1976 *phys_bs = ns->sws * (1 << ns->lba_shift); 1977 if (ns->sgs) 1978 *io_opt = *phys_bs * ns->sgs; 1979 } 1980 1981 return 0; 1982 } 1983 1984 static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) 1985 { 1986 struct nvme_ctrl *ctrl = ns->ctrl; 1987 1988 /* 1989 * The PI implementation requires the metadata size to be equal to the 1990 * t10 pi tuple size. 1991 */ 1992 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); 1993 if (ns->ms == sizeof(struct t10_pi_tuple)) 1994 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; 1995 else 1996 ns->pi_type = 0; 1997 1998 ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); 1999 if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) 2000 return 0; 2001 if (ctrl->ops->flags & NVME_F_FABRICS) { 2002 /* 2003 * The NVMe over Fabrics specification only supports metadata as 2004 * part of the extended data LBA. We rely on HCA/HBA support to 2005 * remap the separate metadata buffer from the block layer. 
2006 */ 2007 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) 2008 return -EINVAL; 2009 if (ctrl->max_integrity_segments) 2010 ns->features |= 2011 (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); 2012 } else { 2013 /* 2014 * For PCIe controllers, we can't easily remap the separate 2015 * metadata buffer from the block layer and thus require a 2016 * separate metadata buffer for block layer metadata/PI support. 2017 * We allow extended LBAs for the passthrough interface, though. 2018 */ 2019 if (id->flbas & NVME_NS_FLBAS_META_EXT) 2020 ns->features |= NVME_NS_EXT_LBAS; 2021 else 2022 ns->features |= NVME_NS_METADATA_SUPPORTED; 2023 } 2024 2025 return 0; 2026 } 2027 2028 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, 2029 struct request_queue *q) 2030 { 2031 bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; 2032 2033 if (ctrl->max_hw_sectors) { 2034 u32 max_segments = 2035 (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; 2036 2037 max_segments = min_not_zero(max_segments, ctrl->max_segments); 2038 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); 2039 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); 2040 } 2041 blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); 2042 blk_queue_dma_alignment(q, 7); 2043 blk_queue_write_cache(q, vwc, vwc); 2044 } 2045 2046 static void nvme_update_disk_info(struct gendisk *disk, 2047 struct nvme_ns *ns, struct nvme_id_ns *id) 2048 { 2049 sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); 2050 unsigned short bs = 1 << ns->lba_shift; 2051 u32 atomic_bs, phys_bs, io_opt = 0; 2052 2053 /* 2054 * The block layer can't support LBA sizes larger than the page size 2055 * yet, so catch this early and don't allow block I/O. 2056 */ 2057 if (ns->lba_shift > PAGE_SHIFT) { 2058 capacity = 0; 2059 bs = (1 << 9); 2060 } 2061 2062 blk_integrity_unregister(disk); 2063 2064 atomic_bs = phys_bs = bs; 2065 nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt); 2066 if (id->nabo == 0) { 2067 /* 2068 * Bit 1 indicates whether NAWUPF is defined for this namespace 2069 * and whether it should be used instead of AWUPF. If NAWUPF == 2070 * 0 then AWUPF must be used instead. 2071 */ 2072 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) 2073 atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; 2074 else 2075 atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; 2076 } 2077 2078 if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { 2079 /* NPWG = Namespace Preferred Write Granularity */ 2080 phys_bs = bs * (1 + le16_to_cpu(id->npwg)); 2081 /* NOWS = Namespace Optimal Write Size */ 2082 io_opt = bs * (1 + le16_to_cpu(id->nows)); 2083 } 2084 2085 blk_queue_logical_block_size(disk->queue, bs); 2086 /* 2087 * Linux filesystems assume writing a single physical block is 2088 * an atomic operation. Hence limit the physical block size to the 2089 * value of the Atomic Write Unit Power Fail parameter. 2090 */ 2091 blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); 2092 blk_queue_io_min(disk->queue, phys_bs); 2093 blk_queue_io_opt(disk->queue, io_opt); 2094 2095 /* 2096 * Register a metadata profile for PI, or the plain non-integrity NVMe 2097 * metadata masquerading as Type 0 if supported, otherwise reject block 2098 * I/O to namespaces with metadata except when the namespace supports 2099 * PI, as it can strip/insert in that case. 
2100 */ 2101 if (ns->ms) { 2102 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && 2103 (ns->features & NVME_NS_METADATA_SUPPORTED)) 2104 nvme_init_integrity(disk, ns->ms, ns->pi_type, 2105 ns->ctrl->max_integrity_segments); 2106 else if (!nvme_ns_has_pi(ns)) 2107 capacity = 0; 2108 } 2109 2110 set_capacity_and_notify(disk, capacity); 2111 2112 nvme_config_discard(disk, ns); 2113 nvme_config_write_zeroes(disk, ns); 2114 2115 if ((id->nsattr & NVME_NS_ATTR_RO) || 2116 test_bit(NVME_NS_FORCE_RO, &ns->flags)) 2117 set_disk_ro(disk, true); 2118 } 2119 2120 static inline bool nvme_first_scan(struct gendisk *disk) 2121 { 2122 /* nvme_alloc_ns() scans the disk prior to adding it */ 2123 return !(disk->flags & GENHD_FL_UP); 2124 } 2125 2126 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) 2127 { 2128 struct nvme_ctrl *ctrl = ns->ctrl; 2129 u32 iob; 2130 2131 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && 2132 is_power_of_2(ctrl->max_hw_sectors)) 2133 iob = ctrl->max_hw_sectors; 2134 else 2135 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); 2136 2137 if (!iob) 2138 return; 2139 2140 if (!is_power_of_2(iob)) { 2141 if (nvme_first_scan(ns->disk)) 2142 pr_warn("%s: ignoring unaligned IO boundary:%u\n", 2143 ns->disk->disk_name, iob); 2144 return; 2145 } 2146 2147 if (blk_queue_is_zoned(ns->disk->queue)) { 2148 if (nvme_first_scan(ns->disk)) 2149 pr_warn("%s: ignoring zoned namespace IO boundary\n", 2150 ns->disk->disk_name); 2151 return; 2152 } 2153 2154 blk_queue_chunk_sectors(ns->queue, iob); 2155 } 2156 2157 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) 2158 { 2159 unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; 2160 int ret; 2161 2162 blk_mq_freeze_queue(ns->disk->queue); 2163 ns->lba_shift = id->lbaf[lbaf].ds; 2164 nvme_set_queue_limits(ns->ctrl, ns->queue); 2165 2166 if (ns->head->ids.csi == NVME_CSI_ZNS) { 2167 ret = nvme_update_zone_info(ns, lbaf); 2168 if (ret) 2169 goto out_unfreeze; 2170 } 2171 2172 ret = nvme_configure_metadata(ns, id); 2173 if (ret) 2174 goto out_unfreeze; 2175 nvme_set_chunk_sectors(ns, id); 2176 nvme_update_disk_info(ns->disk, ns, id); 2177 blk_mq_unfreeze_queue(ns->disk->queue); 2178 2179 if (blk_queue_is_zoned(ns->queue)) { 2180 ret = nvme_revalidate_zones(ns); 2181 if (ret && !nvme_first_scan(ns->disk)) 2182 return ret; 2183 } 2184 2185 #ifdef CONFIG_NVME_MULTIPATH 2186 if (ns->head->disk) { 2187 blk_mq_freeze_queue(ns->head->disk->queue); 2188 nvme_update_disk_info(ns->head->disk, ns, id); 2189 blk_stack_limits(&ns->head->disk->queue->limits, 2190 &ns->queue->limits, 0); 2191 blk_queue_update_readahead(ns->head->disk->queue); 2192 blk_mq_unfreeze_queue(ns->head->disk->queue); 2193 } 2194 #endif 2195 return 0; 2196 2197 out_unfreeze: 2198 blk_mq_unfreeze_queue(ns->disk->queue); 2199 return ret; 2200 } 2201 2202 static char nvme_pr_type(enum pr_type type) 2203 { 2204 switch (type) { 2205 case PR_WRITE_EXCLUSIVE: 2206 return 1; 2207 case PR_EXCLUSIVE_ACCESS: 2208 return 2; 2209 case PR_WRITE_EXCLUSIVE_REG_ONLY: 2210 return 3; 2211 case PR_EXCLUSIVE_ACCESS_REG_ONLY: 2212 return 4; 2213 case PR_WRITE_EXCLUSIVE_ALL_REGS: 2214 return 5; 2215 case PR_EXCLUSIVE_ACCESS_ALL_REGS: 2216 return 6; 2217 default: 2218 return 0; 2219 } 2220 }; 2221 2222 static int nvme_pr_command(struct block_device *bdev, u32 cdw10, 2223 u64 key, u64 sa_key, u8 op) 2224 { 2225 struct nvme_ns_head *head = NULL; 2226 struct nvme_ns *ns; 2227 struct nvme_command c; 2228 int srcu_idx, ret; 2229 u8 data[16] = { 0, }; 2230 2231 ns = 
nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 2232 if (unlikely(!ns)) 2233 return -EWOULDBLOCK; 2234 2235 put_unaligned_le64(key, &data[0]); 2236 put_unaligned_le64(sa_key, &data[8]); 2237 2238 memset(&c, 0, sizeof(c)); 2239 c.common.opcode = op; 2240 c.common.nsid = cpu_to_le32(ns->head->ns_id); 2241 c.common.cdw10 = cpu_to_le32(cdw10); 2242 2243 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); 2244 nvme_put_ns_from_disk(head, srcu_idx); 2245 return ret; 2246 } 2247 2248 static int nvme_pr_register(struct block_device *bdev, u64 old, 2249 u64 new, unsigned flags) 2250 { 2251 u32 cdw10; 2252 2253 if (flags & ~PR_FL_IGNORE_KEY) 2254 return -EOPNOTSUPP; 2255 2256 cdw10 = old ? 2 : 0; 2257 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; 2258 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ 2259 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); 2260 } 2261 2262 static int nvme_pr_reserve(struct block_device *bdev, u64 key, 2263 enum pr_type type, unsigned flags) 2264 { 2265 u32 cdw10; 2266 2267 if (flags & ~PR_FL_IGNORE_KEY) 2268 return -EOPNOTSUPP; 2269 2270 cdw10 = nvme_pr_type(type) << 8; 2271 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); 2272 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); 2273 } 2274 2275 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, 2276 enum pr_type type, bool abort) 2277 { 2278 u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); 2279 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); 2280 } 2281 2282 static int nvme_pr_clear(struct block_device *bdev, u64 key) 2283 { 2284 u32 cdw10 = 1 | (key ? 1 << 3 : 0); 2285 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); 2286 } 2287 2288 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 2289 { 2290 u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 
1 << 3 : 0); 2291 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 2292 } 2293 2294 static const struct pr_ops nvme_pr_ops = { 2295 .pr_register = nvme_pr_register, 2296 .pr_reserve = nvme_pr_reserve, 2297 .pr_release = nvme_pr_release, 2298 .pr_preempt = nvme_pr_preempt, 2299 .pr_clear = nvme_pr_clear, 2300 }; 2301 2302 #ifdef CONFIG_BLK_SED_OPAL 2303 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, 2304 bool send) 2305 { 2306 struct nvme_ctrl *ctrl = data; 2307 struct nvme_command cmd; 2308 2309 memset(&cmd, 0, sizeof(cmd)); 2310 if (send) 2311 cmd.common.opcode = nvme_admin_security_send; 2312 else 2313 cmd.common.opcode = nvme_admin_security_recv; 2314 cmd.common.nsid = 0; 2315 cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); 2316 cmd.common.cdw11 = cpu_to_le32(len); 2317 2318 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, 2319 NVME_QID_ANY, 1, 0, false); 2320 } 2321 EXPORT_SYMBOL_GPL(nvme_sec_submit); 2322 #endif /* CONFIG_BLK_SED_OPAL */ 2323 2324 static const struct block_device_operations nvme_bdev_ops = { 2325 .owner = THIS_MODULE, 2326 .ioctl = nvme_ioctl, 2327 .compat_ioctl = nvme_compat_ioctl, 2328 .open = nvme_open, 2329 .release = nvme_release, 2330 .getgeo = nvme_getgeo, 2331 .report_zones = nvme_report_zones, 2332 .pr_ops = &nvme_pr_ops, 2333 }; 2334 2335 #ifdef CONFIG_NVME_MULTIPATH 2336 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) 2337 { 2338 struct nvme_ns_head *head = bdev->bd_disk->private_data; 2339 2340 if (!kref_get_unless_zero(&head->ref)) 2341 return -ENXIO; 2342 return 0; 2343 } 2344 2345 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) 2346 { 2347 nvme_put_ns_head(disk->private_data); 2348 } 2349 2350 const struct block_device_operations nvme_ns_head_ops = { 2351 .owner = THIS_MODULE, 2352 .submit_bio = nvme_ns_head_submit_bio, 2353 .open = nvme_ns_head_open, 2354 .release = nvme_ns_head_release, 2355 .ioctl = nvme_ioctl, 2356 .compat_ioctl = nvme_compat_ioctl, 2357 .getgeo = nvme_getgeo, 2358 .report_zones = nvme_report_zones, 2359 .pr_ops = &nvme_pr_ops, 2360 }; 2361 #endif /* CONFIG_NVME_MULTIPATH */ 2362 2363 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) 2364 { 2365 unsigned long timeout = 2366 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 2367 u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; 2368 int ret; 2369 2370 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 2371 if (csts == ~0) 2372 return -ENODEV; 2373 if ((csts & NVME_CSTS_RDY) == bit) 2374 break; 2375 2376 usleep_range(1000, 2000); 2377 if (fatal_signal_pending(current)) 2378 return -EINTR; 2379 if (time_after(jiffies, timeout)) { 2380 dev_err(ctrl->device, 2381 "Device not ready; aborting %s, CSTS=0x%x\n", 2382 enabled ? "initialisation" : "reset", csts); 2383 return -ENODEV; 2384 } 2385 } 2386 2387 return ret; 2388 } 2389 2390 /* 2391 * If the device has been passed off to us in an enabled state, just clear 2392 * the enabled bit. The spec says we should set the 'shutdown notification 2393 * bits', but doing so may cause the device to complete commands to the 2394 * admin queue ... and we don't know what memory that might be pointing at! 
2395 */ 2396 int nvme_disable_ctrl(struct nvme_ctrl *ctrl) 2397 { 2398 int ret; 2399 2400 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 2401 ctrl->ctrl_config &= ~NVME_CC_ENABLE; 2402 2403 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2404 if (ret) 2405 return ret; 2406 2407 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) 2408 msleep(NVME_QUIRK_DELAY_AMOUNT); 2409 2410 return nvme_wait_ready(ctrl, ctrl->cap, false); 2411 } 2412 EXPORT_SYMBOL_GPL(nvme_disable_ctrl); 2413 2414 int nvme_enable_ctrl(struct nvme_ctrl *ctrl) 2415 { 2416 unsigned dev_page_min; 2417 int ret; 2418 2419 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); 2420 if (ret) { 2421 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); 2422 return ret; 2423 } 2424 dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; 2425 2426 if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { 2427 dev_err(ctrl->device, 2428 "Minimum device page size %u too large for host (%u)\n", 2429 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); 2430 return -ENODEV; 2431 } 2432 2433 if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) 2434 ctrl->ctrl_config = NVME_CC_CSS_CSI; 2435 else 2436 ctrl->ctrl_config = NVME_CC_CSS_NVM; 2437 ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; 2438 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; 2439 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 2440 ctrl->ctrl_config |= NVME_CC_ENABLE; 2441 2442 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2443 if (ret) 2444 return ret; 2445 return nvme_wait_ready(ctrl, ctrl->cap, true); 2446 } 2447 EXPORT_SYMBOL_GPL(nvme_enable_ctrl); 2448 2449 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) 2450 { 2451 unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); 2452 u32 csts; 2453 int ret; 2454 2455 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 2456 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; 2457 2458 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2459 if (ret) 2460 return ret; 2461 2462 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 2463 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) 2464 break; 2465 2466 msleep(100); 2467 if (fatal_signal_pending(current)) 2468 return -EINTR; 2469 if (time_after(jiffies, timeout)) { 2470 dev_err(ctrl->device, 2471 "Device shutdown incomplete; abort shutdown\n"); 2472 return -ENODEV; 2473 } 2474 } 2475 2476 return ret; 2477 } 2478 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); 2479 2480 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) 2481 { 2482 __le64 ts; 2483 int ret; 2484 2485 if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) 2486 return 0; 2487 2488 ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); 2489 ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), 2490 NULL); 2491 if (ret) 2492 dev_warn_once(ctrl->device, 2493 "could not set timestamp (%d)\n", ret); 2494 return ret; 2495 } 2496 2497 static int nvme_configure_acre(struct nvme_ctrl *ctrl) 2498 { 2499 struct nvme_feat_host_behavior *host; 2500 int ret; 2501 2502 /* Don't bother enabling the feature if retry delay is not reported */ 2503 if (!ctrl->crdt[0]) 2504 return 0; 2505 2506 host = kzalloc(sizeof(*host), GFP_KERNEL); 2507 if (!host) 2508 return 0; 2509 2510 host->acre = NVME_ENABLE_ACRE; 2511 ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, 2512 host, sizeof(*host), NULL); 2513 kfree(host); 2514 return ret; 2515 } 2516 2517 static int nvme_configure_apst(struct nvme_ctrl *ctrl) 2518 { 2519 /* 2520 * APST (Autonomous Power State 
Transition) lets us program a 2521 * table of power state transitions that the controller will 2522 * perform automatically. We configure it with a simple 2523 * heuristic: we are willing to spend at most 2% of the time 2524 * transitioning between power states. Therefore, when running 2525 * in any given state, we will enter the next lower-power 2526 * non-operational state after waiting 50 * (enlat + exlat) 2527 * microseconds, as long as that state's exit latency is under 2528 * the requested maximum latency. 2529 * 2530 * We will not autonomously enter any non-operational state for 2531 * which the total latency exceeds ps_max_latency_us. Users 2532 * can set ps_max_latency_us to zero to turn off APST. 2533 */ 2534 2535 unsigned apste; 2536 struct nvme_feat_auto_pst *table; 2537 u64 max_lat_us = 0; 2538 int max_ps = -1; 2539 int ret; 2540 2541 /* 2542 * If APST isn't supported or if we haven't been initialized yet, 2543 * then don't do anything. 2544 */ 2545 if (!ctrl->apsta) 2546 return 0; 2547 2548 if (ctrl->npss > 31) { 2549 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); 2550 return 0; 2551 } 2552 2553 table = kzalloc(sizeof(*table), GFP_KERNEL); 2554 if (!table) 2555 return 0; 2556 2557 if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { 2558 /* Turn off APST. */ 2559 apste = 0; 2560 dev_dbg(ctrl->device, "APST disabled\n"); 2561 } else { 2562 __le64 target = cpu_to_le64(0); 2563 int state; 2564 2565 /* 2566 * Walk through all states from lowest- to highest-power. 2567 * According to the spec, lower-numbered states use more 2568 * power. NPSS, despite the name, is the index of the 2569 * lowest-power state, not the number of states. 2570 */ 2571 for (state = (int)ctrl->npss; state >= 0; state--) { 2572 u64 total_latency_us, exit_latency_us, transition_ms; 2573 2574 if (target) 2575 table->entries[state] = target; 2576 2577 /* 2578 * Don't allow transitions to the deepest state 2579 * if it's quirked off. 2580 */ 2581 if (state == ctrl->npss && 2582 (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) 2583 continue; 2584 2585 /* 2586 * Is this state a useful non-operational state for 2587 * higher-power states to autonomously transition to? 2588 */ 2589 if (!(ctrl->psd[state].flags & 2590 NVME_PS_FLAGS_NON_OP_STATE)) 2591 continue; 2592 2593 exit_latency_us = 2594 (u64)le32_to_cpu(ctrl->psd[state].exit_lat); 2595 if (exit_latency_us > ctrl->ps_max_latency_us) 2596 continue; 2597 2598 total_latency_us = 2599 exit_latency_us + 2600 le32_to_cpu(ctrl->psd[state].entry_lat); 2601 2602 /* 2603 * This state is good. Use it as the APST idle 2604 * target for higher power states. 
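			 *
			 * Expressed in milliseconds, the 50 * (enlat + exlat)
			 * microsecond target described above is
			 * total_latency_us / 20 (50 us == 1/20 ms); the
			 * rounded-up division below computes exactly that
			 * before clamping the result to the 24-bit transition
			 * time field of the table entry.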
			 */
			transition_ms = total_latency_us + 19;
			do_div(transition_ms, 20);
			if (transition_ms > (1 << 24) - 1)
				transition_ms = (1 << 24) - 1;

			target = cpu_to_le64((state << 3) |
					     (transition_ms << 8));

			if (max_ps == -1)
				max_ps = state;

			if (total_latency_us > max_lat_us)
				max_lat_us = total_latency_us;
		}

		apste = 1;

		if (max_ps == -1) {
			dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
		} else {
			dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
				max_ps, max_lat_us, (int)sizeof(*table), table);
		}
	}

	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
				table, sizeof(*table), NULL);
	if (ret)
		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);

	kfree(table);
	return ret;
}

static void nvme_set_latency_tolerance(struct device *dev, s32 val)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	u64 latency;

	switch (val) {
	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
	case PM_QOS_LATENCY_ANY:
		latency = U64_MAX;
		break;

	default:
		latency = val;
	}

	if (ctrl->ps_max_latency_us != latency) {
		ctrl->ps_max_latency_us = latency;
		nvme_configure_apst(ctrl);
	}
}

struct nvme_core_quirk_entry {
	/*
	 * NVMe model and firmware strings are padded with spaces.  For
	 * simplicity, strings in the quirk table are padded with NULLs
	 * instead.
	 */
	u16 vid;
	const char *mn;
	const char *fr;
	unsigned long quirks;
};

static const struct nvme_core_quirk_entry core_quirks[] = {
	{
		/*
		 * This Toshiba device seems to die using any APST states.  See:
		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
		 */
		.vid = 0x1179,
		.mn = "THNSF5256GPUK TOSHIBA",
		.quirks = NVME_QUIRK_NO_APST,
	},
	{
		/*
		 * This LiteON CL1-3D*-Q11 firmware version has a race
		 * condition associated with actions related to suspend to
		 * idle.  LiteON has resolved the problem in future firmware.
		 */
		.vid = 0x14a4,
		.fr = "22301111",
		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
	}
};

/* match is null-terminated but idstr is space-padded.
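 * For example, a match of "foo" accepts an idstr of "foo   " (space padded)
 * and rejects "foobar".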
*/ 2696 static bool string_matches(const char *idstr, const char *match, size_t len) 2697 { 2698 size_t matchlen; 2699 2700 if (!match) 2701 return true; 2702 2703 matchlen = strlen(match); 2704 WARN_ON_ONCE(matchlen > len); 2705 2706 if (memcmp(idstr, match, matchlen)) 2707 return false; 2708 2709 for (; matchlen < len; matchlen++) 2710 if (idstr[matchlen] != ' ') 2711 return false; 2712 2713 return true; 2714 } 2715 2716 static bool quirk_matches(const struct nvme_id_ctrl *id, 2717 const struct nvme_core_quirk_entry *q) 2718 { 2719 return q->vid == le16_to_cpu(id->vid) && 2720 string_matches(id->mn, q->mn, sizeof(id->mn)) && 2721 string_matches(id->fr, q->fr, sizeof(id->fr)); 2722 } 2723 2724 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, 2725 struct nvme_id_ctrl *id) 2726 { 2727 size_t nqnlen; 2728 int off; 2729 2730 if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { 2731 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); 2732 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { 2733 strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); 2734 return; 2735 } 2736 2737 if (ctrl->vs >= NVME_VS(1, 2, 1)) 2738 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); 2739 } 2740 2741 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ 2742 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, 2743 "nqn.2014.08.org.nvmexpress:%04x%04x", 2744 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); 2745 memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); 2746 off += sizeof(id->sn); 2747 memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); 2748 off += sizeof(id->mn); 2749 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); 2750 } 2751 2752 static void nvme_release_subsystem(struct device *dev) 2753 { 2754 struct nvme_subsystem *subsys = 2755 container_of(dev, struct nvme_subsystem, dev); 2756 2757 if (subsys->instance >= 0) 2758 ida_simple_remove(&nvme_instance_ida, subsys->instance); 2759 kfree(subsys); 2760 } 2761 2762 static void nvme_destroy_subsystem(struct kref *ref) 2763 { 2764 struct nvme_subsystem *subsys = 2765 container_of(ref, struct nvme_subsystem, ref); 2766 2767 mutex_lock(&nvme_subsystems_lock); 2768 list_del(&subsys->entry); 2769 mutex_unlock(&nvme_subsystems_lock); 2770 2771 ida_destroy(&subsys->ns_ida); 2772 device_del(&subsys->dev); 2773 put_device(&subsys->dev); 2774 } 2775 2776 static void nvme_put_subsystem(struct nvme_subsystem *subsys) 2777 { 2778 kref_put(&subsys->ref, nvme_destroy_subsystem); 2779 } 2780 2781 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) 2782 { 2783 struct nvme_subsystem *subsys; 2784 2785 lockdep_assert_held(&nvme_subsystems_lock); 2786 2787 /* 2788 * Fail matches for discovery subsystems. This results 2789 * in each discovery controller bound to a unique subsystem. 2790 * This avoids issues with validating controller values 2791 * that can only be true when there is a single unique subsystem. 2792 * There may be multiple and completely independent entities 2793 * that provide discovery controllers. 
2794 */ 2795 if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) 2796 return NULL; 2797 2798 list_for_each_entry(subsys, &nvme_subsystems, entry) { 2799 if (strcmp(subsys->subnqn, subsysnqn)) 2800 continue; 2801 if (!kref_get_unless_zero(&subsys->ref)) 2802 continue; 2803 return subsys; 2804 } 2805 2806 return NULL; 2807 } 2808 2809 #define SUBSYS_ATTR_RO(_name, _mode, _show) \ 2810 struct device_attribute subsys_attr_##_name = \ 2811 __ATTR(_name, _mode, _show, NULL) 2812 2813 static ssize_t nvme_subsys_show_nqn(struct device *dev, 2814 struct device_attribute *attr, 2815 char *buf) 2816 { 2817 struct nvme_subsystem *subsys = 2818 container_of(dev, struct nvme_subsystem, dev); 2819 2820 return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); 2821 } 2822 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); 2823 2824 #define nvme_subsys_show_str_function(field) \ 2825 static ssize_t subsys_##field##_show(struct device *dev, \ 2826 struct device_attribute *attr, char *buf) \ 2827 { \ 2828 struct nvme_subsystem *subsys = \ 2829 container_of(dev, struct nvme_subsystem, dev); \ 2830 return sprintf(buf, "%.*s\n", \ 2831 (int)sizeof(subsys->field), subsys->field); \ 2832 } \ 2833 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); 2834 2835 nvme_subsys_show_str_function(model); 2836 nvme_subsys_show_str_function(serial); 2837 nvme_subsys_show_str_function(firmware_rev); 2838 2839 static struct attribute *nvme_subsys_attrs[] = { 2840 &subsys_attr_model.attr, 2841 &subsys_attr_serial.attr, 2842 &subsys_attr_firmware_rev.attr, 2843 &subsys_attr_subsysnqn.attr, 2844 #ifdef CONFIG_NVME_MULTIPATH 2845 &subsys_attr_iopolicy.attr, 2846 #endif 2847 NULL, 2848 }; 2849 2850 static struct attribute_group nvme_subsys_attrs_group = { 2851 .attrs = nvme_subsys_attrs, 2852 }; 2853 2854 static const struct attribute_group *nvme_subsys_attrs_groups[] = { 2855 &nvme_subsys_attrs_group, 2856 NULL, 2857 }; 2858 2859 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, 2860 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2861 { 2862 struct nvme_ctrl *tmp; 2863 2864 lockdep_assert_held(&nvme_subsystems_lock); 2865 2866 list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { 2867 if (nvme_state_terminal(tmp)) 2868 continue; 2869 2870 if (tmp->cntlid == ctrl->cntlid) { 2871 dev_err(ctrl->device, 2872 "Duplicate cntlid %u with %s, rejecting\n", 2873 ctrl->cntlid, dev_name(tmp->device)); 2874 return false; 2875 } 2876 2877 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || 2878 (ctrl->opts && ctrl->opts->discovery_nqn)) 2879 continue; 2880 2881 dev_err(ctrl->device, 2882 "Subsystem does not support multiple controllers\n"); 2883 return false; 2884 } 2885 2886 return true; 2887 } 2888 2889 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2890 { 2891 struct nvme_subsystem *subsys, *found; 2892 int ret; 2893 2894 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); 2895 if (!subsys) 2896 return -ENOMEM; 2897 2898 subsys->instance = -1; 2899 mutex_init(&subsys->lock); 2900 kref_init(&subsys->ref); 2901 INIT_LIST_HEAD(&subsys->ctrls); 2902 INIT_LIST_HEAD(&subsys->nsheads); 2903 nvme_init_subnqn(subsys, ctrl, id); 2904 memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); 2905 memcpy(subsys->model, id->mn, sizeof(subsys->model)); 2906 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); 2907 subsys->vendor_id = le16_to_cpu(id->vid); 2908 subsys->cmic = id->cmic; 2909 subsys->awupf = le16_to_cpu(id->awupf); 2910 #ifdef CONFIG_NVME_MULTIPATH 2911 subsys->iopolicy 
= NVME_IOPOLICY_NUMA; 2912 #endif 2913 2914 subsys->dev.class = nvme_subsys_class; 2915 subsys->dev.release = nvme_release_subsystem; 2916 subsys->dev.groups = nvme_subsys_attrs_groups; 2917 dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); 2918 device_initialize(&subsys->dev); 2919 2920 mutex_lock(&nvme_subsystems_lock); 2921 found = __nvme_find_get_subsystem(subsys->subnqn); 2922 if (found) { 2923 put_device(&subsys->dev); 2924 subsys = found; 2925 2926 if (!nvme_validate_cntlid(subsys, ctrl, id)) { 2927 ret = -EINVAL; 2928 goto out_put_subsystem; 2929 } 2930 } else { 2931 ret = device_add(&subsys->dev); 2932 if (ret) { 2933 dev_err(ctrl->device, 2934 "failed to register subsystem device.\n"); 2935 put_device(&subsys->dev); 2936 goto out_unlock; 2937 } 2938 ida_init(&subsys->ns_ida); 2939 list_add_tail(&subsys->entry, &nvme_subsystems); 2940 } 2941 2942 ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, 2943 dev_name(ctrl->device)); 2944 if (ret) { 2945 dev_err(ctrl->device, 2946 "failed to create sysfs link from subsystem.\n"); 2947 goto out_put_subsystem; 2948 } 2949 2950 if (!found) 2951 subsys->instance = ctrl->instance; 2952 ctrl->subsys = subsys; 2953 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); 2954 mutex_unlock(&nvme_subsystems_lock); 2955 return 0; 2956 2957 out_put_subsystem: 2958 nvme_put_subsystem(subsys); 2959 out_unlock: 2960 mutex_unlock(&nvme_subsystems_lock); 2961 return ret; 2962 } 2963 2964 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, 2965 void *log, size_t size, u64 offset) 2966 { 2967 struct nvme_command c = { }; 2968 u32 dwlen = nvme_bytes_to_numd(size); 2969 2970 c.get_log_page.opcode = nvme_admin_get_log_page; 2971 c.get_log_page.nsid = cpu_to_le32(nsid); 2972 c.get_log_page.lid = log_page; 2973 c.get_log_page.lsp = lsp; 2974 c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); 2975 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); 2976 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); 2977 c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); 2978 c.get_log_page.csi = csi; 2979 2980 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); 2981 } 2982 2983 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, 2984 struct nvme_effects_log **log) 2985 { 2986 struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi); 2987 int ret; 2988 2989 if (cel) 2990 goto out; 2991 2992 cel = kzalloc(sizeof(*cel), GFP_KERNEL); 2993 if (!cel) 2994 return -ENOMEM; 2995 2996 ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi, 2997 cel, sizeof(*cel), 0); 2998 if (ret) { 2999 kfree(cel); 3000 return ret; 3001 } 3002 3003 xa_store(&ctrl->cels, csi, cel, GFP_KERNEL); 3004 out: 3005 *log = cel; 3006 return 0; 3007 } 3008 3009 /* 3010 * Initialize the cached copies of the Identify data and various controller 3011 * register in our nvme_ctrl structure. This should be called as soon as 3012 * the admin queue is fully up and running. 
3013 */ 3014 int nvme_init_identify(struct nvme_ctrl *ctrl) 3015 { 3016 struct nvme_id_ctrl *id; 3017 int ret, page_shift; 3018 u32 max_hw_sectors; 3019 bool prev_apst_enabled; 3020 3021 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); 3022 if (ret) { 3023 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); 3024 return ret; 3025 } 3026 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; 3027 ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); 3028 3029 if (ctrl->vs >= NVME_VS(1, 1, 0)) 3030 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); 3031 3032 ret = nvme_identify_ctrl(ctrl, &id); 3033 if (ret) { 3034 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); 3035 return -EIO; 3036 } 3037 3038 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { 3039 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); 3040 if (ret < 0) 3041 goto out_free; 3042 } 3043 3044 if (!(ctrl->ops->flags & NVME_F_FABRICS)) 3045 ctrl->cntlid = le16_to_cpu(id->cntlid); 3046 3047 if (!ctrl->identified) { 3048 int i; 3049 3050 ret = nvme_init_subsystem(ctrl, id); 3051 if (ret) 3052 goto out_free; 3053 3054 /* 3055 * Check for quirks. Quirk can depend on firmware version, 3056 * so, in principle, the set of quirks present can change 3057 * across a reset. As a possible future enhancement, we 3058 * could re-scan for quirks every time we reinitialize 3059 * the device, but we'd have to make sure that the driver 3060 * behaves intelligently if the quirks change. 3061 */ 3062 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { 3063 if (quirk_matches(id, &core_quirks[i])) 3064 ctrl->quirks |= core_quirks[i].quirks; 3065 } 3066 } 3067 3068 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { 3069 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); 3070 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; 3071 } 3072 3073 ctrl->crdt[0] = le16_to_cpu(id->crdt1); 3074 ctrl->crdt[1] = le16_to_cpu(id->crdt2); 3075 ctrl->crdt[2] = le16_to_cpu(id->crdt3); 3076 3077 ctrl->oacs = le16_to_cpu(id->oacs); 3078 ctrl->oncs = le16_to_cpu(id->oncs); 3079 ctrl->mtfa = le16_to_cpu(id->mtfa); 3080 ctrl->oaes = le32_to_cpu(id->oaes); 3081 ctrl->wctemp = le16_to_cpu(id->wctemp); 3082 ctrl->cctemp = le16_to_cpu(id->cctemp); 3083 3084 atomic_set(&ctrl->abort_limit, id->acl + 1); 3085 ctrl->vwc = id->vwc; 3086 if (id->mdts) 3087 max_hw_sectors = 1 << (id->mdts + page_shift - 9); 3088 else 3089 max_hw_sectors = UINT_MAX; 3090 ctrl->max_hw_sectors = 3091 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); 3092 3093 nvme_set_queue_limits(ctrl, ctrl->admin_q); 3094 ctrl->sgls = le32_to_cpu(id->sgls); 3095 ctrl->kas = le16_to_cpu(id->kas); 3096 ctrl->max_namespaces = le32_to_cpu(id->mnan); 3097 ctrl->ctratt = le32_to_cpu(id->ctratt); 3098 3099 if (id->rtd3e) { 3100 /* us -> s */ 3101 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; 3102 3103 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, 3104 shutdown_timeout, 60); 3105 3106 if (ctrl->shutdown_timeout != shutdown_timeout) 3107 dev_info(ctrl->device, 3108 "Shutdown timeout set to %u seconds\n", 3109 ctrl->shutdown_timeout); 3110 } else 3111 ctrl->shutdown_timeout = shutdown_timeout; 3112 3113 ctrl->npss = id->npss; 3114 ctrl->apsta = id->apsta; 3115 prev_apst_enabled = ctrl->apst_enabled; 3116 if (ctrl->quirks & NVME_QUIRK_NO_APST) { 3117 if (force_apst && id->apsta) { 3118 dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); 3119 
ctrl->apst_enabled = true; 3120 } else { 3121 ctrl->apst_enabled = false; 3122 } 3123 } else { 3124 ctrl->apst_enabled = id->apsta; 3125 } 3126 memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); 3127 3128 if (ctrl->ops->flags & NVME_F_FABRICS) { 3129 ctrl->icdoff = le16_to_cpu(id->icdoff); 3130 ctrl->ioccsz = le32_to_cpu(id->ioccsz); 3131 ctrl->iorcsz = le32_to_cpu(id->iorcsz); 3132 ctrl->maxcmd = le16_to_cpu(id->maxcmd); 3133 3134 /* 3135 * In fabrics we need to verify the cntlid matches the 3136 * admin connect 3137 */ 3138 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { 3139 dev_err(ctrl->device, 3140 "Mismatching cntlid: Connect %u vs Identify " 3141 "%u, rejecting\n", 3142 ctrl->cntlid, le16_to_cpu(id->cntlid)); 3143 ret = -EINVAL; 3144 goto out_free; 3145 } 3146 3147 if (!ctrl->opts->discovery_nqn && !ctrl->kas) { 3148 dev_err(ctrl->device, 3149 "keep-alive support is mandatory for fabrics\n"); 3150 ret = -EINVAL; 3151 goto out_free; 3152 } 3153 } else { 3154 ctrl->hmpre = le32_to_cpu(id->hmpre); 3155 ctrl->hmmin = le32_to_cpu(id->hmmin); 3156 ctrl->hmminds = le32_to_cpu(id->hmminds); 3157 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); 3158 } 3159 3160 ret = nvme_mpath_init(ctrl, id); 3161 kfree(id); 3162 3163 if (ret < 0) 3164 return ret; 3165 3166 if (ctrl->apst_enabled && !prev_apst_enabled) 3167 dev_pm_qos_expose_latency_tolerance(ctrl->device); 3168 else if (!ctrl->apst_enabled && prev_apst_enabled) 3169 dev_pm_qos_hide_latency_tolerance(ctrl->device); 3170 3171 ret = nvme_configure_apst(ctrl); 3172 if (ret < 0) 3173 return ret; 3174 3175 ret = nvme_configure_timestamp(ctrl); 3176 if (ret < 0) 3177 return ret; 3178 3179 ret = nvme_configure_directives(ctrl); 3180 if (ret < 0) 3181 return ret; 3182 3183 ret = nvme_configure_acre(ctrl); 3184 if (ret < 0) 3185 return ret; 3186 3187 if (!ctrl->identified) { 3188 ret = nvme_hwmon_init(ctrl); 3189 if (ret < 0) 3190 return ret; 3191 } 3192 3193 ctrl->identified = true; 3194 3195 return 0; 3196 3197 out_free: 3198 kfree(id); 3199 return ret; 3200 } 3201 EXPORT_SYMBOL_GPL(nvme_init_identify); 3202 3203 static int nvme_dev_open(struct inode *inode, struct file *file) 3204 { 3205 struct nvme_ctrl *ctrl = 3206 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 3207 3208 switch (ctrl->state) { 3209 case NVME_CTRL_LIVE: 3210 break; 3211 default: 3212 return -EWOULDBLOCK; 3213 } 3214 3215 nvme_get_ctrl(ctrl); 3216 if (!try_module_get(ctrl->ops->module)) { 3217 nvme_put_ctrl(ctrl); 3218 return -EINVAL; 3219 } 3220 3221 file->private_data = ctrl; 3222 return 0; 3223 } 3224 3225 static int nvme_dev_release(struct inode *inode, struct file *file) 3226 { 3227 struct nvme_ctrl *ctrl = 3228 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 3229 3230 module_put(ctrl->ops->module); 3231 nvme_put_ctrl(ctrl); 3232 return 0; 3233 } 3234 3235 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) 3236 { 3237 struct nvme_ns *ns; 3238 int ret; 3239 3240 down_read(&ctrl->namespaces_rwsem); 3241 if (list_empty(&ctrl->namespaces)) { 3242 ret = -ENOTTY; 3243 goto out_unlock; 3244 } 3245 3246 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); 3247 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 3248 dev_warn(ctrl->device, 3249 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 3250 ret = -EINVAL; 3251 goto out_unlock; 3252 } 3253 3254 dev_warn(ctrl->device, 3255 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 3256 kref_get(&ns->kref); 3257 up_read(&ctrl->namespaces_rwsem); 3258 
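	/*
	 * The reference taken above keeps the namespace alive while the
	 * passthrough command runs outside namespaces_rwsem; it is dropped
	 * again via nvme_put_ns() once nvme_user_cmd() returns.
	 */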
3259 ret = nvme_user_cmd(ctrl, ns, argp); 3260 nvme_put_ns(ns); 3261 return ret; 3262 3263 out_unlock: 3264 up_read(&ctrl->namespaces_rwsem); 3265 return ret; 3266 } 3267 3268 static long nvme_dev_ioctl(struct file *file, unsigned int cmd, 3269 unsigned long arg) 3270 { 3271 struct nvme_ctrl *ctrl = file->private_data; 3272 void __user *argp = (void __user *)arg; 3273 3274 switch (cmd) { 3275 case NVME_IOCTL_ADMIN_CMD: 3276 return nvme_user_cmd(ctrl, NULL, argp); 3277 case NVME_IOCTL_ADMIN64_CMD: 3278 return nvme_user_cmd64(ctrl, NULL, argp); 3279 case NVME_IOCTL_IO_CMD: 3280 return nvme_dev_user_cmd(ctrl, argp); 3281 case NVME_IOCTL_RESET: 3282 dev_warn(ctrl->device, "resetting controller\n"); 3283 return nvme_reset_ctrl_sync(ctrl); 3284 case NVME_IOCTL_SUBSYS_RESET: 3285 return nvme_reset_subsystem(ctrl); 3286 case NVME_IOCTL_RESCAN: 3287 nvme_queue_scan(ctrl); 3288 return 0; 3289 default: 3290 return -ENOTTY; 3291 } 3292 } 3293 3294 static const struct file_operations nvme_dev_fops = { 3295 .owner = THIS_MODULE, 3296 .open = nvme_dev_open, 3297 .release = nvme_dev_release, 3298 .unlocked_ioctl = nvme_dev_ioctl, 3299 .compat_ioctl = compat_ptr_ioctl, 3300 }; 3301 3302 static ssize_t nvme_sysfs_reset(struct device *dev, 3303 struct device_attribute *attr, const char *buf, 3304 size_t count) 3305 { 3306 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3307 int ret; 3308 3309 ret = nvme_reset_ctrl_sync(ctrl); 3310 if (ret < 0) 3311 return ret; 3312 return count; 3313 } 3314 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 3315 3316 static ssize_t nvme_sysfs_rescan(struct device *dev, 3317 struct device_attribute *attr, const char *buf, 3318 size_t count) 3319 { 3320 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3321 3322 nvme_queue_scan(ctrl); 3323 return count; 3324 } 3325 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); 3326 3327 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) 3328 { 3329 struct gendisk *disk = dev_to_disk(dev); 3330 3331 if (disk->fops == &nvme_bdev_ops) 3332 return nvme_get_ns_from_dev(dev)->head; 3333 else 3334 return disk->private_data; 3335 } 3336 3337 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, 3338 char *buf) 3339 { 3340 struct nvme_ns_head *head = dev_to_ns_head(dev); 3341 struct nvme_ns_ids *ids = &head->ids; 3342 struct nvme_subsystem *subsys = head->subsys; 3343 int serial_len = sizeof(subsys->serial); 3344 int model_len = sizeof(subsys->model); 3345 3346 if (!uuid_is_null(&ids->uuid)) 3347 return sprintf(buf, "uuid.%pU\n", &ids->uuid); 3348 3349 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 3350 return sprintf(buf, "eui.%16phN\n", ids->nguid); 3351 3352 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 3353 return sprintf(buf, "eui.%8phN\n", ids->eui64); 3354 3355 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || 3356 subsys->serial[serial_len - 1] == '\0')) 3357 serial_len--; 3358 while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || 3359 subsys->model[model_len - 1] == '\0')) 3360 model_len--; 3361 3362 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, 3363 serial_len, subsys->serial, model_len, subsys->model, 3364 head->ns_id); 3365 } 3366 static DEVICE_ATTR_RO(wwid); 3367 3368 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, 3369 char *buf) 3370 { 3371 return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); 3372 } 3373 static DEVICE_ATTR_RO(nguid); 3374 3375 static ssize_t 
uuid_show(struct device *dev, struct device_attribute *attr, 3376 char *buf) 3377 { 3378 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 3379 3380 /* For backward compatibility expose the NGUID to userspace if 3381 * we have no UUID set 3382 */ 3383 if (uuid_is_null(&ids->uuid)) { 3384 printk_ratelimited(KERN_WARNING 3385 "No UUID available providing old NGUID\n"); 3386 return sprintf(buf, "%pU\n", ids->nguid); 3387 } 3388 return sprintf(buf, "%pU\n", &ids->uuid); 3389 } 3390 static DEVICE_ATTR_RO(uuid); 3391 3392 static ssize_t eui_show(struct device *dev, struct device_attribute *attr, 3393 char *buf) 3394 { 3395 return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); 3396 } 3397 static DEVICE_ATTR_RO(eui); 3398 3399 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, 3400 char *buf) 3401 { 3402 return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); 3403 } 3404 static DEVICE_ATTR_RO(nsid); 3405 3406 static struct attribute *nvme_ns_id_attrs[] = { 3407 &dev_attr_wwid.attr, 3408 &dev_attr_uuid.attr, 3409 &dev_attr_nguid.attr, 3410 &dev_attr_eui.attr, 3411 &dev_attr_nsid.attr, 3412 #ifdef CONFIG_NVME_MULTIPATH 3413 &dev_attr_ana_grpid.attr, 3414 &dev_attr_ana_state.attr, 3415 #endif 3416 NULL, 3417 }; 3418 3419 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, 3420 struct attribute *a, int n) 3421 { 3422 struct device *dev = container_of(kobj, struct device, kobj); 3423 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 3424 3425 if (a == &dev_attr_uuid.attr) { 3426 if (uuid_is_null(&ids->uuid) && 3427 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 3428 return 0; 3429 } 3430 if (a == &dev_attr_nguid.attr) { 3431 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 3432 return 0; 3433 } 3434 if (a == &dev_attr_eui.attr) { 3435 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 3436 return 0; 3437 } 3438 #ifdef CONFIG_NVME_MULTIPATH 3439 if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { 3440 if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ 3441 return 0; 3442 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) 3443 return 0; 3444 } 3445 #endif 3446 return a->mode; 3447 } 3448 3449 static const struct attribute_group nvme_ns_id_attr_group = { 3450 .attrs = nvme_ns_id_attrs, 3451 .is_visible = nvme_ns_id_attrs_are_visible, 3452 }; 3453 3454 const struct attribute_group *nvme_ns_id_attr_groups[] = { 3455 &nvme_ns_id_attr_group, 3456 #ifdef CONFIG_NVM 3457 &nvme_nvm_attr_group, 3458 #endif 3459 NULL, 3460 }; 3461 3462 #define nvme_show_str_function(field) \ 3463 static ssize_t field##_show(struct device *dev, \ 3464 struct device_attribute *attr, char *buf) \ 3465 { \ 3466 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 3467 return sprintf(buf, "%.*s\n", \ 3468 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ 3469 } \ 3470 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 3471 3472 nvme_show_str_function(model); 3473 nvme_show_str_function(serial); 3474 nvme_show_str_function(firmware_rev); 3475 3476 #define nvme_show_int_function(field) \ 3477 static ssize_t field##_show(struct device *dev, \ 3478 struct device_attribute *attr, char *buf) \ 3479 { \ 3480 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 3481 return sprintf(buf, "%d\n", ctrl->field); \ 3482 } \ 3483 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 3484 3485 nvme_show_int_function(cntlid); 3486 nvme_show_int_function(numa_node); 3487 nvme_show_int_function(queue_count); 3488 nvme_show_int_function(sqsize); 3489 
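/*
 * Writing to this attribute deletes the controller.  The attribute removes
 * itself first (device_remove_file_self()) so that the synchronous deletion
 * below does not deadlock waiting for the very sysfs write that triggered it.
 */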
3490 static ssize_t nvme_sysfs_delete(struct device *dev, 3491 struct device_attribute *attr, const char *buf, 3492 size_t count) 3493 { 3494 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3495 3496 if (device_remove_file_self(dev, attr)) 3497 nvme_delete_ctrl_sync(ctrl); 3498 return count; 3499 } 3500 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); 3501 3502 static ssize_t nvme_sysfs_show_transport(struct device *dev, 3503 struct device_attribute *attr, 3504 char *buf) 3505 { 3506 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3507 3508 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); 3509 } 3510 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); 3511 3512 static ssize_t nvme_sysfs_show_state(struct device *dev, 3513 struct device_attribute *attr, 3514 char *buf) 3515 { 3516 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3517 static const char *const state_name[] = { 3518 [NVME_CTRL_NEW] = "new", 3519 [NVME_CTRL_LIVE] = "live", 3520 [NVME_CTRL_RESETTING] = "resetting", 3521 [NVME_CTRL_CONNECTING] = "connecting", 3522 [NVME_CTRL_DELETING] = "deleting", 3523 [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", 3524 [NVME_CTRL_DEAD] = "dead", 3525 }; 3526 3527 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && 3528 state_name[ctrl->state]) 3529 return sprintf(buf, "%s\n", state_name[ctrl->state]); 3530 3531 return sprintf(buf, "unknown state\n"); 3532 } 3533 3534 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); 3535 3536 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, 3537 struct device_attribute *attr, 3538 char *buf) 3539 { 3540 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3541 3542 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); 3543 } 3544 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); 3545 3546 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, 3547 struct device_attribute *attr, 3548 char *buf) 3549 { 3550 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3551 3552 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn); 3553 } 3554 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); 3555 3556 static ssize_t nvme_sysfs_show_hostid(struct device *dev, 3557 struct device_attribute *attr, 3558 char *buf) 3559 { 3560 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3561 3562 return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id); 3563 } 3564 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); 3565 3566 static ssize_t nvme_sysfs_show_address(struct device *dev, 3567 struct device_attribute *attr, 3568 char *buf) 3569 { 3570 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3571 3572 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); 3573 } 3574 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); 3575 3576 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, 3577 struct device_attribute *attr, char *buf) 3578 { 3579 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3580 struct nvmf_ctrl_options *opts = ctrl->opts; 3581 3582 if (ctrl->opts->max_reconnects == -1) 3583 return sprintf(buf, "off\n"); 3584 return sprintf(buf, "%d\n", 3585 opts->max_reconnects * opts->reconnect_delay); 3586 } 3587 3588 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, 3589 struct device_attribute *attr, const char *buf, size_t count) 3590 { 3591 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3592 struct nvmf_ctrl_options *opts = ctrl->opts; 3593 int ctrl_loss_tmo, err; 3594 3595 err = kstrtoint(buf, 10, 
&ctrl_loss_tmo); 3596 if (err) 3597 return -EINVAL; 3598 3599 else if (ctrl_loss_tmo < 0) 3600 opts->max_reconnects = -1; 3601 else 3602 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, 3603 opts->reconnect_delay); 3604 return count; 3605 } 3606 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, 3607 nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); 3608 3609 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, 3610 struct device_attribute *attr, char *buf) 3611 { 3612 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3613 3614 if (ctrl->opts->reconnect_delay == -1) 3615 return sprintf(buf, "off\n"); 3616 return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); 3617 } 3618 3619 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, 3620 struct device_attribute *attr, const char *buf, size_t count) 3621 { 3622 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3623 unsigned int v; 3624 int err; 3625 3626 err = kstrtou32(buf, 10, &v); 3627 if (err) 3628 return err; 3629 3630 ctrl->opts->reconnect_delay = v; 3631 return count; 3632 } 3633 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, 3634 nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); 3635 3636 static struct attribute *nvme_dev_attrs[] = { 3637 &dev_attr_reset_controller.attr, 3638 &dev_attr_rescan_controller.attr, 3639 &dev_attr_model.attr, 3640 &dev_attr_serial.attr, 3641 &dev_attr_firmware_rev.attr, 3642 &dev_attr_cntlid.attr, 3643 &dev_attr_delete_controller.attr, 3644 &dev_attr_transport.attr, 3645 &dev_attr_subsysnqn.attr, 3646 &dev_attr_address.attr, 3647 &dev_attr_state.attr, 3648 &dev_attr_numa_node.attr, 3649 &dev_attr_queue_count.attr, 3650 &dev_attr_sqsize.attr, 3651 &dev_attr_hostnqn.attr, 3652 &dev_attr_hostid.attr, 3653 &dev_attr_ctrl_loss_tmo.attr, 3654 &dev_attr_reconnect_delay.attr, 3655 NULL 3656 }; 3657 3658 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, 3659 struct attribute *a, int n) 3660 { 3661 struct device *dev = container_of(kobj, struct device, kobj); 3662 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3663 3664 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) 3665 return 0; 3666 if (a == &dev_attr_address.attr && !ctrl->ops->get_address) 3667 return 0; 3668 if (a == &dev_attr_hostnqn.attr && !ctrl->opts) 3669 return 0; 3670 if (a == &dev_attr_hostid.attr && !ctrl->opts) 3671 return 0; 3672 if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) 3673 return 0; 3674 if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) 3675 return 0; 3676 3677 return a->mode; 3678 } 3679 3680 static struct attribute_group nvme_dev_attrs_group = { 3681 .attrs = nvme_dev_attrs, 3682 .is_visible = nvme_dev_attrs_are_visible, 3683 }; 3684 3685 static const struct attribute_group *nvme_dev_attr_groups[] = { 3686 &nvme_dev_attrs_group, 3687 NULL, 3688 }; 3689 3690 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, 3691 unsigned nsid) 3692 { 3693 struct nvme_ns_head *h; 3694 3695 lockdep_assert_held(&subsys->lock); 3696 3697 list_for_each_entry(h, &subsys->nsheads, entry) { 3698 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) 3699 return h; 3700 } 3701 3702 return NULL; 3703 } 3704 3705 static int __nvme_check_ids(struct nvme_subsystem *subsys, 3706 struct nvme_ns_head *new) 3707 { 3708 struct nvme_ns_head *h; 3709 3710 lockdep_assert_held(&subsys->lock); 3711 3712 list_for_each_entry(h, &subsys->nsheads, entry) { 3713 if (nvme_ns_ids_valid(&new->ids) && 3714 nvme_ns_ids_equal(&new->ids, &h->ids)) 3715 return -EINVAL; 
3716 } 3717 3718 return 0; 3719 } 3720 3721 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, 3722 unsigned nsid, struct nvme_ns_ids *ids) 3723 { 3724 struct nvme_ns_head *head; 3725 size_t size = sizeof(*head); 3726 int ret = -ENOMEM; 3727 3728 #ifdef CONFIG_NVME_MULTIPATH 3729 size += num_possible_nodes() * sizeof(struct nvme_ns *); 3730 #endif 3731 3732 head = kzalloc(size, GFP_KERNEL); 3733 if (!head) 3734 goto out; 3735 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); 3736 if (ret < 0) 3737 goto out_free_head; 3738 head->instance = ret; 3739 INIT_LIST_HEAD(&head->list); 3740 ret = init_srcu_struct(&head->srcu); 3741 if (ret) 3742 goto out_ida_remove; 3743 head->subsys = ctrl->subsys; 3744 head->ns_id = nsid; 3745 head->ids = *ids; 3746 kref_init(&head->ref); 3747 3748 ret = __nvme_check_ids(ctrl->subsys, head); 3749 if (ret) { 3750 dev_err(ctrl->device, 3751 "duplicate IDs for nsid %d\n", nsid); 3752 goto out_cleanup_srcu; 3753 } 3754 3755 if (head->ids.csi) { 3756 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); 3757 if (ret) 3758 goto out_cleanup_srcu; 3759 } else 3760 head->effects = ctrl->effects; 3761 3762 ret = nvme_mpath_alloc_disk(ctrl, head); 3763 if (ret) 3764 goto out_cleanup_srcu; 3765 3766 list_add_tail(&head->entry, &ctrl->subsys->nsheads); 3767 3768 kref_get(&ctrl->subsys->ref); 3769 3770 return head; 3771 out_cleanup_srcu: 3772 cleanup_srcu_struct(&head->srcu); 3773 out_ida_remove: 3774 ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); 3775 out_free_head: 3776 kfree(head); 3777 out: 3778 if (ret > 0) 3779 ret = blk_status_to_errno(nvme_error_status(ret)); 3780 return ERR_PTR(ret); 3781 } 3782 3783 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, 3784 struct nvme_ns_ids *ids, bool is_shared) 3785 { 3786 struct nvme_ctrl *ctrl = ns->ctrl; 3787 struct nvme_ns_head *head = NULL; 3788 int ret = 0; 3789 3790 mutex_lock(&ctrl->subsys->lock); 3791 head = nvme_find_ns_head(ctrl->subsys, nsid); 3792 if (!head) { 3793 head = nvme_alloc_ns_head(ctrl, nsid, ids); 3794 if (IS_ERR(head)) { 3795 ret = PTR_ERR(head); 3796 goto out_unlock; 3797 } 3798 head->shared = is_shared; 3799 } else { 3800 ret = -EINVAL; 3801 if (!is_shared || !head->shared) { 3802 dev_err(ctrl->device, 3803 "Duplicate unshared namespace %d\n", nsid); 3804 goto out_put_ns_head; 3805 } 3806 if (!nvme_ns_ids_equal(&head->ids, ids)) { 3807 dev_err(ctrl->device, 3808 "IDs don't match for shared namespace %d\n", 3809 nsid); 3810 goto out_put_ns_head; 3811 } 3812 } 3813 3814 list_add_tail(&ns->siblings, &head->list); 3815 ns->head = head; 3816 mutex_unlock(&ctrl->subsys->lock); 3817 return 0; 3818 3819 out_put_ns_head: 3820 nvme_put_ns_head(head); 3821 out_unlock: 3822 mutex_unlock(&ctrl->subsys->lock); 3823 return ret; 3824 } 3825 3826 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 3827 { 3828 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 3829 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 3830 3831 return nsa->head->ns_id - nsb->head->ns_id; 3832 } 3833 3834 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) 3835 { 3836 struct nvme_ns *ns, *ret = NULL; 3837 3838 down_read(&ctrl->namespaces_rwsem); 3839 list_for_each_entry(ns, &ctrl->namespaces, list) { 3840 if (ns->head->ns_id == nsid) { 3841 if (!kref_get_unless_zero(&ns->kref)) 3842 continue; 3843 ret = ns; 3844 break; 3845 } 3846 if (ns->head->ns_id > nsid) 3847 break; 3848 } 3849 up_read(&ctrl->namespaces_rwsem); 3850 
return ret; 3851 } 3852 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); 3853 3854 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, 3855 struct nvme_ns_ids *ids) 3856 { 3857 struct nvme_ns *ns; 3858 struct gendisk *disk; 3859 struct nvme_id_ns *id; 3860 char disk_name[DISK_NAME_LEN]; 3861 int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; 3862 3863 if (nvme_identify_ns(ctrl, nsid, ids, &id)) 3864 return; 3865 3866 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 3867 if (!ns) 3868 goto out_free_id; 3869 3870 ns->queue = blk_mq_init_queue(ctrl->tagset); 3871 if (IS_ERR(ns->queue)) 3872 goto out_free_ns; 3873 3874 if (ctrl->opts && ctrl->opts->data_digest) 3875 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); 3876 3877 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 3878 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) 3879 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); 3880 3881 ns->queue->queuedata = ns; 3882 ns->ctrl = ctrl; 3883 kref_init(&ns->kref); 3884 3885 if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) 3886 goto out_free_queue; 3887 nvme_set_disk_name(disk_name, ns, ctrl, &flags); 3888 3889 disk = alloc_disk_node(0, node); 3890 if (!disk) 3891 goto out_unlink_ns; 3892 3893 disk->fops = &nvme_bdev_ops; 3894 disk->private_data = ns; 3895 disk->queue = ns->queue; 3896 disk->flags = flags; 3897 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); 3898 ns->disk = disk; 3899 3900 if (nvme_update_ns_info(ns, id)) 3901 goto out_put_disk; 3902 3903 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { 3904 if (nvme_nvm_register(ns, disk_name, node)) { 3905 dev_warn(ctrl->device, "LightNVM init failure\n"); 3906 goto out_put_disk; 3907 } 3908 } 3909 3910 down_write(&ctrl->namespaces_rwsem); 3911 list_add_tail(&ns->list, &ctrl->namespaces); 3912 up_write(&ctrl->namespaces_rwsem); 3913 3914 nvme_get_ctrl(ctrl); 3915 3916 device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); 3917 3918 nvme_mpath_add_disk(ns, id); 3919 nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); 3920 kfree(id); 3921 3922 return; 3923 out_put_disk: 3924 /* prevent double queue cleanup */ 3925 ns->disk->queue = NULL; 3926 put_disk(ns->disk); 3927 out_unlink_ns: 3928 mutex_lock(&ctrl->subsys->lock); 3929 list_del_rcu(&ns->siblings); 3930 if (list_empty(&ns->head->list)) 3931 list_del_init(&ns->head->entry); 3932 mutex_unlock(&ctrl->subsys->lock); 3933 nvme_put_ns_head(ns->head); 3934 out_free_queue: 3935 blk_cleanup_queue(ns->queue); 3936 out_free_ns: 3937 kfree(ns); 3938 out_free_id: 3939 kfree(id); 3940 } 3941 3942 static void nvme_ns_remove(struct nvme_ns *ns) 3943 { 3944 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 3945 return; 3946 3947 set_capacity(ns->disk, 0); 3948 nvme_fault_inject_fini(&ns->fault_inject); 3949 3950 mutex_lock(&ns->ctrl->subsys->lock); 3951 list_del_rcu(&ns->siblings); 3952 if (list_empty(&ns->head->list)) 3953 list_del_init(&ns->head->entry); 3954 mutex_unlock(&ns->ctrl->subsys->lock); 3955 3956 synchronize_rcu(); /* guarantee not available in head->list */ 3957 nvme_mpath_clear_current_path(ns); 3958 synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ 3959 3960 if (ns->disk->flags & GENHD_FL_UP) { 3961 del_gendisk(ns->disk); 3962 blk_cleanup_queue(ns->queue); 3963 if (blk_get_integrity(ns->disk)) 3964 blk_integrity_unregister(ns->disk); 3965 } 3966 3967 down_write(&ns->ctrl->namespaces_rwsem); 3968 list_del_init(&ns->list); 3969 up_write(&ns->ctrl->namespaces_rwsem); 3970 
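	/*
	 * The namespace is now unlinked from the controller and from the
	 * shared head; nvme_mpath_check_last_path() handles the case where
	 * this was the last path, and the final nvme_put_ns() drops the
	 * reference held since allocation so the namespace can be freed once
	 * any remaining users release theirs.
	 */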
static void nvme_ns_remove(struct nvme_ns *ns)
{
	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
		return;

	set_capacity(ns->disk, 0);
	nvme_fault_inject_fini(&ns->fault_inject);

	mutex_lock(&ns->ctrl->subsys->lock);
	list_del_rcu(&ns->siblings);
	if (list_empty(&ns->head->list))
		list_del_init(&ns->head->entry);
	mutex_unlock(&ns->ctrl->subsys->lock);

	synchronize_rcu(); /* guarantee not available in head->list */
	nvme_mpath_clear_current_path(ns);
	synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */

	if (ns->disk->flags & GENHD_FL_UP) {
		del_gendisk(ns->disk);
		blk_cleanup_queue(ns->queue);
		if (blk_get_integrity(ns->disk))
			blk_integrity_unregister(ns->disk);
	}

	down_write(&ns->ctrl->namespaces_rwsem);
	list_del_init(&ns->list);
	up_write(&ns->ctrl->namespaces_rwsem);

	nvme_mpath_check_last_path(ns);
	nvme_put_ns(ns);
}

static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
{
	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);

	if (ns) {
		nvme_ns_remove(ns);
		nvme_put_ns(ns);
	}
}

static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
{
	struct nvme_id_ns *id;
	int ret = -ENODEV;

	if (test_bit(NVME_NS_DEAD, &ns->flags))
		goto out;

	ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
	if (ret)
		goto out;

	ret = -ENODEV;
	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
		dev_err(ns->ctrl->device,
			"identifiers changed for nsid %d\n", ns->head->ns_id);
		goto out_free_id;
	}

	ret = nvme_update_ns_info(ns, id);

out_free_id:
	kfree(id);
out:
	/*
	 * Only remove the namespace if we got a fatal error back from the
	 * device, otherwise ignore the error and just move on.
	 *
	 * TODO: we should probably schedule a delayed retry here.
	 */
	if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR)))
		nvme_ns_remove(ns);
}

static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns_ids ids = { };
	struct nvme_ns *ns;

	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
		return;

	ns = nvme_find_get_ns(ctrl, nsid);
	if (ns) {
		nvme_validate_ns(ns, &ids);
		nvme_put_ns(ns);
		return;
	}

	switch (ids.csi) {
	case NVME_CSI_NVM:
		nvme_alloc_ns(ctrl, nsid, &ids);
		break;
	case NVME_CSI_ZNS:
		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
			dev_warn(ctrl->device,
				"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
				nsid);
			break;
		}
		nvme_alloc_ns(ctrl, nsid, &ids);
		break;
	default:
		dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
			ids.csi, nsid);
		break;
	}
}

static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
					unsigned nsid)
{
	struct nvme_ns *ns, *next;
	LIST_HEAD(rm_list);

	down_write(&ctrl->namespaces_rwsem);
	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
		if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
			list_move_tail(&ns->list, &rm_list);
	}
	up_write(&ctrl->namespaces_rwsem);

	list_for_each_entry_safe(ns, next, &rm_list, list)
		nvme_ns_remove(ns);
}

static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
{
	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
	__le32 *ns_list;
	u32 prev = 0;
	int ret = 0, i;

	if (nvme_ctrl_limited_cns(ctrl))
		return -EOPNOTSUPP;

	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
	if (!ns_list)
		return -ENOMEM;

	for (;;) {
		struct nvme_command cmd = {
			.identify.opcode	= nvme_admin_identify,
			.identify.cns		= NVME_ID_CNS_NS_ACTIVE_LIST,
			.identify.nsid		= cpu_to_le32(prev),
		};

		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
					   NVME_IDENTIFY_DATA_SIZE);
		if (ret) {
			dev_warn(ctrl->device,
				"Identify NS List failed (status=0x%x)\n", ret);
			goto free;
		}

		for (i = 0; i < nr_entries; i++) {
			u32 nsid = le32_to_cpu(ns_list[i]);

			if (!nsid)	/* end of the list? */
				goto out;
			nvme_validate_or_alloc_ns(ctrl, nsid);
			while (++prev < nsid)
				nvme_ns_remove_by_nsid(ctrl, prev);
		}
	}
out:
	nvme_remove_invalid_namespaces(ctrl, prev);
free:
	kfree(ns_list);
	return ret;
}
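/*
 * Worked example of the loop above (illustrative, not taken from the spec
 * text): if the controller reports active NSIDs {1, 2, 5}, the first
 * Identify with prev = 0 returns [1, 2, 5, 0, ...].  NSIDs 1 and 2 are
 * validated or allocated; when 5 is seen, the while loop walks prev through
 * 3 and 4 and removes any stale namespaces with those IDs; the terminating 0
 * jumps to "out", where nvme_remove_invalid_namespaces() prunes everything
 * above 5.
 *
 * The same Identify (CNS 0x02) can be issued from user space through the
 * passthrough ioctl, e.g. (hedged sketch, kept inside this comment because
 * it is user-space code; assumes /dev/nvme0 exists, sufficient privileges,
 * and a little-endian host):
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nvme_ioctl.h>
 *
 *	int main(void)
 *	{
 *		uint32_t list[1024];			// 4096-byte payload
 *		struct nvme_admin_cmd cmd = {
 *			.opcode = 0x06,			// Identify
 *			.nsid = 0,			// NSIDs greater than this
 *			.addr = (uintptr_t)list,
 *			.data_len = sizeof(list),
 *			.cdw10 = 0x02,			// active namespace ID list
 *		};
 *		int fd = open("/dev/nvme0", O_RDONLY);
 *
 *		if (fd < 0 || ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd))
 *			return 1;
 *		for (int i = 0; i < 1024 && list[i]; i++)
 *			printf("active nsid %u\n", list[i]);
 *		return 0;
 *	}
 */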
static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u32 nn, i;

	if (nvme_identify_ctrl(ctrl, &id))
		return;
	nn = le32_to_cpu(id->nn);
	kfree(id);

	for (i = 1; i <= nn; i++)
		nvme_validate_or_alloc_ns(ctrl, i);

	nvme_remove_invalid_namespaces(ctrl, nn);
}

static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
{
	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
	__le32 *log;
	int error;

	log = kzalloc(log_size, GFP_KERNEL);
	if (!log)
		return;

	/*
	 * We need to read the log to clear the AEN, but we don't want to rely
	 * on it for the changed namespace information as userspace could have
	 * raced with us in reading the log page, which could cause us to miss
	 * updates.
	 */
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
			NVME_CSI_NVM, log, log_size, 0);
	if (error)
		dev_warn(ctrl->device,
			"reading changed ns log failed: %d\n", error);

	kfree(log);
}

static void nvme_scan_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, scan_work);

	/* No tagset on a live ctrl means IO queues could not be created */
	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
		return;

	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
		dev_info(ctrl->device, "rescanning namespaces.\n");
		nvme_clear_changed_ns_log(ctrl);
	}

	mutex_lock(&ctrl->scan_lock);
	if (nvme_scan_ns_list(ctrl) != 0)
		nvme_scan_ns_sequential(ctrl);
	mutex_unlock(&ctrl->scan_lock);

	down_write(&ctrl->namespaces_rwsem);
	list_sort(NULL, &ctrl->namespaces, ns_cmp);
	up_write(&ctrl->namespaces_rwsem);
}
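/*
 * Note (hedged summary, not from a comment in the original code): scan_work
 * is kicked from several places -- controller start, the Namespace Attribute
 * Changed AEN handled further below, and a user-triggered rescan via the
 * controller's "rescan_controller" sysfs attribute.  All of them funnel
 * through nvme_queue_scan(), so the scan only runs once the controller is
 * LIVE and has an I/O tag set.
 */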
/*
 * This function iterates the namespace list unlocked to allow recovery from
 * controller failure. It is up to the caller to ensure the namespace list is
 * not modified by scan work while this function is executing.
 */
void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns, *next;
	LIST_HEAD(ns_list);

	/*
	 * make sure to requeue I/O to all namespaces as these
	 * might result from the scan itself and must complete
	 * for the scan_work to make progress
	 */
	nvme_mpath_clear_ctrl_paths(ctrl);

	/* prevent racing with ns scanning */
	flush_work(&ctrl->scan_work);

	/*
	 * The dead state indicates the controller was not gracefully
	 * disconnected. In that case, we won't be able to flush any data while
	 * removing the namespaces' disks; fail all the queues now to avoid
	 * potentially having to clean up the failed sync later.
	 */
	if (ctrl->state == NVME_CTRL_DEAD)
		nvme_kill_queues(ctrl);

	/* this is a no-op when called from the controller reset handler */
	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);

	down_write(&ctrl->namespaces_rwsem);
	list_splice_init(&ctrl->namespaces, &ns_list);
	up_write(&ctrl->namespaces_rwsem);

	list_for_each_entry_safe(ns, next, &ns_list, list)
		nvme_ns_remove(ns);
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);

static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvmf_ctrl_options *opts = ctrl->opts;
	int ret;

	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
	if (ret)
		return ret;

	if (opts) {
		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
				opts->trsvcid ?: "none");
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
				opts->host_traddr ?: "none");
	}
	return ret;
}

static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
{
	char *envp[2] = { NULL, NULL };
	u32 aen_result = ctrl->aen_result;

	ctrl->aen_result = 0;
	if (!aen_result)
		return;

	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
	if (!envp[0])
		return;
	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
	kfree(envp[0]);
}

static void nvme_async_event_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, async_event_work);

	nvme_aen_uevent(ctrl);
	ctrl->ops->submit_async_event(ctrl);
}

static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
{
	u32 csts;

	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
		return false;

	if (csts == ~0)
		return false;

	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
}

static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
{
	struct nvme_fw_slot_info_log *log;

	log = kmalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return;

	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
			log, sizeof(*log), 0))
		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
	kfree(log);
}

static void nvme_fw_act_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work,
				struct nvme_ctrl, fw_act_work);
	unsigned long fw_act_timeout;

	if (ctrl->mtfa)
		fw_act_timeout = jiffies +
				msecs_to_jiffies(ctrl->mtfa * 100);
	else
		fw_act_timeout = jiffies +
				msecs_to_jiffies(admin_timeout * 1000);

	nvme_stop_queues(ctrl);
	while (nvme_ctrl_pp_status(ctrl)) {
		if (time_after(jiffies, fw_act_timeout)) {
			dev_warn(ctrl->device,
				"Fw activation timeout, reset controller\n");
			nvme_try_sched_reset(ctrl);
			return;
		}
		msleep(100);
	}

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
		return;

	nvme_start_queues(ctrl);
	/* read FW slot information to clear the AER */
	nvme_get_fw_slot_info(ctrl);
}
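/*
 * Worked example for the timeout above (illustrative numbers): MTFA from
 * Identify Controller is in units of 100 ms, so mtfa == 20 gives the
 * firmware 2 seconds to finish activation, while mtfa == 0 falls back to the
 * module's admin_timeout (60 s by default).  Only once that budget expires
 * does the driver give up polling CSTS.PP and schedule a controller reset.
 */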
static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
	u32 aer_notice_type = (result & 0xff00) >> 8;

	trace_nvme_async_event(ctrl, aer_notice_type);

	switch (aer_notice_type) {
	case NVME_AER_NOTICE_NS_CHANGED:
		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
		nvme_queue_scan(ctrl);
		break;
	case NVME_AER_NOTICE_FW_ACT_STARTING:
		/*
		 * We are (ab)using the RESETTING state to prevent subsequent
		 * recovery actions from interfering with the controller's
		 * firmware activation.
		 */
		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
			queue_work(nvme_wq, &ctrl->fw_act_work);
		break;
#ifdef CONFIG_NVME_MULTIPATH
	case NVME_AER_NOTICE_ANA:
		if (!ctrl->ana_log_buf)
			break;
		queue_work(nvme_wq, &ctrl->ana_work);
		break;
#endif
	case NVME_AER_NOTICE_DISC_CHANGED:
		ctrl->aen_result = result;
		break;
	default:
		dev_warn(ctrl->device, "async event result %08x\n", result);
	}
}

void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
		volatile union nvme_result *res)
{
	u32 result = le32_to_cpu(res->u32);
	u32 aer_type = result & 0x07;

	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
		return;

	switch (aer_type) {
	case NVME_AER_NOTICE:
		nvme_handle_aen_notice(ctrl, result);
		break;
	case NVME_AER_ERROR:
	case NVME_AER_SMART:
	case NVME_AER_CSS:
	case NVME_AER_VS:
		trace_nvme_async_event(ctrl, aer_type);
		ctrl->aen_result = result;
		break;
	default:
		break;
	}
	queue_work(nvme_wq, &ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
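/*
 * Illustrative sketch, not used by the driver: how the AEN completion dword
 * decoded above is laid out.  Bits 2:0 carry the event type, bits 15:08 the
 * event information, and bits 23:16 the associated log page identifier.  For
 * example, result 0x00040002 is a Notice (2) for "namespace attribute
 * changed" (0) pointing at the Changed Namespace List log (0x04).  The
 * helper name is made up for the example.
 */
static inline void example_decode_aen_result(u32 result)
{
	u32 aer_type = result & 0x07;		/* 2 == NVME_AER_NOTICE        */
	u32 aer_info = (result & 0xff00) >> 8;	/* 0 == namespace changed      */
	u32 log_page = (result >> 16) & 0xff;	/* 0x04 == changed NS list log */

	pr_debug("AEN type %u, info %u, log page %#x\n",
		 aer_type, aer_info, log_page);
}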
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_mpath_stop(ctrl);
	nvme_stop_keep_alive(ctrl);
	nvme_stop_failfast_work(ctrl);
	flush_work(&ctrl->async_event_work);
	cancel_work_sync(&ctrl->fw_act_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);

void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_start_keep_alive(ctrl);

	nvme_enable_aen(ctrl);

	if (ctrl->queue_count > 1) {
		nvme_queue_scan(ctrl);
		nvme_start_queues(ctrl);
	}
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);

void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_fault_inject_fini(&ctrl->fault_inject);
	dev_pm_qos_hide_latency_tolerance(ctrl->device);
	cdev_device_del(&ctrl->cdev, ctrl->device);
	nvme_put_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);

static void nvme_free_cels(struct nvme_ctrl *ctrl)
{
	struct nvme_effects_log *cel;
	unsigned long i;

	xa_for_each(&ctrl->cels, i, cel) {
		xa_erase(&ctrl->cels, i);
		kfree(cel);
	}

	xa_destroy(&ctrl->cels);
}

static void nvme_free_ctrl(struct device *dev)
{
	struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvme_subsystem *subsys = ctrl->subsys;

	if (!subsys || ctrl->instance != subsys->instance)
		ida_simple_remove(&nvme_instance_ida, ctrl->instance);

	nvme_free_cels(ctrl);
	nvme_mpath_uninit(ctrl);
	__free_page(ctrl->discard_page);

	if (subsys) {
		mutex_lock(&nvme_subsystems_lock);
		list_del(&ctrl->subsys_entry);
		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
		mutex_unlock(&nvme_subsystems_lock);
	}

	ctrl->ops->free_ctrl(ctrl);

	if (subsys)
		nvme_put_subsystem(subsys);
}

/*
 * Initialize an NVMe controller structure.  This needs to be called during
 * the earliest initialization so that we have the initialized structure
 * around during probing.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
		const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
	int ret;

	ctrl->state = NVME_CTRL_NEW;
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	spin_lock_init(&ctrl->lock);
	mutex_init(&ctrl->scan_lock);
	INIT_LIST_HEAD(&ctrl->namespaces);
	xa_init(&ctrl->cels);
	init_rwsem(&ctrl->namespaces_rwsem);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;
	ctrl->numa_node = NUMA_NO_NODE;
	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
	init_waitqueue_head(&ctrl->state_wq);

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;

	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
			PAGE_SIZE);
	ctrl->discard_page = alloc_page(GFP_KERNEL);
	if (!ctrl->discard_page) {
		ret = -ENOMEM;
		goto out;
	}

	ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
	if (ret < 0)
		goto out;
	ctrl->instance = ret;

	device_initialize(&ctrl->ctrl_device);
	ctrl->device = &ctrl->ctrl_device;
	ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
			ctrl->instance);
	ctrl->device->class = nvme_class;
	ctrl->device->parent = ctrl->dev;
	ctrl->device->groups = nvme_dev_attr_groups;
	ctrl->device->release = nvme_free_ctrl;
	dev_set_drvdata(ctrl->device, ctrl);
	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
	if (ret)
		goto out_release_instance;

	nvme_get_ctrl(ctrl);
	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ctrl->cdev.owner = ops->module;
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
	if (ret)
		goto out_free_name;

	/*
	 * Initialize latency tolerance controls.  The sysfs files won't
	 * be visible to userspace unless the device actually supports APST.
	 */
	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
		min(default_ps_max_latency_us, (unsigned long)S32_MAX));

	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));

	return 0;
out_free_name:
	nvme_put_ctrl(ctrl);
	kfree_const(ctrl->device->kobj.name);
out_release_instance:
	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
out:
	if (ctrl->discard_page)
		__free_page(ctrl->discard_page);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
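/*
 * Illustrative sketch of the expected life cycle from a transport driver's
 * point of view (hypothetical names; the "example_*" types and functions are
 * assumptions for illustration only, not part of this file).  A real
 * transport would set up its admin and I/O queues between init and start,
 * and its nvme_ctrl_ops must provide reg_read32/reg_write32, free_ctrl,
 * submit_async_event, and so on.  The teardown half mirrors what the core's
 * own delete path does.
 */
struct example_transport_ctrl {
	struct nvme_ctrl	ctrl;
	/* transport-specific state would follow */
};

static int example_transport_probe(struct example_transport_ctrl *tctrl,
		struct device *dev, const struct nvme_ctrl_ops *ops)
{
	int ret;

	ret = nvme_init_ctrl(&tctrl->ctrl, dev, ops, 0 /* no quirks */);
	if (ret)
		return ret;

	/* transport-specific admin/IO queue setup would go here */

	nvme_start_ctrl(&tctrl->ctrl);	/* keep-alive, AEN, namespace scan */
	return 0;
}

static void example_transport_teardown(struct example_transport_ctrl *tctrl)
{
	nvme_stop_ctrl(&tctrl->ctrl);
	nvme_remove_namespaces(&tctrl->ctrl);
	nvme_uninit_ctrl(&tctrl->ctrl);
	nvme_put_ctrl(&tctrl->ctrl);	/* drop this caller's reference */
}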
 */
void nvme_kill_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);

	/* Forcibly unquiesce queues to avoid blocking dispatch */
	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
		blk_mq_unquiesce_queue(ctrl->admin_q);

	list_for_each_entry(ns, &ctrl->namespaces, list)
		nvme_set_queue_dying(ns);

	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_kill_queues);

void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_unfreeze_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);

int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
		if (timeout <= 0)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);

void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_freeze_queue_wait(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);

void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_freeze_queue_start(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);

void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_quiesce_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);

void nvme_start_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_unquiesce_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_start_queues);

void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_sync_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);

void nvme_sync_queues(struct nvme_ctrl *ctrl)
{
	nvme_sync_io_queues(ctrl);
	if (ctrl->admin_q)
		blk_sync_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
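/*
 * Illustrative sketch, not part of the driver: how a reset path typically
 * pairs the freeze/quiesce helpers above (the PCIe driver follows a similar
 * pattern; the "example_*" function names are made up).  Freezing blocks new
 * requests from entering the queues, while quiescing stops dispatch of
 * requests that are already queued.
 */
static void example_reset_teardown(struct nvme_ctrl *ctrl)
{
	nvme_start_freeze(ctrl);	/* block new submissions */
	nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT); /* bounded wait for inflight IO */
	nvme_stop_queues(ctrl);		/* quiesce dispatch */
	nvme_sync_queues(ctrl);		/* flush pending timeout work */
	/* tear down transport queues here */
}

static void example_reset_recover(struct nvme_ctrl *ctrl)
{
	/* re-create transport queues here */
	nvme_start_queues(ctrl);	/* unquiesce dispatch */
	nvme_wait_freeze(ctrl);		/* wait until all queues are frozen */
	nvme_unfreeze(ctrl);		/* lift the freeze started at teardown */
}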
struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
{
	if (file->f_op != &nvme_dev_fops)
		return NULL;
	return file->private_data;
}
EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);

/*
 * Check we didn't inadvertently grow the command structure sizes:
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
}

static int __init nvme_core_init(void)
{
	int result = -ENOMEM;

	_nvme_check_size();

	nvme_wq = alloc_workqueue("nvme-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_wq)
		goto out;

	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_reset_wq)
		goto destroy_wq;

	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_delete_wq)
		goto destroy_reset_wq;

	result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
			NVME_MINORS, "nvme");
	if (result < 0)
		goto destroy_delete_wq;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}
	nvme_class->dev_uevent = nvme_class_uevent;

	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
	if (IS_ERR(nvme_subsys_class)) {
		result = PTR_ERR(nvme_subsys_class);
		goto destroy_class;
	}
	return 0;

destroy_class:
	class_destroy(nvme_class);
unregister_chrdev:
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
	destroy_workqueue(nvme_delete_wq);
destroy_reset_wq:
	destroy_workqueue(nvme_reset_wq);
destroy_wq:
	destroy_workqueue(nvme_wq);
out:
	return result;
}

static void __exit nvme_core_exit(void)
{
	class_destroy(nvme_subsys_class);
	class_destroy(nvme_class);
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
	destroy_workqueue(nvme_delete_wq);
	destroy_workqueue(nvme_reset_wq);
	destroy_workqueue(nvme_wq);
	ida_destroy(&nvme_instance_ida);
}

MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);