1 /* 2 * NVM Express device driver 3 * Copyright (c) 2011-2014, Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 */ 14 15 #include <linux/blkdev.h> 16 #include <linux/blk-mq.h> 17 #include <linux/delay.h> 18 #include <linux/errno.h> 19 #include <linux/hdreg.h> 20 #include <linux/kernel.h> 21 #include <linux/module.h> 22 #include <linux/list_sort.h> 23 #include <linux/slab.h> 24 #include <linux/types.h> 25 #include <linux/pr.h> 26 #include <linux/ptrace.h> 27 #include <linux/nvme_ioctl.h> 28 #include <linux/t10-pi.h> 29 #include <linux/pm_qos.h> 30 #include <asm/unaligned.h> 31 32 #define CREATE_TRACE_POINTS 33 #include "trace.h" 34 35 #include "nvme.h" 36 #include "fabrics.h" 37 38 #define NVME_MINORS (1U << MINORBITS) 39 40 unsigned int admin_timeout = 60; 41 module_param(admin_timeout, uint, 0644); 42 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); 43 EXPORT_SYMBOL_GPL(admin_timeout); 44 45 unsigned int nvme_io_timeout = 30; 46 module_param_named(io_timeout, nvme_io_timeout, uint, 0644); 47 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); 48 EXPORT_SYMBOL_GPL(nvme_io_timeout); 49 50 static unsigned char shutdown_timeout = 5; 51 module_param(shutdown_timeout, byte, 0644); 52 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); 53 54 static u8 nvme_max_retries = 5; 55 module_param_named(max_retries, nvme_max_retries, byte, 0644); 56 MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); 57 58 static unsigned long default_ps_max_latency_us = 100000; 59 module_param(default_ps_max_latency_us, ulong, 0644); 60 MODULE_PARM_DESC(default_ps_max_latency_us, 61 "max power saving latency for new devices; use PM QOS to change per device"); 62 63 static bool force_apst; 64 module_param(force_apst, bool, 0644); 65 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); 66 67 static bool streams; 68 module_param(streams, bool, 0644); 69 MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); 70 71 /* 72 * nvme_wq - hosts nvme related works that are not reset or delete 73 * nvme_reset_wq - hosts nvme reset works 74 * nvme_delete_wq - hosts nvme delete works 75 * 76 * nvme_wq will host works such are scan, aen handling, fw activation, 77 * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq 78 * runs reset works which also flush works hosted on nvme_wq for 79 * serialization purposes. nvme_delete_wq host controller deletion 80 * works which flush reset works for serialization. 81 */ 82 struct workqueue_struct *nvme_wq; 83 EXPORT_SYMBOL_GPL(nvme_wq); 84 85 struct workqueue_struct *nvme_reset_wq; 86 EXPORT_SYMBOL_GPL(nvme_reset_wq); 87 88 struct workqueue_struct *nvme_delete_wq; 89 EXPORT_SYMBOL_GPL(nvme_delete_wq); 90 91 static DEFINE_IDA(nvme_subsystems_ida); 92 static LIST_HEAD(nvme_subsystems); 93 static DEFINE_MUTEX(nvme_subsystems_lock); 94 95 static DEFINE_IDA(nvme_instance_ida); 96 static dev_t nvme_chr_devt; 97 static struct class *nvme_class; 98 static struct class *nvme_subsys_class; 99 100 static void nvme_ns_remove(struct nvme_ns *ns); 101 static int nvme_revalidate_disk(struct gendisk *disk); 102 static void nvme_put_subsystem(struct nvme_subsystem *subsys); 103 104 int nvme_reset_ctrl(struct nvme_ctrl *ctrl) 105 { 106 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 107 return -EBUSY; 108 if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) 109 return -EBUSY; 110 return 0; 111 } 112 EXPORT_SYMBOL_GPL(nvme_reset_ctrl); 113 114 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) 115 { 116 int ret; 117 118 ret = nvme_reset_ctrl(ctrl); 119 if (!ret) { 120 flush_work(&ctrl->reset_work); 121 if (ctrl->state != NVME_CTRL_LIVE && 122 ctrl->state != NVME_CTRL_ADMIN_ONLY) 123 ret = -ENETRESET; 124 } 125 126 return ret; 127 } 128 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); 129 130 static void nvme_delete_ctrl_work(struct work_struct *work) 131 { 132 struct nvme_ctrl *ctrl = 133 container_of(work, struct nvme_ctrl, delete_work); 134 135 dev_info(ctrl->device, 136 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); 137 138 flush_work(&ctrl->reset_work); 139 nvme_stop_ctrl(ctrl); 140 nvme_remove_namespaces(ctrl); 141 ctrl->ops->delete_ctrl(ctrl); 142 nvme_uninit_ctrl(ctrl); 143 nvme_put_ctrl(ctrl); 144 } 145 146 int nvme_delete_ctrl(struct nvme_ctrl *ctrl) 147 { 148 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 149 return -EBUSY; 150 if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) 151 return -EBUSY; 152 return 0; 153 } 154 EXPORT_SYMBOL_GPL(nvme_delete_ctrl); 155 156 int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) 157 { 158 int ret = 0; 159 160 /* 161 * Keep a reference until the work is flushed since ->delete_ctrl 162 * can free the controller. 163 */ 164 nvme_get_ctrl(ctrl); 165 ret = nvme_delete_ctrl(ctrl); 166 if (!ret) 167 flush_work(&ctrl->delete_work); 168 nvme_put_ctrl(ctrl); 169 return ret; 170 } 171 EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); 172 173 static inline bool nvme_ns_has_pi(struct nvme_ns *ns) 174 { 175 return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); 176 } 177 178 static blk_status_t nvme_error_status(struct request *req) 179 { 180 switch (nvme_req(req)->status & 0x7ff) { 181 case NVME_SC_SUCCESS: 182 return BLK_STS_OK; 183 case NVME_SC_CAP_EXCEEDED: 184 return BLK_STS_NOSPC; 185 case NVME_SC_LBA_RANGE: 186 return BLK_STS_TARGET; 187 case NVME_SC_BAD_ATTRIBUTES: 188 case NVME_SC_ONCS_NOT_SUPPORTED: 189 case NVME_SC_INVALID_OPCODE: 190 case NVME_SC_INVALID_FIELD: 191 case NVME_SC_INVALID_NS: 192 return BLK_STS_NOTSUPP; 193 case NVME_SC_WRITE_FAULT: 194 case NVME_SC_READ_ERROR: 195 case NVME_SC_UNWRITTEN_BLOCK: 196 case NVME_SC_ACCESS_DENIED: 197 case NVME_SC_READ_ONLY: 198 case NVME_SC_COMPARE_FAILED: 199 return BLK_STS_MEDIUM; 200 case NVME_SC_GUARD_CHECK: 201 case NVME_SC_APPTAG_CHECK: 202 case NVME_SC_REFTAG_CHECK: 203 case NVME_SC_INVALID_PI: 204 return BLK_STS_PROTECTION; 205 case NVME_SC_RESERVATION_CONFLICT: 206 return BLK_STS_NEXUS; 207 default: 208 return BLK_STS_IOERR; 209 } 210 } 211 212 static inline bool nvme_req_needs_retry(struct request *req) 213 { 214 if (blk_noretry_request(req)) 215 return false; 216 if (nvme_req(req)->status & NVME_SC_DNR) 217 return false; 218 if (nvme_req(req)->retries >= nvme_max_retries) 219 return false; 220 return true; 221 } 222 223 void nvme_complete_rq(struct request *req) 224 { 225 blk_status_t status = nvme_error_status(req); 226 227 trace_nvme_complete_rq(req); 228 229 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { 230 if (nvme_req_needs_failover(req, status)) { 231 nvme_failover_req(req); 232 return; 233 } 234 235 if (!blk_queue_dying(req->q)) { 236 nvme_req(req)->retries++; 237 blk_mq_requeue_request(req, true); 238 return; 239 } 240 } 241 blk_mq_end_request(req, status); 242 } 243 EXPORT_SYMBOL_GPL(nvme_complete_rq); 244 245 void nvme_cancel_request(struct request *req, void *data, bool reserved) 246 { 247 if (!blk_mq_request_started(req)) 248 return; 249 250 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, 251 "Cancelling I/O %d", req->tag); 252 253 nvme_req(req)->status = NVME_SC_ABORT_REQ; 254 blk_mq_complete_request(req); 255 256 } 257 EXPORT_SYMBOL_GPL(nvme_cancel_request); 258 259 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 260 enum nvme_ctrl_state new_state) 261 { 262 enum nvme_ctrl_state old_state; 263 unsigned long flags; 264 bool changed = false; 265 266 spin_lock_irqsave(&ctrl->lock, flags); 267 268 old_state = ctrl->state; 269 switch (new_state) { 270 case NVME_CTRL_ADMIN_ONLY: 271 switch (old_state) { 272 case NVME_CTRL_CONNECTING: 273 changed = true; 274 /* FALLTHRU */ 275 default: 276 break; 277 } 278 break; 279 case NVME_CTRL_LIVE: 280 switch (old_state) { 281 case NVME_CTRL_NEW: 282 case NVME_CTRL_RESETTING: 283 case NVME_CTRL_CONNECTING: 284 changed = true; 285 /* FALLTHRU */ 286 default: 287 break; 288 } 289 break; 290 case NVME_CTRL_RESETTING: 291 switch (old_state) { 292 case NVME_CTRL_NEW: 293 case NVME_CTRL_LIVE: 294 case NVME_CTRL_ADMIN_ONLY: 295 changed = true; 296 /* FALLTHRU */ 297 default: 298 break; 299 } 300 break; 301 case NVME_CTRL_CONNECTING: 302 switch (old_state) { 303 case NVME_CTRL_NEW: 304 case NVME_CTRL_RESETTING: 305 changed = true; 306 /* FALLTHRU */ 307 default: 308 break; 309 } 310 break; 311 case NVME_CTRL_DELETING: 312 switch (old_state) { 313 case NVME_CTRL_LIVE: 314 case NVME_CTRL_ADMIN_ONLY: 315 case NVME_CTRL_RESETTING: 316 case NVME_CTRL_CONNECTING: 317 changed = true; 318 /* FALLTHRU */ 319 default: 320 break; 321 } 322 break; 323 case NVME_CTRL_DEAD: 324 switch (old_state) { 325 case NVME_CTRL_DELETING: 326 changed = true; 327 /* FALLTHRU */ 328 default: 329 break; 330 } 331 break; 332 default: 333 break; 334 } 335 336 if (changed) 337 ctrl->state = new_state; 338 339 spin_unlock_irqrestore(&ctrl->lock, flags); 340 if (changed && ctrl->state == NVME_CTRL_LIVE) 341 nvme_kick_requeue_lists(ctrl); 342 return changed; 343 } 344 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); 345 346 static void nvme_free_ns_head(struct kref *ref) 347 { 348 struct nvme_ns_head *head = 349 container_of(ref, struct nvme_ns_head, ref); 350 351 nvme_mpath_remove_disk(head); 352 ida_simple_remove(&head->subsys->ns_ida, head->instance); 353 list_del_init(&head->entry); 354 cleanup_srcu_struct(&head->srcu); 355 nvme_put_subsystem(head->subsys); 356 kfree(head); 357 } 358 359 static void nvme_put_ns_head(struct nvme_ns_head *head) 360 { 361 kref_put(&head->ref, nvme_free_ns_head); 362 } 363 364 static void nvme_free_ns(struct kref *kref) 365 { 366 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); 367 368 if (ns->ndev) 369 nvme_nvm_unregister(ns); 370 371 put_disk(ns->disk); 372 nvme_put_ns_head(ns->head); 373 nvme_put_ctrl(ns->ctrl); 374 kfree(ns); 375 } 376 377 static void nvme_put_ns(struct nvme_ns *ns) 378 { 379 kref_put(&ns->kref, nvme_free_ns); 380 } 381 382 static inline void nvme_clear_nvme_request(struct request *req) 383 { 384 if (!(req->rq_flags & RQF_DONTPREP)) { 385 nvme_req(req)->retries = 0; 386 nvme_req(req)->flags = 0; 387 req->rq_flags |= RQF_DONTPREP; 388 } 389 } 390 391 struct request *nvme_alloc_request(struct request_queue *q, 392 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) 393 { 394 unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; 395 struct request *req; 396 397 if (qid == NVME_QID_ANY) { 398 req = blk_mq_alloc_request(q, op, flags); 399 } else { 400 req = blk_mq_alloc_request_hctx(q, op, flags, 401 qid ? qid - 1 : 0); 402 } 403 if (IS_ERR(req)) 404 return req; 405 406 req->cmd_flags |= REQ_FAILFAST_DRIVER; 407 nvme_clear_nvme_request(req); 408 nvme_req(req)->cmd = cmd; 409 410 return req; 411 } 412 EXPORT_SYMBOL_GPL(nvme_alloc_request); 413 414 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) 415 { 416 struct nvme_command c; 417 418 memset(&c, 0, sizeof(c)); 419 420 c.directive.opcode = nvme_admin_directive_send; 421 c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); 422 c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; 423 c.directive.dtype = NVME_DIR_IDENTIFY; 424 c.directive.tdtype = NVME_DIR_STREAMS; 425 c.directive.endir = enable ? NVME_DIR_ENDIR : 0; 426 427 return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); 428 } 429 430 static int nvme_disable_streams(struct nvme_ctrl *ctrl) 431 { 432 return nvme_toggle_streams(ctrl, false); 433 } 434 435 static int nvme_enable_streams(struct nvme_ctrl *ctrl) 436 { 437 return nvme_toggle_streams(ctrl, true); 438 } 439 440 static int nvme_get_stream_params(struct nvme_ctrl *ctrl, 441 struct streams_directive_params *s, u32 nsid) 442 { 443 struct nvme_command c; 444 445 memset(&c, 0, sizeof(c)); 446 memset(s, 0, sizeof(*s)); 447 448 c.directive.opcode = nvme_admin_directive_recv; 449 c.directive.nsid = cpu_to_le32(nsid); 450 c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1); 451 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; 452 c.directive.dtype = NVME_DIR_STREAMS; 453 454 return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); 455 } 456 457 static int nvme_configure_directives(struct nvme_ctrl *ctrl) 458 { 459 struct streams_directive_params s; 460 int ret; 461 462 if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) 463 return 0; 464 if (!streams) 465 return 0; 466 467 ret = nvme_enable_streams(ctrl); 468 if (ret) 469 return ret; 470 471 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); 472 if (ret) 473 return ret; 474 475 ctrl->nssa = le16_to_cpu(s.nssa); 476 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { 477 dev_info(ctrl->device, "too few streams (%u) available\n", 478 ctrl->nssa); 479 nvme_disable_streams(ctrl); 480 return 0; 481 } 482 483 ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); 484 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); 485 return 0; 486 } 487 488 /* 489 * Check if 'req' has a write hint associated with it. If it does, assign 490 * a valid namespace stream to the write. 491 */ 492 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, 493 struct request *req, u16 *control, 494 u32 *dsmgmt) 495 { 496 enum rw_hint streamid = req->write_hint; 497 498 if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) 499 streamid = 0; 500 else { 501 streamid--; 502 if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) 503 return; 504 505 *control |= NVME_RW_DTYPE_STREAMS; 506 *dsmgmt |= streamid << 16; 507 } 508 509 if (streamid < ARRAY_SIZE(req->q->write_hints)) 510 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; 511 } 512 513 static inline void nvme_setup_flush(struct nvme_ns *ns, 514 struct nvme_command *cmnd) 515 { 516 memset(cmnd, 0, sizeof(*cmnd)); 517 cmnd->common.opcode = nvme_cmd_flush; 518 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); 519 } 520 521 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, 522 struct nvme_command *cmnd) 523 { 524 unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; 525 struct nvme_dsm_range *range; 526 struct bio *bio; 527 528 range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); 529 if (!range) 530 return BLK_STS_RESOURCE; 531 532 __rq_for_each_bio(bio, req) { 533 u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); 534 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; 535 536 if (n < segments) { 537 range[n].cattr = cpu_to_le32(0); 538 range[n].nlb = cpu_to_le32(nlb); 539 range[n].slba = cpu_to_le64(slba); 540 } 541 n++; 542 } 543 544 if (WARN_ON_ONCE(n != segments)) { 545 kfree(range); 546 return BLK_STS_IOERR; 547 } 548 549 memset(cmnd, 0, sizeof(*cmnd)); 550 cmnd->dsm.opcode = nvme_cmd_dsm; 551 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); 552 cmnd->dsm.nr = cpu_to_le32(segments - 1); 553 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 554 555 req->special_vec.bv_page = virt_to_page(range); 556 req->special_vec.bv_offset = offset_in_page(range); 557 req->special_vec.bv_len = sizeof(*range) * segments; 558 req->rq_flags |= RQF_SPECIAL_PAYLOAD; 559 560 return BLK_STS_OK; 561 } 562 563 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, 564 struct request *req, struct nvme_command *cmnd) 565 { 566 struct nvme_ctrl *ctrl = ns->ctrl; 567 u16 control = 0; 568 u32 dsmgmt = 0; 569 570 if (req->cmd_flags & REQ_FUA) 571 control |= NVME_RW_FUA; 572 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 573 control |= NVME_RW_LR; 574 575 if (req->cmd_flags & REQ_RAHEAD) 576 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 577 578 memset(cmnd, 0, sizeof(*cmnd)); 579 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); 580 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); 581 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 582 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 583 584 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) 585 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); 586 587 if (ns->ms) { 588 /* 589 * If formated with metadata, the block layer always provides a 590 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else 591 * we enable the PRACT bit for protection information or set the 592 * namespace capacity to zero to prevent any I/O. 593 */ 594 if (!blk_integrity_rq(req)) { 595 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) 596 return BLK_STS_NOTSUPP; 597 control |= NVME_RW_PRINFO_PRACT; 598 } 599 600 switch (ns->pi_type) { 601 case NVME_NS_DPS_PI_TYPE3: 602 control |= NVME_RW_PRINFO_PRCHK_GUARD; 603 break; 604 case NVME_NS_DPS_PI_TYPE1: 605 case NVME_NS_DPS_PI_TYPE2: 606 control |= NVME_RW_PRINFO_PRCHK_GUARD | 607 NVME_RW_PRINFO_PRCHK_REF; 608 cmnd->rw.reftag = cpu_to_le32( 609 nvme_block_nr(ns, blk_rq_pos(req))); 610 break; 611 } 612 } 613 614 cmnd->rw.control = cpu_to_le16(control); 615 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 616 return 0; 617 } 618 619 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 620 struct nvme_command *cmd) 621 { 622 blk_status_t ret = BLK_STS_OK; 623 624 nvme_clear_nvme_request(req); 625 626 switch (req_op(req)) { 627 case REQ_OP_DRV_IN: 628 case REQ_OP_DRV_OUT: 629 memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); 630 break; 631 case REQ_OP_FLUSH: 632 nvme_setup_flush(ns, cmd); 633 break; 634 case REQ_OP_WRITE_ZEROES: 635 /* currently only aliased to deallocate for a few ctrls: */ 636 case REQ_OP_DISCARD: 637 ret = nvme_setup_discard(ns, req, cmd); 638 break; 639 case REQ_OP_READ: 640 case REQ_OP_WRITE: 641 ret = nvme_setup_rw(ns, req, cmd); 642 break; 643 default: 644 WARN_ON_ONCE(1); 645 return BLK_STS_IOERR; 646 } 647 648 cmd->common.command_id = req->tag; 649 if (ns) 650 trace_nvme_setup_nvm_cmd(req->q->id, cmd); 651 else 652 trace_nvme_setup_admin_cmd(cmd); 653 return ret; 654 } 655 EXPORT_SYMBOL_GPL(nvme_setup_cmd); 656 657 /* 658 * Returns 0 on success. If the result is negative, it's a Linux error code; 659 * if the result is positive, it's an NVM Express status code 660 */ 661 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 662 union nvme_result *result, void *buffer, unsigned bufflen, 663 unsigned timeout, int qid, int at_head, 664 blk_mq_req_flags_t flags) 665 { 666 struct request *req; 667 int ret; 668 669 req = nvme_alloc_request(q, cmd, flags, qid); 670 if (IS_ERR(req)) 671 return PTR_ERR(req); 672 673 req->timeout = timeout ? timeout : ADMIN_TIMEOUT; 674 675 if (buffer && bufflen) { 676 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); 677 if (ret) 678 goto out; 679 } 680 681 blk_execute_rq(req->q, NULL, req, at_head); 682 if (result) 683 *result = nvme_req(req)->result; 684 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 685 ret = -EINTR; 686 else 687 ret = nvme_req(req)->status; 688 out: 689 blk_mq_free_request(req); 690 return ret; 691 } 692 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); 693 694 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 695 void *buffer, unsigned bufflen) 696 { 697 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, 698 NVME_QID_ANY, 0, 0); 699 } 700 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); 701 702 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, 703 unsigned len, u32 seed, bool write) 704 { 705 struct bio_integrity_payload *bip; 706 int ret = -ENOMEM; 707 void *buf; 708 709 buf = kmalloc(len, GFP_KERNEL); 710 if (!buf) 711 goto out; 712 713 ret = -EFAULT; 714 if (write && copy_from_user(buf, ubuf, len)) 715 goto out_free_meta; 716 717 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 718 if (IS_ERR(bip)) { 719 ret = PTR_ERR(bip); 720 goto out_free_meta; 721 } 722 723 bip->bip_iter.bi_size = len; 724 bip->bip_iter.bi_sector = seed; 725 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 726 offset_in_page(buf)); 727 if (ret == len) 728 return buf; 729 ret = -ENOMEM; 730 out_free_meta: 731 kfree(buf); 732 out: 733 return ERR_PTR(ret); 734 } 735 736 static int nvme_submit_user_cmd(struct request_queue *q, 737 struct nvme_command *cmd, void __user *ubuffer, 738 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 739 u32 meta_seed, u32 *result, unsigned timeout) 740 { 741 bool write = nvme_is_write(cmd); 742 struct nvme_ns *ns = q->queuedata; 743 struct gendisk *disk = ns ? ns->disk : NULL; 744 struct request *req; 745 struct bio *bio = NULL; 746 void *meta = NULL; 747 int ret; 748 749 req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); 750 if (IS_ERR(req)) 751 return PTR_ERR(req); 752 753 req->timeout = timeout ? timeout : ADMIN_TIMEOUT; 754 nvme_req(req)->flags |= NVME_REQ_USERCMD; 755 756 if (ubuffer && bufflen) { 757 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, 758 GFP_KERNEL); 759 if (ret) 760 goto out; 761 bio = req->bio; 762 bio->bi_disk = disk; 763 if (disk && meta_buffer && meta_len) { 764 meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, 765 meta_seed, write); 766 if (IS_ERR(meta)) { 767 ret = PTR_ERR(meta); 768 goto out_unmap; 769 } 770 req->cmd_flags |= REQ_INTEGRITY; 771 } 772 } 773 774 blk_execute_rq(req->q, disk, req, 0); 775 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 776 ret = -EINTR; 777 else 778 ret = nvme_req(req)->status; 779 if (result) 780 *result = le32_to_cpu(nvme_req(req)->result.u32); 781 if (meta && !ret && !write) { 782 if (copy_to_user(meta_buffer, meta, meta_len)) 783 ret = -EFAULT; 784 } 785 kfree(meta); 786 out_unmap: 787 if (bio) 788 blk_rq_unmap_user(bio); 789 out: 790 blk_mq_free_request(req); 791 return ret; 792 } 793 794 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) 795 { 796 struct nvme_ctrl *ctrl = rq->end_io_data; 797 798 blk_mq_free_request(rq); 799 800 if (status) { 801 dev_err(ctrl->device, 802 "failed nvme_keep_alive_end_io error=%d\n", 803 status); 804 return; 805 } 806 807 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); 808 } 809 810 static int nvme_keep_alive(struct nvme_ctrl *ctrl) 811 { 812 struct request *rq; 813 814 rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, 815 NVME_QID_ANY); 816 if (IS_ERR(rq)) 817 return PTR_ERR(rq); 818 819 rq->timeout = ctrl->kato * HZ; 820 rq->end_io_data = ctrl; 821 822 blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); 823 824 return 0; 825 } 826 827 static void nvme_keep_alive_work(struct work_struct *work) 828 { 829 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 830 struct nvme_ctrl, ka_work); 831 832 if (nvme_keep_alive(ctrl)) { 833 /* allocation failure, reset the controller */ 834 dev_err(ctrl->device, "keep-alive failed\n"); 835 nvme_reset_ctrl(ctrl); 836 return; 837 } 838 } 839 840 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) 841 { 842 if (unlikely(ctrl->kato == 0)) 843 return; 844 845 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); 846 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); 847 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; 848 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); 849 } 850 851 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) 852 { 853 if (unlikely(ctrl->kato == 0)) 854 return; 855 856 cancel_delayed_work_sync(&ctrl->ka_work); 857 } 858 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); 859 860 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) 861 { 862 struct nvme_command c = { }; 863 int error; 864 865 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 866 c.identify.opcode = nvme_admin_identify; 867 c.identify.cns = NVME_ID_CNS_CTRL; 868 869 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); 870 if (!*id) 871 return -ENOMEM; 872 873 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 874 sizeof(struct nvme_id_ctrl)); 875 if (error) 876 kfree(*id); 877 return error; 878 } 879 880 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, 881 struct nvme_ns_ids *ids) 882 { 883 struct nvme_command c = { }; 884 int status; 885 void *data; 886 int pos; 887 int len; 888 889 c.identify.opcode = nvme_admin_identify; 890 c.identify.nsid = cpu_to_le32(nsid); 891 c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; 892 893 data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 894 if (!data) 895 return -ENOMEM; 896 897 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, 898 NVME_IDENTIFY_DATA_SIZE); 899 if (status) 900 goto free_data; 901 902 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { 903 struct nvme_ns_id_desc *cur = data + pos; 904 905 if (cur->nidl == 0) 906 break; 907 908 switch (cur->nidt) { 909 case NVME_NIDT_EUI64: 910 if (cur->nidl != NVME_NIDT_EUI64_LEN) { 911 dev_warn(ctrl->device, 912 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", 913 cur->nidl); 914 goto free_data; 915 } 916 len = NVME_NIDT_EUI64_LEN; 917 memcpy(ids->eui64, data + pos + sizeof(*cur), len); 918 break; 919 case NVME_NIDT_NGUID: 920 if (cur->nidl != NVME_NIDT_NGUID_LEN) { 921 dev_warn(ctrl->device, 922 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", 923 cur->nidl); 924 goto free_data; 925 } 926 len = NVME_NIDT_NGUID_LEN; 927 memcpy(ids->nguid, data + pos + sizeof(*cur), len); 928 break; 929 case NVME_NIDT_UUID: 930 if (cur->nidl != NVME_NIDT_UUID_LEN) { 931 dev_warn(ctrl->device, 932 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", 933 cur->nidl); 934 goto free_data; 935 } 936 len = NVME_NIDT_UUID_LEN; 937 uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); 938 break; 939 default: 940 /* Skip unnkown types */ 941 len = cur->nidl; 942 break; 943 } 944 945 len += sizeof(*cur); 946 } 947 free_data: 948 kfree(data); 949 return status; 950 } 951 952 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) 953 { 954 struct nvme_command c = { }; 955 956 c.identify.opcode = nvme_admin_identify; 957 c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; 958 c.identify.nsid = cpu_to_le32(nsid); 959 return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 960 NVME_IDENTIFY_DATA_SIZE); 961 } 962 963 static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, 964 unsigned nsid) 965 { 966 struct nvme_id_ns *id; 967 struct nvme_command c = { }; 968 int error; 969 970 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 971 c.identify.opcode = nvme_admin_identify; 972 c.identify.nsid = cpu_to_le32(nsid); 973 c.identify.cns = NVME_ID_CNS_NS; 974 975 id = kmalloc(sizeof(*id), GFP_KERNEL); 976 if (!id) 977 return NULL; 978 979 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); 980 if (error) { 981 dev_warn(ctrl->device, "Identify namespace failed\n"); 982 kfree(id); 983 return NULL; 984 } 985 986 return id; 987 } 988 989 static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, 990 void *buffer, size_t buflen, u32 *result) 991 { 992 struct nvme_command c; 993 union nvme_result res; 994 int ret; 995 996 memset(&c, 0, sizeof(c)); 997 c.features.opcode = nvme_admin_set_features; 998 c.features.fid = cpu_to_le32(fid); 999 c.features.dword11 = cpu_to_le32(dword11); 1000 1001 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, 1002 buffer, buflen, 0, NVME_QID_ANY, 0, 0); 1003 if (ret >= 0 && result) 1004 *result = le32_to_cpu(res.u32); 1005 return ret; 1006 } 1007 1008 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) 1009 { 1010 u32 q_count = (*count - 1) | ((*count - 1) << 16); 1011 u32 result; 1012 int status, nr_io_queues; 1013 1014 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, 1015 &result); 1016 if (status < 0) 1017 return status; 1018 1019 /* 1020 * Degraded controllers might return an error when setting the queue 1021 * count. We still want to be able to bring them online and offer 1022 * access to the admin queue, as that might be only way to fix them up. 1023 */ 1024 if (status > 0) { 1025 dev_err(ctrl->device, "Could not set queue count (%d)\n", status); 1026 *count = 0; 1027 } else { 1028 nr_io_queues = min(result & 0xffff, result >> 16) + 1; 1029 *count = min(*count, nr_io_queues); 1030 } 1031 1032 return 0; 1033 } 1034 EXPORT_SYMBOL_GPL(nvme_set_queue_count); 1035 1036 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1037 { 1038 struct nvme_user_io io; 1039 struct nvme_command c; 1040 unsigned length, meta_len; 1041 void __user *metadata; 1042 1043 if (copy_from_user(&io, uio, sizeof(io))) 1044 return -EFAULT; 1045 if (io.flags) 1046 return -EINVAL; 1047 1048 switch (io.opcode) { 1049 case nvme_cmd_write: 1050 case nvme_cmd_read: 1051 case nvme_cmd_compare: 1052 break; 1053 default: 1054 return -EINVAL; 1055 } 1056 1057 length = (io.nblocks + 1) << ns->lba_shift; 1058 meta_len = (io.nblocks + 1) * ns->ms; 1059 metadata = (void __user *)(uintptr_t)io.metadata; 1060 1061 if (ns->ext) { 1062 length += meta_len; 1063 meta_len = 0; 1064 } else if (meta_len) { 1065 if ((io.metadata & 3) || !io.metadata) 1066 return -EINVAL; 1067 } 1068 1069 memset(&c, 0, sizeof(c)); 1070 c.rw.opcode = io.opcode; 1071 c.rw.flags = io.flags; 1072 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 1073 c.rw.slba = cpu_to_le64(io.slba); 1074 c.rw.length = cpu_to_le16(io.nblocks); 1075 c.rw.control = cpu_to_le16(io.control); 1076 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1077 c.rw.reftag = cpu_to_le32(io.reftag); 1078 c.rw.apptag = cpu_to_le16(io.apptag); 1079 c.rw.appmask = cpu_to_le16(io.appmask); 1080 1081 return nvme_submit_user_cmd(ns->queue, &c, 1082 (void __user *)(uintptr_t)io.addr, length, 1083 metadata, meta_len, io.slba, NULL, 0); 1084 } 1085 1086 static u32 nvme_known_admin_effects(u8 opcode) 1087 { 1088 switch (opcode) { 1089 case nvme_admin_format_nvm: 1090 return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | 1091 NVME_CMD_EFFECTS_CSE_MASK; 1092 case nvme_admin_sanitize_nvm: 1093 return NVME_CMD_EFFECTS_CSE_MASK; 1094 default: 1095 break; 1096 } 1097 return 0; 1098 } 1099 1100 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1101 u8 opcode) 1102 { 1103 u32 effects = 0; 1104 1105 if (ns) { 1106 if (ctrl->effects) 1107 effects = le32_to_cpu(ctrl->effects->iocs[opcode]); 1108 if (effects & ~NVME_CMD_EFFECTS_CSUPP) 1109 dev_warn(ctrl->device, 1110 "IO command:%02x has unhandled effects:%08x\n", 1111 opcode, effects); 1112 return 0; 1113 } 1114 1115 if (ctrl->effects) 1116 effects = le32_to_cpu(ctrl->effects->acs[opcode]); 1117 else 1118 effects = nvme_known_admin_effects(opcode); 1119 1120 /* 1121 * For simplicity, IO to all namespaces is quiesced even if the command 1122 * effects say only one namespace is affected. 1123 */ 1124 if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { 1125 nvme_start_freeze(ctrl); 1126 nvme_wait_freeze(ctrl); 1127 } 1128 return effects; 1129 } 1130 1131 static void nvme_update_formats(struct nvme_ctrl *ctrl) 1132 { 1133 struct nvme_ns *ns, *next; 1134 LIST_HEAD(rm_list); 1135 1136 down_write(&ctrl->namespaces_rwsem); 1137 list_for_each_entry(ns, &ctrl->namespaces, list) { 1138 if (ns->disk && nvme_revalidate_disk(ns->disk)) { 1139 list_move_tail(&ns->list, &rm_list); 1140 } 1141 } 1142 up_write(&ctrl->namespaces_rwsem); 1143 1144 list_for_each_entry_safe(ns, next, &rm_list, list) 1145 nvme_ns_remove(ns); 1146 } 1147 1148 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) 1149 { 1150 /* 1151 * Revalidate LBA changes prior to unfreezing. This is necessary to 1152 * prevent memory corruption if a logical block size was changed by 1153 * this command. 1154 */ 1155 if (effects & NVME_CMD_EFFECTS_LBCC) 1156 nvme_update_formats(ctrl); 1157 if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) 1158 nvme_unfreeze(ctrl); 1159 if (effects & NVME_CMD_EFFECTS_CCC) 1160 nvme_init_identify(ctrl); 1161 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) 1162 nvme_queue_scan(ctrl); 1163 } 1164 1165 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1166 struct nvme_passthru_cmd __user *ucmd) 1167 { 1168 struct nvme_passthru_cmd cmd; 1169 struct nvme_command c; 1170 unsigned timeout = 0; 1171 u32 effects; 1172 int status; 1173 1174 if (!capable(CAP_SYS_ADMIN)) 1175 return -EACCES; 1176 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1177 return -EFAULT; 1178 if (cmd.flags) 1179 return -EINVAL; 1180 1181 memset(&c, 0, sizeof(c)); 1182 c.common.opcode = cmd.opcode; 1183 c.common.flags = cmd.flags; 1184 c.common.nsid = cpu_to_le32(cmd.nsid); 1185 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1186 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1187 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 1188 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 1189 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 1190 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 1191 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 1192 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 1193 1194 if (cmd.timeout_ms) 1195 timeout = msecs_to_jiffies(cmd.timeout_ms); 1196 1197 effects = nvme_passthru_start(ctrl, ns, cmd.opcode); 1198 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 1199 (void __user *)(uintptr_t)cmd.addr, cmd.data_len, 1200 (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, 1201 0, &cmd.result, timeout); 1202 nvme_passthru_end(ctrl, effects); 1203 1204 if (status >= 0) { 1205 if (put_user(cmd.result, &ucmd->result)) 1206 return -EFAULT; 1207 } 1208 1209 return status; 1210 } 1211 1212 /* 1213 * Issue ioctl requests on the first available path. Note that unlike normal 1214 * block layer requests we will not retry failed request on another controller. 1215 */ 1216 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, 1217 struct nvme_ns_head **head, int *srcu_idx) 1218 { 1219 #ifdef CONFIG_NVME_MULTIPATH 1220 if (disk->fops == &nvme_ns_head_ops) { 1221 *head = disk->private_data; 1222 *srcu_idx = srcu_read_lock(&(*head)->srcu); 1223 return nvme_find_path(*head); 1224 } 1225 #endif 1226 *head = NULL; 1227 *srcu_idx = -1; 1228 return disk->private_data; 1229 } 1230 1231 static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) 1232 { 1233 if (head) 1234 srcu_read_unlock(&head->srcu, idx); 1235 } 1236 1237 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) 1238 { 1239 switch (cmd) { 1240 case NVME_IOCTL_ID: 1241 force_successful_syscall_return(); 1242 return ns->head->ns_id; 1243 case NVME_IOCTL_ADMIN_CMD: 1244 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); 1245 case NVME_IOCTL_IO_CMD: 1246 return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); 1247 case NVME_IOCTL_SUBMIT_IO: 1248 return nvme_submit_io(ns, (void __user *)arg); 1249 default: 1250 #ifdef CONFIG_NVM 1251 if (ns->ndev) 1252 return nvme_nvm_ioctl(ns, cmd, arg); 1253 #endif 1254 if (is_sed_ioctl(cmd)) 1255 return sed_ioctl(ns->ctrl->opal_dev, cmd, 1256 (void __user *) arg); 1257 return -ENOTTY; 1258 } 1259 } 1260 1261 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, 1262 unsigned int cmd, unsigned long arg) 1263 { 1264 struct nvme_ns_head *head = NULL; 1265 struct nvme_ns *ns; 1266 int srcu_idx, ret; 1267 1268 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 1269 if (unlikely(!ns)) 1270 ret = -EWOULDBLOCK; 1271 else 1272 ret = nvme_ns_ioctl(ns, cmd, arg); 1273 nvme_put_ns_from_disk(head, srcu_idx); 1274 return ret; 1275 } 1276 1277 static int nvme_open(struct block_device *bdev, fmode_t mode) 1278 { 1279 struct nvme_ns *ns = bdev->bd_disk->private_data; 1280 1281 #ifdef CONFIG_NVME_MULTIPATH 1282 /* should never be called due to GENHD_FL_HIDDEN */ 1283 if (WARN_ON_ONCE(ns->head->disk)) 1284 goto fail; 1285 #endif 1286 if (!kref_get_unless_zero(&ns->kref)) 1287 goto fail; 1288 if (!try_module_get(ns->ctrl->ops->module)) 1289 goto fail_put_ns; 1290 1291 return 0; 1292 1293 fail_put_ns: 1294 nvme_put_ns(ns); 1295 fail: 1296 return -ENXIO; 1297 } 1298 1299 static void nvme_release(struct gendisk *disk, fmode_t mode) 1300 { 1301 struct nvme_ns *ns = disk->private_data; 1302 1303 module_put(ns->ctrl->ops->module); 1304 nvme_put_ns(ns); 1305 } 1306 1307 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1308 { 1309 /* some standard values */ 1310 geo->heads = 1 << 6; 1311 geo->sectors = 1 << 5; 1312 geo->cylinders = get_capacity(bdev->bd_disk) >> 11; 1313 return 0; 1314 } 1315 1316 #ifdef CONFIG_BLK_DEV_INTEGRITY 1317 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) 1318 { 1319 struct blk_integrity integrity; 1320 1321 memset(&integrity, 0, sizeof(integrity)); 1322 switch (pi_type) { 1323 case NVME_NS_DPS_PI_TYPE3: 1324 integrity.profile = &t10_pi_type3_crc; 1325 integrity.tag_size = sizeof(u16) + sizeof(u32); 1326 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1327 break; 1328 case NVME_NS_DPS_PI_TYPE1: 1329 case NVME_NS_DPS_PI_TYPE2: 1330 integrity.profile = &t10_pi_type1_crc; 1331 integrity.tag_size = sizeof(u16); 1332 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1333 break; 1334 default: 1335 integrity.profile = NULL; 1336 break; 1337 } 1338 integrity.tuple_size = ms; 1339 blk_integrity_register(disk, &integrity); 1340 blk_queue_max_integrity_segments(disk->queue, 1); 1341 } 1342 #else 1343 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) 1344 { 1345 } 1346 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1347 1348 static void nvme_set_chunk_size(struct nvme_ns *ns) 1349 { 1350 u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9)); 1351 blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); 1352 } 1353 1354 static void nvme_config_discard(struct nvme_ctrl *ctrl, 1355 unsigned stream_alignment, struct request_queue *queue) 1356 { 1357 u32 size = queue_logical_block_size(queue); 1358 1359 if (stream_alignment) 1360 size *= stream_alignment; 1361 1362 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < 1363 NVME_DSM_MAX_RANGES); 1364 1365 queue->limits.discard_alignment = 0; 1366 queue->limits.discard_granularity = size; 1367 1368 blk_queue_max_discard_sectors(queue, UINT_MAX); 1369 blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); 1370 blk_queue_flag_set(QUEUE_FLAG_DISCARD, queue); 1371 1372 if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 1373 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); 1374 } 1375 1376 static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, 1377 struct nvme_id_ns *id, struct nvme_ns_ids *ids) 1378 { 1379 memset(ids, 0, sizeof(*ids)); 1380 1381 if (ctrl->vs >= NVME_VS(1, 1, 0)) 1382 memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); 1383 if (ctrl->vs >= NVME_VS(1, 2, 0)) 1384 memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); 1385 if (ctrl->vs >= NVME_VS(1, 3, 0)) { 1386 /* Don't treat error as fatal we potentially 1387 * already have a NGUID or EUI-64 1388 */ 1389 if (nvme_identify_ns_descs(ctrl, nsid, ids)) 1390 dev_warn(ctrl->device, 1391 "%s: Identify Descriptors failed\n", __func__); 1392 } 1393 } 1394 1395 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) 1396 { 1397 return !uuid_is_null(&ids->uuid) || 1398 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || 1399 memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); 1400 } 1401 1402 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) 1403 { 1404 return uuid_equal(&a->uuid, &b->uuid) && 1405 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && 1406 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; 1407 } 1408 1409 static void nvme_update_disk_info(struct gendisk *disk, 1410 struct nvme_ns *ns, struct nvme_id_ns *id) 1411 { 1412 sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); 1413 unsigned short bs = 1 << ns->lba_shift; 1414 unsigned stream_alignment = 0; 1415 1416 if (ns->ctrl->nr_streams && ns->sws && ns->sgs) 1417 stream_alignment = ns->sws * ns->sgs; 1418 1419 blk_mq_freeze_queue(disk->queue); 1420 blk_integrity_unregister(disk); 1421 1422 blk_queue_logical_block_size(disk->queue, bs); 1423 blk_queue_physical_block_size(disk->queue, bs); 1424 blk_queue_io_min(disk->queue, bs); 1425 1426 if (ns->ms && !ns->ext && 1427 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) 1428 nvme_init_integrity(disk, ns->ms, ns->pi_type); 1429 if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) 1430 capacity = 0; 1431 set_capacity(disk, capacity); 1432 1433 if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) 1434 nvme_config_discard(ns->ctrl, stream_alignment, disk->queue); 1435 blk_mq_unfreeze_queue(disk->queue); 1436 } 1437 1438 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) 1439 { 1440 struct nvme_ns *ns = disk->private_data; 1441 1442 /* 1443 * If identify namespace failed, use default 512 byte block size so 1444 * block layer can use before failing read/write for 0 capacity. 1445 */ 1446 ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; 1447 if (ns->lba_shift == 0) 1448 ns->lba_shift = 9; 1449 ns->noiob = le16_to_cpu(id->noiob); 1450 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); 1451 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); 1452 /* the PI implementation requires metadata equal t10 pi tuple size */ 1453 if (ns->ms == sizeof(struct t10_pi_tuple)) 1454 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; 1455 else 1456 ns->pi_type = 0; 1457 1458 if (ns->noiob) 1459 nvme_set_chunk_size(ns); 1460 nvme_update_disk_info(disk, ns, id); 1461 if (ns->ndev) 1462 nvme_nvm_update_nvm_info(ns); 1463 #ifdef CONFIG_NVME_MULTIPATH 1464 if (ns->head->disk) 1465 nvme_update_disk_info(ns->head->disk, ns, id); 1466 #endif 1467 } 1468 1469 static int nvme_revalidate_disk(struct gendisk *disk) 1470 { 1471 struct nvme_ns *ns = disk->private_data; 1472 struct nvme_ctrl *ctrl = ns->ctrl; 1473 struct nvme_id_ns *id; 1474 struct nvme_ns_ids ids; 1475 int ret = 0; 1476 1477 if (test_bit(NVME_NS_DEAD, &ns->flags)) { 1478 set_capacity(disk, 0); 1479 return -ENODEV; 1480 } 1481 1482 id = nvme_identify_ns(ctrl, ns->head->ns_id); 1483 if (!id) 1484 return -ENODEV; 1485 1486 if (id->ncap == 0) { 1487 ret = -ENODEV; 1488 goto out; 1489 } 1490 1491 __nvme_revalidate_disk(disk, id); 1492 nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); 1493 if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { 1494 dev_err(ctrl->device, 1495 "identifiers changed for nsid %d\n", ns->head->ns_id); 1496 ret = -ENODEV; 1497 } 1498 1499 out: 1500 kfree(id); 1501 return ret; 1502 } 1503 1504 static char nvme_pr_type(enum pr_type type) 1505 { 1506 switch (type) { 1507 case PR_WRITE_EXCLUSIVE: 1508 return 1; 1509 case PR_EXCLUSIVE_ACCESS: 1510 return 2; 1511 case PR_WRITE_EXCLUSIVE_REG_ONLY: 1512 return 3; 1513 case PR_EXCLUSIVE_ACCESS_REG_ONLY: 1514 return 4; 1515 case PR_WRITE_EXCLUSIVE_ALL_REGS: 1516 return 5; 1517 case PR_EXCLUSIVE_ACCESS_ALL_REGS: 1518 return 6; 1519 default: 1520 return 0; 1521 } 1522 }; 1523 1524 static int nvme_pr_command(struct block_device *bdev, u32 cdw10, 1525 u64 key, u64 sa_key, u8 op) 1526 { 1527 struct nvme_ns_head *head = NULL; 1528 struct nvme_ns *ns; 1529 struct nvme_command c; 1530 int srcu_idx, ret; 1531 u8 data[16] = { 0, }; 1532 1533 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 1534 if (unlikely(!ns)) 1535 return -EWOULDBLOCK; 1536 1537 put_unaligned_le64(key, &data[0]); 1538 put_unaligned_le64(sa_key, &data[8]); 1539 1540 memset(&c, 0, sizeof(c)); 1541 c.common.opcode = op; 1542 c.common.nsid = cpu_to_le32(ns->head->ns_id); 1543 c.common.cdw10[0] = cpu_to_le32(cdw10); 1544 1545 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); 1546 nvme_put_ns_from_disk(head, srcu_idx); 1547 return ret; 1548 } 1549 1550 static int nvme_pr_register(struct block_device *bdev, u64 old, 1551 u64 new, unsigned flags) 1552 { 1553 u32 cdw10; 1554 1555 if (flags & ~PR_FL_IGNORE_KEY) 1556 return -EOPNOTSUPP; 1557 1558 cdw10 = old ? 2 : 0; 1559 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; 1560 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ 1561 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); 1562 } 1563 1564 static int nvme_pr_reserve(struct block_device *bdev, u64 key, 1565 enum pr_type type, unsigned flags) 1566 { 1567 u32 cdw10; 1568 1569 if (flags & ~PR_FL_IGNORE_KEY) 1570 return -EOPNOTSUPP; 1571 1572 cdw10 = nvme_pr_type(type) << 8; 1573 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); 1574 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); 1575 } 1576 1577 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, 1578 enum pr_type type, bool abort) 1579 { 1580 u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; 1581 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); 1582 } 1583 1584 static int nvme_pr_clear(struct block_device *bdev, u64 key) 1585 { 1586 u32 cdw10 = 1 | (key ? 1 << 3 : 0); 1587 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); 1588 } 1589 1590 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 1591 { 1592 u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0; 1593 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 1594 } 1595 1596 static const struct pr_ops nvme_pr_ops = { 1597 .pr_register = nvme_pr_register, 1598 .pr_reserve = nvme_pr_reserve, 1599 .pr_release = nvme_pr_release, 1600 .pr_preempt = nvme_pr_preempt, 1601 .pr_clear = nvme_pr_clear, 1602 }; 1603 1604 #ifdef CONFIG_BLK_SED_OPAL 1605 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, 1606 bool send) 1607 { 1608 struct nvme_ctrl *ctrl = data; 1609 struct nvme_command cmd; 1610 1611 memset(&cmd, 0, sizeof(cmd)); 1612 if (send) 1613 cmd.common.opcode = nvme_admin_security_send; 1614 else 1615 cmd.common.opcode = nvme_admin_security_recv; 1616 cmd.common.nsid = 0; 1617 cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); 1618 cmd.common.cdw10[1] = cpu_to_le32(len); 1619 1620 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 1621 ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); 1622 } 1623 EXPORT_SYMBOL_GPL(nvme_sec_submit); 1624 #endif /* CONFIG_BLK_SED_OPAL */ 1625 1626 static const struct block_device_operations nvme_fops = { 1627 .owner = THIS_MODULE, 1628 .ioctl = nvme_ioctl, 1629 .compat_ioctl = nvme_ioctl, 1630 .open = nvme_open, 1631 .release = nvme_release, 1632 .getgeo = nvme_getgeo, 1633 .revalidate_disk= nvme_revalidate_disk, 1634 .pr_ops = &nvme_pr_ops, 1635 }; 1636 1637 #ifdef CONFIG_NVME_MULTIPATH 1638 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) 1639 { 1640 struct nvme_ns_head *head = bdev->bd_disk->private_data; 1641 1642 if (!kref_get_unless_zero(&head->ref)) 1643 return -ENXIO; 1644 return 0; 1645 } 1646 1647 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) 1648 { 1649 nvme_put_ns_head(disk->private_data); 1650 } 1651 1652 const struct block_device_operations nvme_ns_head_ops = { 1653 .owner = THIS_MODULE, 1654 .open = nvme_ns_head_open, 1655 .release = nvme_ns_head_release, 1656 .ioctl = nvme_ioctl, 1657 .compat_ioctl = nvme_ioctl, 1658 .getgeo = nvme_getgeo, 1659 .pr_ops = &nvme_pr_ops, 1660 }; 1661 #endif /* CONFIG_NVME_MULTIPATH */ 1662 1663 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) 1664 { 1665 unsigned long timeout = 1666 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1667 u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; 1668 int ret; 1669 1670 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 1671 if (csts == ~0) 1672 return -ENODEV; 1673 if ((csts & NVME_CSTS_RDY) == bit) 1674 break; 1675 1676 msleep(100); 1677 if (fatal_signal_pending(current)) 1678 return -EINTR; 1679 if (time_after(jiffies, timeout)) { 1680 dev_err(ctrl->device, 1681 "Device not ready; aborting %s\n", enabled ? 1682 "initialisation" : "reset"); 1683 return -ENODEV; 1684 } 1685 } 1686 1687 return ret; 1688 } 1689 1690 /* 1691 * If the device has been passed off to us in an enabled state, just clear 1692 * the enabled bit. The spec says we should set the 'shutdown notification 1693 * bits', but doing so may cause the device to complete commands to the 1694 * admin queue ... and we don't know what memory that might be pointing at! 1695 */ 1696 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) 1697 { 1698 int ret; 1699 1700 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 1701 ctrl->ctrl_config &= ~NVME_CC_ENABLE; 1702 1703 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 1704 if (ret) 1705 return ret; 1706 1707 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) 1708 msleep(NVME_QUIRK_DELAY_AMOUNT); 1709 1710 return nvme_wait_ready(ctrl, cap, false); 1711 } 1712 EXPORT_SYMBOL_GPL(nvme_disable_ctrl); 1713 1714 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) 1715 { 1716 /* 1717 * Default to a 4K page size, with the intention to update this 1718 * path in the future to accomodate architectures with differing 1719 * kernel and IO page sizes. 1720 */ 1721 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; 1722 int ret; 1723 1724 if (page_shift < dev_page_min) { 1725 dev_err(ctrl->device, 1726 "Minimum device page size %u too large for host (%u)\n", 1727 1 << dev_page_min, 1 << page_shift); 1728 return -ENODEV; 1729 } 1730 1731 ctrl->page_size = 1 << page_shift; 1732 1733 ctrl->ctrl_config = NVME_CC_CSS_NVM; 1734 ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; 1735 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; 1736 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1737 ctrl->ctrl_config |= NVME_CC_ENABLE; 1738 1739 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 1740 if (ret) 1741 return ret; 1742 return nvme_wait_ready(ctrl, cap, true); 1743 } 1744 EXPORT_SYMBOL_GPL(nvme_enable_ctrl); 1745 1746 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) 1747 { 1748 unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); 1749 u32 csts; 1750 int ret; 1751 1752 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 1753 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; 1754 1755 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 1756 if (ret) 1757 return ret; 1758 1759 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 1760 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) 1761 break; 1762 1763 msleep(100); 1764 if (fatal_signal_pending(current)) 1765 return -EINTR; 1766 if (time_after(jiffies, timeout)) { 1767 dev_err(ctrl->device, 1768 "Device shutdown incomplete; abort shutdown\n"); 1769 return -ENODEV; 1770 } 1771 } 1772 1773 return ret; 1774 } 1775 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); 1776 1777 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, 1778 struct request_queue *q) 1779 { 1780 bool vwc = false; 1781 1782 if (ctrl->max_hw_sectors) { 1783 u32 max_segments = 1784 (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; 1785 1786 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); 1787 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); 1788 } 1789 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && 1790 is_power_of_2(ctrl->max_hw_sectors)) 1791 blk_queue_chunk_sectors(q, ctrl->max_hw_sectors); 1792 blk_queue_virt_boundary(q, ctrl->page_size - 1); 1793 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 1794 vwc = true; 1795 blk_queue_write_cache(q, vwc, vwc); 1796 } 1797 1798 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) 1799 { 1800 __le64 ts; 1801 int ret; 1802 1803 if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) 1804 return 0; 1805 1806 ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); 1807 ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), 1808 NULL); 1809 if (ret) 1810 dev_warn_once(ctrl->device, 1811 "could not set timestamp (%d)\n", ret); 1812 return ret; 1813 } 1814 1815 static int nvme_configure_apst(struct nvme_ctrl *ctrl) 1816 { 1817 /* 1818 * APST (Autonomous Power State Transition) lets us program a 1819 * table of power state transitions that the controller will 1820 * perform automatically. We configure it with a simple 1821 * heuristic: we are willing to spend at most 2% of the time 1822 * transitioning between power states. Therefore, when running 1823 * in any given state, we will enter the next lower-power 1824 * non-operational state after waiting 50 * (enlat + exlat) 1825 * microseconds, as long as that state's exit latency is under 1826 * the requested maximum latency. 1827 * 1828 * We will not autonomously enter any non-operational state for 1829 * which the total latency exceeds ps_max_latency_us. Users 1830 * can set ps_max_latency_us to zero to turn off APST. 1831 */ 1832 1833 unsigned apste; 1834 struct nvme_feat_auto_pst *table; 1835 u64 max_lat_us = 0; 1836 int max_ps = -1; 1837 int ret; 1838 1839 /* 1840 * If APST isn't supported or if we haven't been initialized yet, 1841 * then don't do anything. 1842 */ 1843 if (!ctrl->apsta) 1844 return 0; 1845 1846 if (ctrl->npss > 31) { 1847 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); 1848 return 0; 1849 } 1850 1851 table = kzalloc(sizeof(*table), GFP_KERNEL); 1852 if (!table) 1853 return 0; 1854 1855 if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { 1856 /* Turn off APST. */ 1857 apste = 0; 1858 dev_dbg(ctrl->device, "APST disabled\n"); 1859 } else { 1860 __le64 target = cpu_to_le64(0); 1861 int state; 1862 1863 /* 1864 * Walk through all states from lowest- to highest-power. 1865 * According to the spec, lower-numbered states use more 1866 * power. NPSS, despite the name, is the index of the 1867 * lowest-power state, not the number of states. 1868 */ 1869 for (state = (int)ctrl->npss; state >= 0; state--) { 1870 u64 total_latency_us, exit_latency_us, transition_ms; 1871 1872 if (target) 1873 table->entries[state] = target; 1874 1875 /* 1876 * Don't allow transitions to the deepest state 1877 * if it's quirked off. 1878 */ 1879 if (state == ctrl->npss && 1880 (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) 1881 continue; 1882 1883 /* 1884 * Is this state a useful non-operational state for 1885 * higher-power states to autonomously transition to? 1886 */ 1887 if (!(ctrl->psd[state].flags & 1888 NVME_PS_FLAGS_NON_OP_STATE)) 1889 continue; 1890 1891 exit_latency_us = 1892 (u64)le32_to_cpu(ctrl->psd[state].exit_lat); 1893 if (exit_latency_us > ctrl->ps_max_latency_us) 1894 continue; 1895 1896 total_latency_us = 1897 exit_latency_us + 1898 le32_to_cpu(ctrl->psd[state].entry_lat); 1899 1900 /* 1901 * This state is good. Use it as the APST idle 1902 * target for higher power states. 1903 */ 1904 transition_ms = total_latency_us + 19; 1905 do_div(transition_ms, 20); 1906 if (transition_ms > (1 << 24) - 1) 1907 transition_ms = (1 << 24) - 1; 1908 1909 target = cpu_to_le64((state << 3) | 1910 (transition_ms << 8)); 1911 1912 if (max_ps == -1) 1913 max_ps = state; 1914 1915 if (total_latency_us > max_lat_us) 1916 max_lat_us = total_latency_us; 1917 } 1918 1919 apste = 1; 1920 1921 if (max_ps == -1) { 1922 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); 1923 } else { 1924 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", 1925 max_ps, max_lat_us, (int)sizeof(*table), table); 1926 } 1927 } 1928 1929 ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, 1930 table, sizeof(*table), NULL); 1931 if (ret) 1932 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); 1933 1934 kfree(table); 1935 return ret; 1936 } 1937 1938 static void nvme_set_latency_tolerance(struct device *dev, s32 val) 1939 { 1940 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 1941 u64 latency; 1942 1943 switch (val) { 1944 case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: 1945 case PM_QOS_LATENCY_ANY: 1946 latency = U64_MAX; 1947 break; 1948 1949 default: 1950 latency = val; 1951 } 1952 1953 if (ctrl->ps_max_latency_us != latency) { 1954 ctrl->ps_max_latency_us = latency; 1955 nvme_configure_apst(ctrl); 1956 } 1957 } 1958 1959 struct nvme_core_quirk_entry { 1960 /* 1961 * NVMe model and firmware strings are padded with spaces. For 1962 * simplicity, strings in the quirk table are padded with NULLs 1963 * instead. 1964 */ 1965 u16 vid; 1966 const char *mn; 1967 const char *fr; 1968 unsigned long quirks; 1969 }; 1970 1971 static const struct nvme_core_quirk_entry core_quirks[] = { 1972 { 1973 /* 1974 * This Toshiba device seems to die using any APST states. See: 1975 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11 1976 */ 1977 .vid = 0x1179, 1978 .mn = "THNSF5256GPUK TOSHIBA", 1979 .quirks = NVME_QUIRK_NO_APST, 1980 } 1981 }; 1982 1983 /* match is null-terminated but idstr is space-padded. */ 1984 static bool string_matches(const char *idstr, const char *match, size_t len) 1985 { 1986 size_t matchlen; 1987 1988 if (!match) 1989 return true; 1990 1991 matchlen = strlen(match); 1992 WARN_ON_ONCE(matchlen > len); 1993 1994 if (memcmp(idstr, match, matchlen)) 1995 return false; 1996 1997 for (; matchlen < len; matchlen++) 1998 if (idstr[matchlen] != ' ') 1999 return false; 2000 2001 return true; 2002 } 2003 2004 static bool quirk_matches(const struct nvme_id_ctrl *id, 2005 const struct nvme_core_quirk_entry *q) 2006 { 2007 return q->vid == le16_to_cpu(id->vid) && 2008 string_matches(id->mn, q->mn, sizeof(id->mn)) && 2009 string_matches(id->fr, q->fr, sizeof(id->fr)); 2010 } 2011 2012 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, 2013 struct nvme_id_ctrl *id) 2014 { 2015 size_t nqnlen; 2016 int off; 2017 2018 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); 2019 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { 2020 strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); 2021 return; 2022 } 2023 2024 if (ctrl->vs >= NVME_VS(1, 2, 1)) 2025 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); 2026 2027 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ 2028 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, 2029 "nqn.2014.08.org.nvmexpress:%4x%4x", 2030 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); 2031 memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); 2032 off += sizeof(id->sn); 2033 memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); 2034 off += sizeof(id->mn); 2035 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); 2036 } 2037 2038 static void __nvme_release_subsystem(struct nvme_subsystem *subsys) 2039 { 2040 ida_simple_remove(&nvme_subsystems_ida, subsys->instance); 2041 kfree(subsys); 2042 } 2043 2044 static void nvme_release_subsystem(struct device *dev) 2045 { 2046 __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev)); 2047 } 2048 2049 static void nvme_destroy_subsystem(struct kref *ref) 2050 { 2051 struct nvme_subsystem *subsys = 2052 container_of(ref, struct nvme_subsystem, ref); 2053 2054 mutex_lock(&nvme_subsystems_lock); 2055 list_del(&subsys->entry); 2056 mutex_unlock(&nvme_subsystems_lock); 2057 2058 ida_destroy(&subsys->ns_ida); 2059 device_del(&subsys->dev); 2060 put_device(&subsys->dev); 2061 } 2062 2063 static void nvme_put_subsystem(struct nvme_subsystem *subsys) 2064 { 2065 kref_put(&subsys->ref, nvme_destroy_subsystem); 2066 } 2067 2068 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) 2069 { 2070 struct nvme_subsystem *subsys; 2071 2072 lockdep_assert_held(&nvme_subsystems_lock); 2073 2074 list_for_each_entry(subsys, &nvme_subsystems, entry) { 2075 if (strcmp(subsys->subnqn, subsysnqn)) 2076 continue; 2077 if (!kref_get_unless_zero(&subsys->ref)) 2078 continue; 2079 return subsys; 2080 } 2081 2082 return NULL; 2083 } 2084 2085 #define SUBSYS_ATTR_RO(_name, _mode, _show) \ 2086 struct device_attribute subsys_attr_##_name = \ 2087 __ATTR(_name, _mode, _show, NULL) 2088 2089 static ssize_t nvme_subsys_show_nqn(struct device *dev, 2090 struct device_attribute *attr, 2091 char *buf) 2092 { 2093 struct nvme_subsystem *subsys = 2094 container_of(dev, struct nvme_subsystem, dev); 2095 2096 return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); 2097 } 2098 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); 2099 2100 #define nvme_subsys_show_str_function(field) \ 2101 static ssize_t subsys_##field##_show(struct device *dev, \ 2102 struct device_attribute *attr, char *buf) \ 2103 { \ 2104 struct nvme_subsystem *subsys = \ 2105 container_of(dev, struct nvme_subsystem, dev); \ 2106 return sprintf(buf, "%.*s\n", \ 2107 (int)sizeof(subsys->field), subsys->field); \ 2108 } \ 2109 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); 2110 2111 nvme_subsys_show_str_function(model); 2112 nvme_subsys_show_str_function(serial); 2113 nvme_subsys_show_str_function(firmware_rev); 2114 2115 static struct attribute *nvme_subsys_attrs[] = { 2116 &subsys_attr_model.attr, 2117 &subsys_attr_serial.attr, 2118 &subsys_attr_firmware_rev.attr, 2119 &subsys_attr_subsysnqn.attr, 2120 NULL, 2121 }; 2122 2123 static struct attribute_group nvme_subsys_attrs_group = { 2124 .attrs = nvme_subsys_attrs, 2125 }; 2126 2127 static const struct attribute_group *nvme_subsys_attrs_groups[] = { 2128 &nvme_subsys_attrs_group, 2129 NULL, 2130 }; 2131 2132 static int nvme_active_ctrls(struct nvme_subsystem *subsys) 2133 { 2134 int count = 0; 2135 struct nvme_ctrl *ctrl; 2136 2137 mutex_lock(&subsys->lock); 2138 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { 2139 if (ctrl->state != NVME_CTRL_DELETING && 2140 ctrl->state != NVME_CTRL_DEAD) 2141 count++; 2142 } 2143 mutex_unlock(&subsys->lock); 2144 2145 return count; 2146 } 2147 2148 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2149 { 2150 struct nvme_subsystem *subsys, *found; 2151 int ret; 2152 2153 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); 2154 if (!subsys) 2155 return -ENOMEM; 2156 ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); 2157 if (ret < 0) { 2158 kfree(subsys); 2159 return ret; 2160 } 2161 subsys->instance = ret; 2162 mutex_init(&subsys->lock); 2163 kref_init(&subsys->ref); 2164 INIT_LIST_HEAD(&subsys->ctrls); 2165 INIT_LIST_HEAD(&subsys->nsheads); 2166 nvme_init_subnqn(subsys, ctrl, id); 2167 memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); 2168 memcpy(subsys->model, id->mn, sizeof(subsys->model)); 2169 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); 2170 subsys->vendor_id = le16_to_cpu(id->vid); 2171 subsys->cmic = id->cmic; 2172 2173 subsys->dev.class = nvme_subsys_class; 2174 subsys->dev.release = nvme_release_subsystem; 2175 subsys->dev.groups = nvme_subsys_attrs_groups; 2176 dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); 2177 device_initialize(&subsys->dev); 2178 2179 mutex_lock(&nvme_subsystems_lock); 2180 found = __nvme_find_get_subsystem(subsys->subnqn); 2181 if (found) { 2182 /* 2183 * Verify that the subsystem actually supports multiple 2184 * controllers, else bail out. 2185 */ 2186 if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { 2187 dev_err(ctrl->device, 2188 "ignoring ctrl due to duplicate subnqn (%s).\n", 2189 found->subnqn); 2190 nvme_put_subsystem(found); 2191 ret = -EINVAL; 2192 goto out_unlock; 2193 } 2194 2195 __nvme_release_subsystem(subsys); 2196 subsys = found; 2197 } else { 2198 ret = device_add(&subsys->dev); 2199 if (ret) { 2200 dev_err(ctrl->device, 2201 "failed to register subsystem device.\n"); 2202 goto out_unlock; 2203 } 2204 ida_init(&subsys->ns_ida); 2205 list_add_tail(&subsys->entry, &nvme_subsystems); 2206 } 2207 2208 ctrl->subsys = subsys; 2209 mutex_unlock(&nvme_subsystems_lock); 2210 2211 if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, 2212 dev_name(ctrl->device))) { 2213 dev_err(ctrl->device, 2214 "failed to create sysfs link from subsystem.\n"); 2215 /* the transport driver will eventually put the subsystem */ 2216 return -EINVAL; 2217 } 2218 2219 mutex_lock(&subsys->lock); 2220 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); 2221 mutex_unlock(&subsys->lock); 2222 2223 return 0; 2224 2225 out_unlock: 2226 mutex_unlock(&nvme_subsystems_lock); 2227 put_device(&subsys->dev); 2228 return ret; 2229 } 2230 2231 int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 2232 u8 log_page, void *log, 2233 size_t size, u64 offset) 2234 { 2235 struct nvme_command c = { }; 2236 unsigned long dwlen = size / 4 - 1; 2237 2238 c.get_log_page.opcode = nvme_admin_get_log_page; 2239 2240 if (ns) 2241 c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id); 2242 else 2243 c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL); 2244 2245 c.get_log_page.lid = log_page; 2246 c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); 2247 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); 2248 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); 2249 c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); 2250 2251 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); 2252 } 2253 2254 static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, 2255 size_t size) 2256 { 2257 return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0); 2258 } 2259 2260 static int nvme_get_effects_log(struct nvme_ctrl *ctrl) 2261 { 2262 int ret; 2263 2264 if (!ctrl->effects) 2265 ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); 2266 2267 if (!ctrl->effects) 2268 return 0; 2269 2270 ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, 2271 sizeof(*ctrl->effects)); 2272 if (ret) { 2273 kfree(ctrl->effects); 2274 ctrl->effects = NULL; 2275 } 2276 return ret; 2277 } 2278 2279 /* 2280 * Initialize the cached copies of the Identify data and various controller 2281 * register in our nvme_ctrl structure. This should be called as soon as 2282 * the admin queue is fully up and running. 2283 */ 2284 int nvme_init_identify(struct nvme_ctrl *ctrl) 2285 { 2286 struct nvme_id_ctrl *id; 2287 u64 cap; 2288 int ret, page_shift; 2289 u32 max_hw_sectors; 2290 bool prev_apst_enabled; 2291 2292 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); 2293 if (ret) { 2294 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); 2295 return ret; 2296 } 2297 2298 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); 2299 if (ret) { 2300 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); 2301 return ret; 2302 } 2303 page_shift = NVME_CAP_MPSMIN(cap) + 12; 2304 2305 if (ctrl->vs >= NVME_VS(1, 1, 0)) 2306 ctrl->subsystem = NVME_CAP_NSSRC(cap); 2307 2308 ret = nvme_identify_ctrl(ctrl, &id); 2309 if (ret) { 2310 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); 2311 return -EIO; 2312 } 2313 2314 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { 2315 ret = nvme_get_effects_log(ctrl); 2316 if (ret < 0) 2317 return ret; 2318 } 2319 2320 if (!ctrl->identified) { 2321 int i; 2322 2323 ret = nvme_init_subsystem(ctrl, id); 2324 if (ret) 2325 goto out_free; 2326 2327 /* 2328 * Check for quirks. Quirk can depend on firmware version, 2329 * so, in principle, the set of quirks present can change 2330 * across a reset. As a possible future enhancement, we 2331 * could re-scan for quirks every time we reinitialize 2332 * the device, but we'd have to make sure that the driver 2333 * behaves intelligently if the quirks change. 2334 */ 2335 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { 2336 if (quirk_matches(id, &core_quirks[i])) 2337 ctrl->quirks |= core_quirks[i].quirks; 2338 } 2339 } 2340 2341 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { 2342 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); 2343 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; 2344 } 2345 2346 ctrl->oacs = le16_to_cpu(id->oacs); 2347 ctrl->oncs = le16_to_cpup(&id->oncs); 2348 atomic_set(&ctrl->abort_limit, id->acl + 1); 2349 ctrl->vwc = id->vwc; 2350 ctrl->cntlid = le16_to_cpup(&id->cntlid); 2351 if (id->mdts) 2352 max_hw_sectors = 1 << (id->mdts + page_shift - 9); 2353 else 2354 max_hw_sectors = UINT_MAX; 2355 ctrl->max_hw_sectors = 2356 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); 2357 2358 nvme_set_queue_limits(ctrl, ctrl->admin_q); 2359 ctrl->sgls = le32_to_cpu(id->sgls); 2360 ctrl->kas = le16_to_cpu(id->kas); 2361 2362 if (id->rtd3e) { 2363 /* us -> s */ 2364 u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; 2365 2366 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, 2367 shutdown_timeout, 60); 2368 2369 if (ctrl->shutdown_timeout != shutdown_timeout) 2370 dev_info(ctrl->device, 2371 "Shutdown timeout set to %u seconds\n", 2372 ctrl->shutdown_timeout); 2373 } else 2374 ctrl->shutdown_timeout = shutdown_timeout; 2375 2376 ctrl->npss = id->npss; 2377 ctrl->apsta = id->apsta; 2378 prev_apst_enabled = ctrl->apst_enabled; 2379 if (ctrl->quirks & NVME_QUIRK_NO_APST) { 2380 if (force_apst && id->apsta) { 2381 dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); 2382 ctrl->apst_enabled = true; 2383 } else { 2384 ctrl->apst_enabled = false; 2385 } 2386 } else { 2387 ctrl->apst_enabled = id->apsta; 2388 } 2389 memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); 2390 2391 if (ctrl->ops->flags & NVME_F_FABRICS) { 2392 ctrl->icdoff = le16_to_cpu(id->icdoff); 2393 ctrl->ioccsz = le32_to_cpu(id->ioccsz); 2394 ctrl->iorcsz = le32_to_cpu(id->iorcsz); 2395 ctrl->maxcmd = le16_to_cpu(id->maxcmd); 2396 2397 /* 2398 * In fabrics we need to verify the cntlid matches the 2399 * admin connect 2400 */ 2401 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { 2402 ret = -EINVAL; 2403 goto out_free; 2404 } 2405 2406 if (!ctrl->opts->discovery_nqn && !ctrl->kas) { 2407 dev_err(ctrl->device, 2408 "keep-alive support is mandatory for fabrics\n"); 2409 ret = -EINVAL; 2410 goto out_free; 2411 } 2412 } else { 2413 ctrl->cntlid = le16_to_cpu(id->cntlid); 2414 ctrl->hmpre = le32_to_cpu(id->hmpre); 2415 ctrl->hmmin = le32_to_cpu(id->hmmin); 2416 ctrl->hmminds = le32_to_cpu(id->hmminds); 2417 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); 2418 } 2419 2420 kfree(id); 2421 2422 if (ctrl->apst_enabled && !prev_apst_enabled) 2423 dev_pm_qos_expose_latency_tolerance(ctrl->device); 2424 else if (!ctrl->apst_enabled && prev_apst_enabled) 2425 dev_pm_qos_hide_latency_tolerance(ctrl->device); 2426 2427 ret = nvme_configure_apst(ctrl); 2428 if (ret < 0) 2429 return ret; 2430 2431 ret = nvme_configure_timestamp(ctrl); 2432 if (ret < 0) 2433 return ret; 2434 2435 ret = nvme_configure_directives(ctrl); 2436 if (ret < 0) 2437 return ret; 2438 2439 ctrl->identified = true; 2440 2441 return 0; 2442 2443 out_free: 2444 kfree(id); 2445 return ret; 2446 } 2447 EXPORT_SYMBOL_GPL(nvme_init_identify); 2448 2449 static int nvme_dev_open(struct inode *inode, struct file *file) 2450 { 2451 struct nvme_ctrl *ctrl = 2452 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 2453 2454 switch (ctrl->state) { 2455 case NVME_CTRL_LIVE: 2456 case NVME_CTRL_ADMIN_ONLY: 2457 break; 2458 default: 2459 return -EWOULDBLOCK; 2460 } 2461 2462 file->private_data = ctrl; 2463 return 0; 2464 } 2465 2466 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) 2467 { 2468 struct nvme_ns *ns; 2469 int ret; 2470 2471 down_read(&ctrl->namespaces_rwsem); 2472 if (list_empty(&ctrl->namespaces)) { 2473 ret = -ENOTTY; 2474 goto out_unlock; 2475 } 2476 2477 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); 2478 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 2479 dev_warn(ctrl->device, 2480 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 2481 ret = -EINVAL; 2482 goto out_unlock; 2483 } 2484 2485 dev_warn(ctrl->device, 2486 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 2487 kref_get(&ns->kref); 2488 up_read(&ctrl->namespaces_rwsem); 2489 2490 ret = nvme_user_cmd(ctrl, ns, argp); 2491 nvme_put_ns(ns); 2492 return ret; 2493 2494 out_unlock: 2495 up_read(&ctrl->namespaces_rwsem); 2496 return ret; 2497 } 2498 2499 static long nvme_dev_ioctl(struct file *file, unsigned int cmd, 2500 unsigned long arg) 2501 { 2502 struct nvme_ctrl *ctrl = file->private_data; 2503 void __user *argp = (void __user *)arg; 2504 2505 switch (cmd) { 2506 case NVME_IOCTL_ADMIN_CMD: 2507 return nvme_user_cmd(ctrl, NULL, argp); 2508 case NVME_IOCTL_IO_CMD: 2509 return nvme_dev_user_cmd(ctrl, argp); 2510 case NVME_IOCTL_RESET: 2511 dev_warn(ctrl->device, "resetting controller\n"); 2512 return nvme_reset_ctrl_sync(ctrl); 2513 case NVME_IOCTL_SUBSYS_RESET: 2514 return nvme_reset_subsystem(ctrl); 2515 case NVME_IOCTL_RESCAN: 2516 nvme_queue_scan(ctrl); 2517 return 0; 2518 default: 2519 return -ENOTTY; 2520 } 2521 } 2522 2523 static const struct file_operations nvme_dev_fops = { 2524 .owner = THIS_MODULE, 2525 .open = nvme_dev_open, 2526 .unlocked_ioctl = nvme_dev_ioctl, 2527 .compat_ioctl = nvme_dev_ioctl, 2528 }; 2529 2530 static ssize_t nvme_sysfs_reset(struct device *dev, 2531 struct device_attribute *attr, const char *buf, 2532 size_t count) 2533 { 2534 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2535 int ret; 2536 2537 ret = nvme_reset_ctrl_sync(ctrl); 2538 if (ret < 0) 2539 return ret; 2540 return count; 2541 } 2542 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 2543 2544 static ssize_t nvme_sysfs_rescan(struct device *dev, 2545 struct device_attribute *attr, const char *buf, 2546 size_t count) 2547 { 2548 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2549 2550 nvme_queue_scan(ctrl); 2551 return count; 2552 } 2553 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); 2554 2555 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) 2556 { 2557 struct gendisk *disk = dev_to_disk(dev); 2558 2559 if (disk->fops == &nvme_fops) 2560 return nvme_get_ns_from_dev(dev)->head; 2561 else 2562 return disk->private_data; 2563 } 2564 2565 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, 2566 char *buf) 2567 { 2568 struct nvme_ns_head *head = dev_to_ns_head(dev); 2569 struct nvme_ns_ids *ids = &head->ids; 2570 struct nvme_subsystem *subsys = head->subsys; 2571 int serial_len = sizeof(subsys->serial); 2572 int model_len = sizeof(subsys->model); 2573 2574 if (!uuid_is_null(&ids->uuid)) 2575 return sprintf(buf, "uuid.%pU\n", &ids->uuid); 2576 2577 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 2578 return sprintf(buf, "eui.%16phN\n", ids->nguid); 2579 2580 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 2581 return sprintf(buf, "eui.%8phN\n", ids->eui64); 2582 2583 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || 2584 subsys->serial[serial_len - 1] == '\0')) 2585 serial_len--; 2586 while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || 2587 subsys->model[model_len - 1] == '\0')) 2588 model_len--; 2589 2590 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, 2591 serial_len, subsys->serial, model_len, subsys->model, 2592 head->ns_id); 2593 } 2594 static DEVICE_ATTR_RO(wwid); 2595 2596 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, 2597 char *buf) 2598 { 2599 return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); 2600 } 2601 static DEVICE_ATTR_RO(nguid); 2602 2603 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, 2604 char *buf) 2605 { 2606 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 2607 2608 /* For backward compatibility expose the NGUID to userspace if 2609 * we have no UUID set 2610 */ 2611 if (uuid_is_null(&ids->uuid)) { 2612 printk_ratelimited(KERN_WARNING 2613 "No UUID available providing old NGUID\n"); 2614 return sprintf(buf, "%pU\n", ids->nguid); 2615 } 2616 return sprintf(buf, "%pU\n", &ids->uuid); 2617 } 2618 static DEVICE_ATTR_RO(uuid); 2619 2620 static ssize_t eui_show(struct device *dev, struct device_attribute *attr, 2621 char *buf) 2622 { 2623 return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); 2624 } 2625 static DEVICE_ATTR_RO(eui); 2626 2627 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, 2628 char *buf) 2629 { 2630 return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); 2631 } 2632 static DEVICE_ATTR_RO(nsid); 2633 2634 static struct attribute *nvme_ns_id_attrs[] = { 2635 &dev_attr_wwid.attr, 2636 &dev_attr_uuid.attr, 2637 &dev_attr_nguid.attr, 2638 &dev_attr_eui.attr, 2639 &dev_attr_nsid.attr, 2640 NULL, 2641 }; 2642 2643 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, 2644 struct attribute *a, int n) 2645 { 2646 struct device *dev = container_of(kobj, struct device, kobj); 2647 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 2648 2649 if (a == &dev_attr_uuid.attr) { 2650 if (uuid_is_null(&ids->uuid) && 2651 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 2652 return 0; 2653 } 2654 if (a == &dev_attr_nguid.attr) { 2655 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 2656 return 0; 2657 } 2658 if (a == &dev_attr_eui.attr) { 2659 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 2660 return 0; 2661 } 2662 return a->mode; 2663 } 2664 2665 const struct attribute_group nvme_ns_id_attr_group = { 2666 .attrs = nvme_ns_id_attrs, 2667 .is_visible = nvme_ns_id_attrs_are_visible, 2668 }; 2669 2670 #define nvme_show_str_function(field) \ 2671 static ssize_t field##_show(struct device *dev, \ 2672 struct device_attribute *attr, char *buf) \ 2673 { \ 2674 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 2675 return sprintf(buf, "%.*s\n", \ 2676 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ 2677 } \ 2678 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 2679 2680 nvme_show_str_function(model); 2681 nvme_show_str_function(serial); 2682 nvme_show_str_function(firmware_rev); 2683 2684 #define nvme_show_int_function(field) \ 2685 static ssize_t field##_show(struct device *dev, \ 2686 struct device_attribute *attr, char *buf) \ 2687 { \ 2688 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 2689 return sprintf(buf, "%d\n", ctrl->field); \ 2690 } \ 2691 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 2692 2693 nvme_show_int_function(cntlid); 2694 2695 static ssize_t nvme_sysfs_delete(struct device *dev, 2696 struct device_attribute *attr, const char *buf, 2697 size_t count) 2698 { 2699 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2700 2701 if (device_remove_file_self(dev, attr)) 2702 nvme_delete_ctrl_sync(ctrl); 2703 return count; 2704 } 2705 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); 2706 2707 static ssize_t nvme_sysfs_show_transport(struct device *dev, 2708 struct device_attribute *attr, 2709 char *buf) 2710 { 2711 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2712 2713 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); 2714 } 2715 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); 2716 2717 static ssize_t nvme_sysfs_show_state(struct device *dev, 2718 struct device_attribute *attr, 2719 char *buf) 2720 { 2721 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2722 static const char *const state_name[] = { 2723 [NVME_CTRL_NEW] = "new", 2724 [NVME_CTRL_LIVE] = "live", 2725 [NVME_CTRL_ADMIN_ONLY] = "only-admin", 2726 [NVME_CTRL_RESETTING] = "resetting", 2727 [NVME_CTRL_CONNECTING] = "connecting", 2728 [NVME_CTRL_DELETING] = "deleting", 2729 [NVME_CTRL_DEAD] = "dead", 2730 }; 2731 2732 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && 2733 state_name[ctrl->state]) 2734 return sprintf(buf, "%s\n", state_name[ctrl->state]); 2735 2736 return sprintf(buf, "unknown state\n"); 2737 } 2738 2739 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); 2740 2741 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, 2742 struct device_attribute *attr, 2743 char *buf) 2744 { 2745 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2746 2747 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); 2748 } 2749 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); 2750 2751 static ssize_t nvme_sysfs_show_address(struct device *dev, 2752 struct device_attribute *attr, 2753 char *buf) 2754 { 2755 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2756 2757 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); 2758 } 2759 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); 2760 2761 static struct attribute *nvme_dev_attrs[] = { 2762 &dev_attr_reset_controller.attr, 2763 &dev_attr_rescan_controller.attr, 2764 &dev_attr_model.attr, 2765 &dev_attr_serial.attr, 2766 &dev_attr_firmware_rev.attr, 2767 &dev_attr_cntlid.attr, 2768 &dev_attr_delete_controller.attr, 2769 &dev_attr_transport.attr, 2770 &dev_attr_subsysnqn.attr, 2771 &dev_attr_address.attr, 2772 &dev_attr_state.attr, 2773 NULL 2774 }; 2775 2776 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, 2777 struct attribute *a, int n) 2778 { 2779 struct device *dev = container_of(kobj, struct device, kobj); 2780 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2781 2782 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) 2783 return 0; 2784 if (a == &dev_attr_address.attr && !ctrl->ops->get_address) 2785 return 0; 2786 2787 return a->mode; 2788 } 2789 2790 static struct attribute_group nvme_dev_attrs_group = { 2791 .attrs = nvme_dev_attrs, 2792 .is_visible = nvme_dev_attrs_are_visible, 2793 }; 2794 2795 static const struct attribute_group *nvme_dev_attr_groups[] = { 2796 &nvme_dev_attrs_group, 2797 NULL, 2798 }; 2799 2800 static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, 2801 unsigned nsid) 2802 { 2803 struct nvme_ns_head *h; 2804 2805 lockdep_assert_held(&subsys->lock); 2806 2807 list_for_each_entry(h, &subsys->nsheads, entry) { 2808 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) 2809 return h; 2810 } 2811 2812 return NULL; 2813 } 2814 2815 static int __nvme_check_ids(struct nvme_subsystem *subsys, 2816 struct nvme_ns_head *new) 2817 { 2818 struct nvme_ns_head *h; 2819 2820 lockdep_assert_held(&subsys->lock); 2821 2822 list_for_each_entry(h, &subsys->nsheads, entry) { 2823 if (nvme_ns_ids_valid(&new->ids) && 2824 !list_empty(&h->list) && 2825 nvme_ns_ids_equal(&new->ids, &h->ids)) 2826 return -EINVAL; 2827 } 2828 2829 return 0; 2830 } 2831 2832 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, 2833 unsigned nsid, struct nvme_id_ns *id) 2834 { 2835 struct nvme_ns_head *head; 2836 int ret = -ENOMEM; 2837 2838 head = kzalloc(sizeof(*head), GFP_KERNEL); 2839 if (!head) 2840 goto out; 2841 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); 2842 if (ret < 0) 2843 goto out_free_head; 2844 head->instance = ret; 2845 INIT_LIST_HEAD(&head->list); 2846 ret = init_srcu_struct(&head->srcu); 2847 if (ret) 2848 goto out_ida_remove; 2849 head->subsys = ctrl->subsys; 2850 head->ns_id = nsid; 2851 kref_init(&head->ref); 2852 2853 nvme_report_ns_ids(ctrl, nsid, id, &head->ids); 2854 2855 ret = __nvme_check_ids(ctrl->subsys, head); 2856 if (ret) { 2857 dev_err(ctrl->device, 2858 "duplicate IDs for nsid %d\n", nsid); 2859 goto out_cleanup_srcu; 2860 } 2861 2862 ret = nvme_mpath_alloc_disk(ctrl, head); 2863 if (ret) 2864 goto out_cleanup_srcu; 2865 2866 list_add_tail(&head->entry, &ctrl->subsys->nsheads); 2867 2868 kref_get(&ctrl->subsys->ref); 2869 2870 return head; 2871 out_cleanup_srcu: 2872 cleanup_srcu_struct(&head->srcu); 2873 out_ida_remove: 2874 ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); 2875 out_free_head: 2876 kfree(head); 2877 out: 2878 return ERR_PTR(ret); 2879 } 2880 2881 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, 2882 struct nvme_id_ns *id) 2883 { 2884 struct nvme_ctrl *ctrl = ns->ctrl; 2885 bool is_shared = id->nmic & (1 << 0); 2886 struct nvme_ns_head *head = NULL; 2887 int ret = 0; 2888 2889 mutex_lock(&ctrl->subsys->lock); 2890 if (is_shared) 2891 head = __nvme_find_ns_head(ctrl->subsys, nsid); 2892 if (!head) { 2893 head = nvme_alloc_ns_head(ctrl, nsid, id); 2894 if (IS_ERR(head)) { 2895 ret = PTR_ERR(head); 2896 goto out_unlock; 2897 } 2898 } else { 2899 struct nvme_ns_ids ids; 2900 2901 nvme_report_ns_ids(ctrl, nsid, id, &ids); 2902 if (!nvme_ns_ids_equal(&head->ids, &ids)) { 2903 dev_err(ctrl->device, 2904 "IDs don't match for shared namespace %d\n", 2905 nsid); 2906 ret = -EINVAL; 2907 goto out_unlock; 2908 } 2909 } 2910 2911 list_add_tail(&ns->siblings, &head->list); 2912 ns->head = head; 2913 2914 out_unlock: 2915 mutex_unlock(&ctrl->subsys->lock); 2916 return ret; 2917 } 2918 2919 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 2920 { 2921 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 2922 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 2923 2924 return nsa->head->ns_id - nsb->head->ns_id; 2925 } 2926 2927 static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) 2928 { 2929 struct nvme_ns *ns, *ret = NULL; 2930 2931 down_read(&ctrl->namespaces_rwsem); 2932 list_for_each_entry(ns, &ctrl->namespaces, list) { 2933 if (ns->head->ns_id == nsid) { 2934 if (!kref_get_unless_zero(&ns->kref)) 2935 continue; 2936 ret = ns; 2937 break; 2938 } 2939 if (ns->head->ns_id > nsid) 2940 break; 2941 } 2942 up_read(&ctrl->namespaces_rwsem); 2943 return ret; 2944 } 2945 2946 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) 2947 { 2948 struct streams_directive_params s; 2949 int ret; 2950 2951 if (!ctrl->nr_streams) 2952 return 0; 2953 2954 ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); 2955 if (ret) 2956 return ret; 2957 2958 ns->sws = le32_to_cpu(s.sws); 2959 ns->sgs = le16_to_cpu(s.sgs); 2960 2961 if (ns->sws) { 2962 unsigned int bs = 1 << ns->lba_shift; 2963 2964 blk_queue_io_min(ns->queue, bs * ns->sws); 2965 if (ns->sgs) 2966 blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs); 2967 } 2968 2969 return 0; 2970 } 2971 2972 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) 2973 { 2974 struct nvme_ns *ns; 2975 struct gendisk *disk; 2976 struct nvme_id_ns *id; 2977 char disk_name[DISK_NAME_LEN]; 2978 int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; 2979 2980 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 2981 if (!ns) 2982 return; 2983 2984 ns->queue = blk_mq_init_queue(ctrl->tagset); 2985 if (IS_ERR(ns->queue)) 2986 goto out_free_ns; 2987 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 2988 ns->queue->queuedata = ns; 2989 ns->ctrl = ctrl; 2990 2991 kref_init(&ns->kref); 2992 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 2993 2994 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 2995 nvme_set_queue_limits(ctrl, ns->queue); 2996 2997 id = nvme_identify_ns(ctrl, nsid); 2998 if (!id) 2999 goto out_free_queue; 3000 3001 if (id->ncap == 0) 3002 goto out_free_id; 3003 3004 if (nvme_init_ns_head(ns, nsid, id)) 3005 goto out_free_id; 3006 nvme_setup_streams_ns(ctrl, ns); 3007 nvme_set_disk_name(disk_name, ns, ctrl, &flags); 3008 3009 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { 3010 if (nvme_nvm_register(ns, disk_name, node)) { 3011 dev_warn(ctrl->device, "LightNVM init failure\n"); 3012 goto out_unlink_ns; 3013 } 3014 } 3015 3016 disk = alloc_disk_node(0, node); 3017 if (!disk) 3018 goto out_unlink_ns; 3019 3020 disk->fops = &nvme_fops; 3021 disk->private_data = ns; 3022 disk->queue = ns->queue; 3023 disk->flags = flags; 3024 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); 3025 ns->disk = disk; 3026 3027 __nvme_revalidate_disk(disk, id); 3028 3029 down_write(&ctrl->namespaces_rwsem); 3030 list_add_tail(&ns->list, &ctrl->namespaces); 3031 up_write(&ctrl->namespaces_rwsem); 3032 3033 nvme_get_ctrl(ctrl); 3034 3035 kfree(id); 3036 3037 device_add_disk(ctrl->device, ns->disk); 3038 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, 3039 &nvme_ns_id_attr_group)) 3040 pr_warn("%s: failed to create sysfs group for identification\n", 3041 ns->disk->disk_name); 3042 if (ns->ndev && nvme_nvm_register_sysfs(ns)) 3043 pr_warn("%s: failed to register lightnvm sysfs group for identification\n", 3044 ns->disk->disk_name); 3045 3046 nvme_mpath_add_disk(ns->head); 3047 nvme_fault_inject_init(ns); 3048 return; 3049 out_unlink_ns: 3050 mutex_lock(&ctrl->subsys->lock); 3051 list_del_rcu(&ns->siblings); 3052 mutex_unlock(&ctrl->subsys->lock); 3053 out_free_id: 3054 kfree(id); 3055 out_free_queue: 3056 blk_cleanup_queue(ns->queue); 3057 out_free_ns: 3058 kfree(ns); 3059 } 3060 3061 static void nvme_ns_remove(struct nvme_ns *ns) 3062 { 3063 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 3064 return; 3065 3066 nvme_fault_inject_fini(ns); 3067 if (ns->disk && ns->disk->flags & GENHD_FL_UP) { 3068 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, 3069 &nvme_ns_id_attr_group); 3070 if (ns->ndev) 3071 nvme_nvm_unregister_sysfs(ns); 3072 del_gendisk(ns->disk); 3073 blk_cleanup_queue(ns->queue); 3074 if (blk_get_integrity(ns->disk)) 3075 blk_integrity_unregister(ns->disk); 3076 } 3077 3078 mutex_lock(&ns->ctrl->subsys->lock); 3079 nvme_mpath_clear_current_path(ns); 3080 list_del_rcu(&ns->siblings); 3081 mutex_unlock(&ns->ctrl->subsys->lock); 3082 3083 down_write(&ns->ctrl->namespaces_rwsem); 3084 list_del_init(&ns->list); 3085 up_write(&ns->ctrl->namespaces_rwsem); 3086 3087 synchronize_srcu(&ns->head->srcu); 3088 nvme_mpath_check_last_path(ns); 3089 nvme_put_ns(ns); 3090 } 3091 3092 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) 3093 { 3094 struct nvme_ns *ns; 3095 3096 ns = nvme_find_get_ns(ctrl, nsid); 3097 if (ns) { 3098 if (ns->disk && revalidate_disk(ns->disk)) 3099 nvme_ns_remove(ns); 3100 nvme_put_ns(ns); 3101 } else 3102 nvme_alloc_ns(ctrl, nsid); 3103 } 3104 3105 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 3106 unsigned nsid) 3107 { 3108 struct nvme_ns *ns, *next; 3109 LIST_HEAD(rm_list); 3110 3111 down_write(&ctrl->namespaces_rwsem); 3112 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { 3113 if (ns->head->ns_id > nsid) 3114 list_move_tail(&ns->list, &rm_list); 3115 } 3116 up_write(&ctrl->namespaces_rwsem); 3117 3118 list_for_each_entry_safe(ns, next, &rm_list, list) 3119 nvme_ns_remove(ns); 3120 3121 } 3122 3123 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) 3124 { 3125 struct nvme_ns *ns; 3126 __le32 *ns_list; 3127 unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); 3128 int ret = 0; 3129 3130 ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 3131 if (!ns_list) 3132 return -ENOMEM; 3133 3134 for (i = 0; i < num_lists; i++) { 3135 ret = nvme_identify_ns_list(ctrl, prev, ns_list); 3136 if (ret) 3137 goto free; 3138 3139 for (j = 0; j < min(nn, 1024U); j++) { 3140 nsid = le32_to_cpu(ns_list[j]); 3141 if (!nsid) 3142 goto out; 3143 3144 nvme_validate_ns(ctrl, nsid); 3145 3146 while (++prev < nsid) { 3147 ns = nvme_find_get_ns(ctrl, prev); 3148 if (ns) { 3149 nvme_ns_remove(ns); 3150 nvme_put_ns(ns); 3151 } 3152 } 3153 } 3154 nn -= j; 3155 } 3156 out: 3157 nvme_remove_invalid_namespaces(ctrl, prev); 3158 free: 3159 kfree(ns_list); 3160 return ret; 3161 } 3162 3163 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) 3164 { 3165 unsigned i; 3166 3167 for (i = 1; i <= nn; i++) 3168 nvme_validate_ns(ctrl, i); 3169 3170 nvme_remove_invalid_namespaces(ctrl, nn); 3171 } 3172 3173 static void nvme_scan_work(struct work_struct *work) 3174 { 3175 struct nvme_ctrl *ctrl = 3176 container_of(work, struct nvme_ctrl, scan_work); 3177 struct nvme_id_ctrl *id; 3178 unsigned nn; 3179 3180 if (ctrl->state != NVME_CTRL_LIVE) 3181 return; 3182 3183 WARN_ON_ONCE(!ctrl->tagset); 3184 3185 if (nvme_identify_ctrl(ctrl, &id)) 3186 return; 3187 3188 nn = le32_to_cpu(id->nn); 3189 if (ctrl->vs >= NVME_VS(1, 1, 0) && 3190 !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { 3191 if (!nvme_scan_ns_list(ctrl, nn)) 3192 goto done; 3193 } 3194 nvme_scan_ns_sequential(ctrl, nn); 3195 done: 3196 down_write(&ctrl->namespaces_rwsem); 3197 list_sort(NULL, &ctrl->namespaces, ns_cmp); 3198 up_write(&ctrl->namespaces_rwsem); 3199 kfree(id); 3200 } 3201 3202 void nvme_queue_scan(struct nvme_ctrl *ctrl) 3203 { 3204 /* 3205 * Only new queue scan work when admin and IO queues are both alive 3206 */ 3207 if (ctrl->state == NVME_CTRL_LIVE) 3208 queue_work(nvme_wq, &ctrl->scan_work); 3209 } 3210 EXPORT_SYMBOL_GPL(nvme_queue_scan); 3211 3212 /* 3213 * This function iterates the namespace list unlocked to allow recovery from 3214 * controller failure. It is up to the caller to ensure the namespace list is 3215 * not modified by scan work while this function is executing. 3216 */ 3217 void nvme_remove_namespaces(struct nvme_ctrl *ctrl) 3218 { 3219 struct nvme_ns *ns, *next; 3220 LIST_HEAD(ns_list); 3221 3222 /* 3223 * The dead states indicates the controller was not gracefully 3224 * disconnected. In that case, we won't be able to flush any data while 3225 * removing the namespaces' disks; fail all the queues now to avoid 3226 * potentially having to clean up the failed sync later. 3227 */ 3228 if (ctrl->state == NVME_CTRL_DEAD) 3229 nvme_kill_queues(ctrl); 3230 3231 down_write(&ctrl->namespaces_rwsem); 3232 list_splice_init(&ctrl->namespaces, &ns_list); 3233 up_write(&ctrl->namespaces_rwsem); 3234 3235 list_for_each_entry_safe(ns, next, &ns_list, list) 3236 nvme_ns_remove(ns); 3237 } 3238 EXPORT_SYMBOL_GPL(nvme_remove_namespaces); 3239 3240 static void nvme_aen_uevent(struct nvme_ctrl *ctrl) 3241 { 3242 char *envp[2] = { NULL, NULL }; 3243 u32 aen_result = ctrl->aen_result; 3244 3245 ctrl->aen_result = 0; 3246 if (!aen_result) 3247 return; 3248 3249 envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); 3250 if (!envp[0]) 3251 return; 3252 kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); 3253 kfree(envp[0]); 3254 } 3255 3256 static void nvme_async_event_work(struct work_struct *work) 3257 { 3258 struct nvme_ctrl *ctrl = 3259 container_of(work, struct nvme_ctrl, async_event_work); 3260 3261 nvme_aen_uevent(ctrl); 3262 ctrl->ops->submit_async_event(ctrl); 3263 } 3264 3265 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) 3266 { 3267 3268 u32 csts; 3269 3270 if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) 3271 return false; 3272 3273 if (csts == ~0) 3274 return false; 3275 3276 return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); 3277 } 3278 3279 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) 3280 { 3281 struct nvme_fw_slot_info_log *log; 3282 3283 log = kmalloc(sizeof(*log), GFP_KERNEL); 3284 if (!log) 3285 return; 3286 3287 if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) 3288 dev_warn(ctrl->device, 3289 "Get FW SLOT INFO log error\n"); 3290 kfree(log); 3291 } 3292 3293 static void nvme_fw_act_work(struct work_struct *work) 3294 { 3295 struct nvme_ctrl *ctrl = container_of(work, 3296 struct nvme_ctrl, fw_act_work); 3297 unsigned long fw_act_timeout; 3298 3299 if (ctrl->mtfa) 3300 fw_act_timeout = jiffies + 3301 msecs_to_jiffies(ctrl->mtfa * 100); 3302 else 3303 fw_act_timeout = jiffies + 3304 msecs_to_jiffies(admin_timeout * 1000); 3305 3306 nvme_stop_queues(ctrl); 3307 while (nvme_ctrl_pp_status(ctrl)) { 3308 if (time_after(jiffies, fw_act_timeout)) { 3309 dev_warn(ctrl->device, 3310 "Fw activation timeout, reset controller\n"); 3311 nvme_reset_ctrl(ctrl); 3312 break; 3313 } 3314 msleep(100); 3315 } 3316 3317 if (ctrl->state != NVME_CTRL_LIVE) 3318 return; 3319 3320 nvme_start_queues(ctrl); 3321 /* read FW slot information to clear the AER */ 3322 nvme_get_fw_slot_info(ctrl); 3323 } 3324 3325 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, 3326 union nvme_result *res) 3327 { 3328 u32 result = le32_to_cpu(res->u32); 3329 3330 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) 3331 return; 3332 3333 switch (result & 0x7) { 3334 case NVME_AER_ERROR: 3335 case NVME_AER_SMART: 3336 case NVME_AER_CSS: 3337 case NVME_AER_VS: 3338 ctrl->aen_result = result; 3339 break; 3340 default: 3341 break; 3342 } 3343 3344 switch (result & 0xff07) { 3345 case NVME_AER_NOTICE_NS_CHANGED: 3346 dev_info(ctrl->device, "rescanning\n"); 3347 nvme_queue_scan(ctrl); 3348 break; 3349 case NVME_AER_NOTICE_FW_ACT_STARTING: 3350 queue_work(nvme_wq, &ctrl->fw_act_work); 3351 break; 3352 default: 3353 dev_warn(ctrl->device, "async event result %08x\n", result); 3354 } 3355 queue_work(nvme_wq, &ctrl->async_event_work); 3356 } 3357 EXPORT_SYMBOL_GPL(nvme_complete_async_event); 3358 3359 void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 3360 { 3361 nvme_stop_keep_alive(ctrl); 3362 flush_work(&ctrl->async_event_work); 3363 flush_work(&ctrl->scan_work); 3364 cancel_work_sync(&ctrl->fw_act_work); 3365 if (ctrl->ops->stop_ctrl) 3366 ctrl->ops->stop_ctrl(ctrl); 3367 } 3368 EXPORT_SYMBOL_GPL(nvme_stop_ctrl); 3369 3370 void nvme_start_ctrl(struct nvme_ctrl *ctrl) 3371 { 3372 if (ctrl->kato) 3373 nvme_start_keep_alive(ctrl); 3374 3375 if (ctrl->queue_count > 1) { 3376 nvme_queue_scan(ctrl); 3377 queue_work(nvme_wq, &ctrl->async_event_work); 3378 nvme_start_queues(ctrl); 3379 } 3380 } 3381 EXPORT_SYMBOL_GPL(nvme_start_ctrl); 3382 3383 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 3384 { 3385 cdev_device_del(&ctrl->cdev, ctrl->device); 3386 } 3387 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); 3388 3389 static void nvme_free_ctrl(struct device *dev) 3390 { 3391 struct nvme_ctrl *ctrl = 3392 container_of(dev, struct nvme_ctrl, ctrl_device); 3393 struct nvme_subsystem *subsys = ctrl->subsys; 3394 3395 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 3396 kfree(ctrl->effects); 3397 3398 if (subsys) { 3399 mutex_lock(&subsys->lock); 3400 list_del(&ctrl->subsys_entry); 3401 mutex_unlock(&subsys->lock); 3402 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); 3403 } 3404 3405 ctrl->ops->free_ctrl(ctrl); 3406 3407 if (subsys) 3408 nvme_put_subsystem(subsys); 3409 } 3410 3411 /* 3412 * Initialize a NVMe controller structures. This needs to be called during 3413 * earliest initialization so that we have the initialized structured around 3414 * during probing. 3415 */ 3416 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, 3417 const struct nvme_ctrl_ops *ops, unsigned long quirks) 3418 { 3419 int ret; 3420 3421 ctrl->state = NVME_CTRL_NEW; 3422 spin_lock_init(&ctrl->lock); 3423 INIT_LIST_HEAD(&ctrl->namespaces); 3424 init_rwsem(&ctrl->namespaces_rwsem); 3425 ctrl->dev = dev; 3426 ctrl->ops = ops; 3427 ctrl->quirks = quirks; 3428 INIT_WORK(&ctrl->scan_work, nvme_scan_work); 3429 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); 3430 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); 3431 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); 3432 3433 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); 3434 if (ret < 0) 3435 goto out; 3436 ctrl->instance = ret; 3437 3438 device_initialize(&ctrl->ctrl_device); 3439 ctrl->device = &ctrl->ctrl_device; 3440 ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); 3441 ctrl->device->class = nvme_class; 3442 ctrl->device->parent = ctrl->dev; 3443 ctrl->device->groups = nvme_dev_attr_groups; 3444 ctrl->device->release = nvme_free_ctrl; 3445 dev_set_drvdata(ctrl->device, ctrl); 3446 ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); 3447 if (ret) 3448 goto out_release_instance; 3449 3450 cdev_init(&ctrl->cdev, &nvme_dev_fops); 3451 ctrl->cdev.owner = ops->module; 3452 ret = cdev_device_add(&ctrl->cdev, ctrl->device); 3453 if (ret) 3454 goto out_free_name; 3455 3456 /* 3457 * Initialize latency tolerance controls. The sysfs files won't 3458 * be visible to userspace unless the device actually supports APST. 3459 */ 3460 ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; 3461 dev_pm_qos_update_user_latency_tolerance(ctrl->device, 3462 min(default_ps_max_latency_us, (unsigned long)S32_MAX)); 3463 3464 return 0; 3465 out_free_name: 3466 kfree_const(dev->kobj.name); 3467 out_release_instance: 3468 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 3469 out: 3470 return ret; 3471 } 3472 EXPORT_SYMBOL_GPL(nvme_init_ctrl); 3473 3474 /** 3475 * nvme_kill_queues(): Ends all namespace queues 3476 * @ctrl: the dead controller that needs to end 3477 * 3478 * Call this function when the driver determines it is unable to get the 3479 * controller in a state capable of servicing IO. 3480 */ 3481 void nvme_kill_queues(struct nvme_ctrl *ctrl) 3482 { 3483 struct nvme_ns *ns; 3484 3485 down_read(&ctrl->namespaces_rwsem); 3486 3487 /* Forcibly unquiesce queues to avoid blocking dispatch */ 3488 if (ctrl->admin_q) 3489 blk_mq_unquiesce_queue(ctrl->admin_q); 3490 3491 list_for_each_entry(ns, &ctrl->namespaces, list) { 3492 /* 3493 * Revalidating a dead namespace sets capacity to 0. This will 3494 * end buffered writers dirtying pages that can't be synced. 3495 */ 3496 if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 3497 continue; 3498 revalidate_disk(ns->disk); 3499 blk_set_queue_dying(ns->queue); 3500 3501 /* Forcibly unquiesce queues to avoid blocking dispatch */ 3502 blk_mq_unquiesce_queue(ns->queue); 3503 } 3504 up_read(&ctrl->namespaces_rwsem); 3505 } 3506 EXPORT_SYMBOL_GPL(nvme_kill_queues); 3507 3508 void nvme_unfreeze(struct nvme_ctrl *ctrl) 3509 { 3510 struct nvme_ns *ns; 3511 3512 down_read(&ctrl->namespaces_rwsem); 3513 list_for_each_entry(ns, &ctrl->namespaces, list) 3514 blk_mq_unfreeze_queue(ns->queue); 3515 up_read(&ctrl->namespaces_rwsem); 3516 } 3517 EXPORT_SYMBOL_GPL(nvme_unfreeze); 3518 3519 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) 3520 { 3521 struct nvme_ns *ns; 3522 3523 down_read(&ctrl->namespaces_rwsem); 3524 list_for_each_entry(ns, &ctrl->namespaces, list) { 3525 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); 3526 if (timeout <= 0) 3527 break; 3528 } 3529 up_read(&ctrl->namespaces_rwsem); 3530 } 3531 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); 3532 3533 void nvme_wait_freeze(struct nvme_ctrl *ctrl) 3534 { 3535 struct nvme_ns *ns; 3536 3537 down_read(&ctrl->namespaces_rwsem); 3538 list_for_each_entry(ns, &ctrl->namespaces, list) 3539 blk_mq_freeze_queue_wait(ns->queue); 3540 up_read(&ctrl->namespaces_rwsem); 3541 } 3542 EXPORT_SYMBOL_GPL(nvme_wait_freeze); 3543 3544 void nvme_start_freeze(struct nvme_ctrl *ctrl) 3545 { 3546 struct nvme_ns *ns; 3547 3548 down_read(&ctrl->namespaces_rwsem); 3549 list_for_each_entry(ns, &ctrl->namespaces, list) 3550 blk_freeze_queue_start(ns->queue); 3551 up_read(&ctrl->namespaces_rwsem); 3552 } 3553 EXPORT_SYMBOL_GPL(nvme_start_freeze); 3554 3555 void nvme_stop_queues(struct nvme_ctrl *ctrl) 3556 { 3557 struct nvme_ns *ns; 3558 3559 down_read(&ctrl->namespaces_rwsem); 3560 list_for_each_entry(ns, &ctrl->namespaces, list) 3561 blk_mq_quiesce_queue(ns->queue); 3562 up_read(&ctrl->namespaces_rwsem); 3563 } 3564 EXPORT_SYMBOL_GPL(nvme_stop_queues); 3565 3566 void nvme_start_queues(struct nvme_ctrl *ctrl) 3567 { 3568 struct nvme_ns *ns; 3569 3570 down_read(&ctrl->namespaces_rwsem); 3571 list_for_each_entry(ns, &ctrl->namespaces, list) 3572 blk_mq_unquiesce_queue(ns->queue); 3573 up_read(&ctrl->namespaces_rwsem); 3574 } 3575 EXPORT_SYMBOL_GPL(nvme_start_queues); 3576 3577 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) 3578 { 3579 if (!ctrl->ops->reinit_request) 3580 return 0; 3581 3582 return blk_mq_tagset_iter(set, set->driver_data, 3583 ctrl->ops->reinit_request); 3584 } 3585 EXPORT_SYMBOL_GPL(nvme_reinit_tagset); 3586 3587 int __init nvme_core_init(void) 3588 { 3589 int result = -ENOMEM; 3590 3591 nvme_wq = alloc_workqueue("nvme-wq", 3592 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 3593 if (!nvme_wq) 3594 goto out; 3595 3596 nvme_reset_wq = alloc_workqueue("nvme-reset-wq", 3597 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 3598 if (!nvme_reset_wq) 3599 goto destroy_wq; 3600 3601 nvme_delete_wq = alloc_workqueue("nvme-delete-wq", 3602 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 3603 if (!nvme_delete_wq) 3604 goto destroy_reset_wq; 3605 3606 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); 3607 if (result < 0) 3608 goto destroy_delete_wq; 3609 3610 nvme_class = class_create(THIS_MODULE, "nvme"); 3611 if (IS_ERR(nvme_class)) { 3612 result = PTR_ERR(nvme_class); 3613 goto unregister_chrdev; 3614 } 3615 3616 nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); 3617 if (IS_ERR(nvme_subsys_class)) { 3618 result = PTR_ERR(nvme_subsys_class); 3619 goto destroy_class; 3620 } 3621 return 0; 3622 3623 destroy_class: 3624 class_destroy(nvme_class); 3625 unregister_chrdev: 3626 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); 3627 destroy_delete_wq: 3628 destroy_workqueue(nvme_delete_wq); 3629 destroy_reset_wq: 3630 destroy_workqueue(nvme_reset_wq); 3631 destroy_wq: 3632 destroy_workqueue(nvme_wq); 3633 out: 3634 return result; 3635 } 3636 3637 void nvme_core_exit(void) 3638 { 3639 ida_destroy(&nvme_subsystems_ida); 3640 class_destroy(nvme_subsys_class); 3641 class_destroy(nvme_class); 3642 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); 3643 destroy_workqueue(nvme_delete_wq); 3644 destroy_workqueue(nvme_reset_wq); 3645 destroy_workqueue(nvme_wq); 3646 } 3647 3648 MODULE_LICENSE("GPL"); 3649 MODULE_VERSION("1.0"); 3650 module_init(nvme_core_init); 3651 module_exit(nvme_core_exit); 3652