/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
#include <scsi/sg.h>
#include <asm/unaligned.h>

#include "nvme.h"
#include "fabrics.h"

#define NVME_MINORS		(1U << MINORBITS)

unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);

unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

unsigned int nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, uint, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
EXPORT_SYMBOL_GPL(nvme_max_retries);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);

static struct class *nvme_class;

void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
	int status;

	if (!blk_mq_request_started(req))
		return;

	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
				"Cancelling I/O %d", req->tag);

	status = NVME_SC_ABORT_REQ;
	if (blk_queue_dying(req->q))
		status |= NVME_SC_DNR;
	blk_mq_complete_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);

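/*
 * Controller state machine, as implemented by nvme_change_ctrl_state()
 * below.  Summarising the switch statement, the accepted transitions are:
 *
 *	NEW          -> LIVE, RESETTING
 *	LIVE         -> RESETTING, RECONNECTING, DELETING
 *	RESETTING    -> LIVE, DELETING
 *	RECONNECTING -> LIVE, RESETTING, DELETING
 *	DELETING     -> DEAD
 *
 * Any other transition is rejected and the state is left unchanged.
 */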
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state)
{
	enum nvme_ctrl_state old_state;
	bool changed = false;

	spin_lock_irq(&ctrl->lock);

	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_RESETTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_RECONNECTING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DEAD:
		switch (old_state) {
		case NVME_CTRL_DELETING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	default:
		break;
	}

	if (changed)
		ctrl->state = new_state;

	spin_unlock_irq(&ctrl->lock);

	return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);

static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	if (ns->ndev)
		nvme_nvm_unregister(ns);

	if (ns->disk) {
		spin_lock(&dev_list_lock);
		ns->disk->private_data = NULL;
		spin_unlock(&dev_list_lock);
	}

	put_disk(ns->disk);
	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
	nvme_put_ctrl(ns->ctrl);
	kfree(ns);
}

static void nvme_put_ns(struct nvme_ns *ns)
{
	kref_put(&ns->kref, nvme_free_ns);
}

static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
{
	struct nvme_ns *ns;

	spin_lock(&dev_list_lock);
	ns = disk->private_data;
	if (ns) {
		if (!kref_get_unless_zero(&ns->kref))
			goto fail;
		if (!try_module_get(ns->ctrl->ops->module))
			goto fail_put_ns;
	}
	spin_unlock(&dev_list_lock);

	return ns;

fail_put_ns:
	kref_put(&ns->kref, nvme_free_ns);
fail:
	spin_unlock(&dev_list_lock);
	return NULL;
}

void nvme_requeue_req(struct request *req)
{
	blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
}
EXPORT_SYMBOL_GPL(nvme_requeue_req);

struct request *nvme_alloc_request(struct request_queue *q,
		struct nvme_command *cmd, unsigned int flags, int qid)
{
	struct request *req;

	if (qid == NVME_QID_ANY) {
		req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags);
	} else {
		req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags,
				qid ? qid - 1 : 0);
	}
	if (IS_ERR(req))
		return req;

	req->cmd_type = REQ_TYPE_DRV_PRIV;
	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	nvme_req(req)->cmd = cmd;

	return req;
}
EXPORT_SYMBOL_GPL(nvme_alloc_request);

static inline void nvme_setup_flush(struct nvme_ns *ns,
		struct nvme_command *cmnd)
{
	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);
}

static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_dsm_range *range;
	struct page *page;
	int offset;
	unsigned int nr_bytes = blk_rq_bytes(req);

	range = kmalloc(sizeof(*range), GFP_ATOMIC);
	if (!range)
		return BLK_MQ_RQ_QUEUE_BUSY;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	req->completion_data = range;
	page = virt_to_page(range);
	offset = offset_in_page(range);
	blk_add_request_payload(req, page, offset, sizeof(*range));

	/*
	 * we set __data_len back to the size of the area to be discarded
	 * on disk. This allows us to report completion on the full amount
	 * of blocks described by the request.
	 */
	req->__data_len = nr_bytes;

	return BLK_MQ_RQ_QUEUE_OK;
}

static inline void nvme_setup_write_zeroes(struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd)
{
	struct nvme_write_zeroes_cmd *write_zeroes = &cmnd->write_zeroes;

	memset(cmnd, 0, sizeof(*cmnd));
	write_zeroes->opcode = nvme_cmd_write_zeroes;
	write_zeroes->nsid = cpu_to_le32(ns->ns_id);
	write_zeroes->slba =
		cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
	write_zeroes->length =
		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
	write_zeroes->control = 0;
}

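/*
 * Note on the protection-information handling in nvme_setup_rw() below
 * (descriptive only): when the namespace is formatted with metadata but the
 * request carries no block-integrity payload, PRACT is set so that the
 * controller generates the protection information on writes and strips it
 * on reads itself; otherwise the guard (and, for Type 1/2, the reference
 * tag) supplied by the block layer is checked according to ns->pi_type.
 */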
static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmnd)
{
	u16 control = 0;
	u32 dsmgmt = 0;

	if (req->cmd_flags & REQ_FUA)
		control |= NVME_RW_FUA;
	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	if (req->cmd_flags & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);

	if (ns->ms) {
		switch (ns->pi_type) {
		case NVME_NS_DPS_PI_TYPE3:
			control |= NVME_RW_PRINFO_PRCHK_GUARD;
			break;
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			control |= NVME_RW_PRINFO_PRCHK_GUARD |
					NVME_RW_PRINFO_PRCHK_REF;
			cmnd->rw.reftag = cpu_to_le32(
					nvme_block_nr(ns, blk_rq_pos(req)));
			break;
		}
		if (!blk_integrity_rq(req))
			control |= NVME_RW_PRINFO_PRACT;
	}

	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
}

int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmd)
{
	int ret = BLK_MQ_RQ_QUEUE_OK;

	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
	else if (req_op(req) == REQ_OP_FLUSH)
		nvme_setup_flush(ns, cmd);
	else if (req_op(req) == REQ_OP_DISCARD)
		ret = nvme_setup_discard(ns, req, cmd);
	else if (req_op(req) == REQ_OP_WRITE_ZEROES)
		nvme_setup_write_zeroes(ns, req, cmd);
	else
		nvme_setup_rw(ns, req, cmd);

	cmd->common.command_id = req->tag;

	return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);

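/*
 * Illustrative sketch only: a typical in-kernel caller builds an admin
 * command on the stack and hands it to nvme_submit_sync_cmd(), e.g. for
 * Identify Controller (compare nvme_identify_ctrl() further down, which is
 * the real user of this pattern; ctrl, buf and error are placeholders):
 *
 *	struct nvme_command c = { };
 *	void *buf = kmalloc(4096, GFP_KERNEL);
 *
 *	c.identify.opcode = nvme_admin_identify;
 *	c.identify.cns = cpu_to_le32(1);
 *	if (buf)
 *		error = nvme_submit_sync_cmd(ctrl->admin_q, &c, buf, 4096);
 */
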
/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		union nvme_result *result, void *buffer, unsigned bufflen,
		unsigned timeout, int qid, int at_head, int flags)
{
	struct request *req;
	int ret;

	req = nvme_alloc_request(q, cmd, flags, qid);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (buffer && bufflen) {
		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
		if (ret)
			goto out;
	}

	blk_execute_rq(req->q, NULL, req, at_head);
	if (result)
		*result = nvme_req(req)->result;
	ret = req->errors;
out:
	blk_mq_free_request(req);
	return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);

int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen)
{
	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
			NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);

int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
		u32 *result, unsigned timeout)
{
	bool write = nvme_is_write(cmd);
	struct nvme_ns *ns = q->queuedata;
	struct gendisk *disk = ns ? ns->disk : NULL;
	struct request *req;
	struct bio *bio = NULL;
	void *meta = NULL;
	int ret;

	req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (ubuffer && bufflen) {
		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
				GFP_KERNEL);
		if (ret)
			goto out;
		bio = req->bio;

		if (!disk)
			goto submit;
		bio->bi_bdev = bdget_disk(disk, 0);
		if (!bio->bi_bdev) {
			ret = -ENODEV;
			goto out_unmap;
		}

		if (meta_buffer && meta_len) {
			struct bio_integrity_payload *bip;

			meta = kmalloc(meta_len, GFP_KERNEL);
			if (!meta) {
				ret = -ENOMEM;
				goto out_unmap;
			}

			if (write) {
				if (copy_from_user(meta, meta_buffer,
						meta_len)) {
					ret = -EFAULT;
					goto out_free_meta;
				}
			}

			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
			if (IS_ERR(bip)) {
				ret = PTR_ERR(bip);
				goto out_free_meta;
			}

			bip->bip_iter.bi_size = meta_len;
			bip->bip_iter.bi_sector = meta_seed;

			ret = bio_integrity_add_page(bio, virt_to_page(meta),
					meta_len, offset_in_page(meta));
			if (ret != meta_len) {
				ret = -ENOMEM;
				goto out_free_meta;
			}
		}
	}
submit:
	blk_execute_rq(req->q, disk, req, 0);
	ret = req->errors;
	if (result)
		*result = le32_to_cpu(nvme_req(req)->result.u32);
	if (meta && !ret && !write) {
		if (copy_to_user(meta_buffer, meta, meta_len))
			ret = -EFAULT;
	}
out_free_meta:
	kfree(meta);
out_unmap:
	if (bio) {
		if (disk && bio->bi_bdev)
			bdput(bio->bi_bdev);
		blk_rq_unmap_user(bio);
	}
out:
	blk_mq_free_request(req);
	return ret;
}

int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen, u32 *result,
		unsigned timeout)
{
	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
			result, timeout);
}

static void nvme_keep_alive_end_io(struct request *rq, int error)
{
	struct nvme_ctrl *ctrl = rq->end_io_data;

	blk_mq_free_request(rq);

	if (error) {
		dev_err(ctrl->device,
			"failed nvme_keep_alive_end_io error=%d\n", error);
		return;
	}

	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}

static int nvme_keep_alive(struct nvme_ctrl *ctrl)
{
	struct nvme_command c;
	struct request *rq;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_keep_alive;

	rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
			NVME_QID_ANY);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	rq->timeout = ctrl->kato * HZ;
	rq->end_io_data = ctrl;

	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);

	return 0;
}

static void nvme_keep_alive_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
			struct nvme_ctrl, ka_work);

	if (nvme_keep_alive(ctrl)) {
		/* allocation failure, reset the controller */
		dev_err(ctrl->device, "keep-alive failed\n");
		ctrl->ops->reset_ctrl(ctrl);
		return;
	}
}

void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
EXPORT_SYMBOL_GPL(nvme_start_keep_alive);

void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	cancel_delayed_work_sync(&ctrl->ka_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);

int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = cpu_to_le32(1);

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}

static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
{
	struct nvme_command c = { };

	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = cpu_to_le32(2);
	c.identify.nsid = cpu_to_le32(nsid);
	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
}

int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
		struct nvme_id_ns **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);

	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ns));
	if (error)
		kfree(*id);
	return error;
}

int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
		      void *buffer, size_t buflen, u32 *result)
{
	struct nvme_command c;
	union nvme_result res;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.fid = cpu_to_le32(fid);

	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, buffer, buflen, 0,
			NVME_QID_ANY, 0, 0);
	if (ret >= 0 && result)
		*result = le32_to_cpu(res.u32);
	return ret;
}

int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
		      void *buffer, size_t buflen, u32 *result)
{
	struct nvme_command c;
	union nvme_result res;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
	if (ret >= 0 && result)
		*result = le32_to_cpu(res.u32);
	return ret;
}

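/*
 * Illustrative sketch only: a feature is toggled by passing the feature ID
 * and its dword 11 value, e.g. enabling the volatile write cache on a
 * controller that reports one could look roughly like this (ctrl is a
 * placeholder):
 *
 *	u32 result;
 *	int ret = nvme_set_features(ctrl, NVME_FEAT_VOLATILE_WC, 1,
 *				    NULL, 0, &result);
 *
 * nvme_set_queue_count() below is the in-file example of this pattern.
 */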
int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
{
	struct nvme_command c = { };
	int error;

	c.common.opcode = nvme_admin_get_log_page;
	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
	c.common.cdw10[0] = cpu_to_le32(
			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
			NVME_LOG_SMART);

	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
	if (!*log)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
			sizeof(struct nvme_smart_log));
	if (error)
		kfree(*log);
	return error;
}

int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
	u32 q_count = (*count - 1) | ((*count - 1) << 16);
	u32 result;
	int status, nr_io_queues;

	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
			&result);
	if (status < 0)
		return status;

	/*
	 * Degraded controllers might return an error when setting the queue
	 * count.  We still want to be able to bring them online and offer
	 * access to the admin queue, as that might be the only way to fix
	 * them up.
	 */
	if (status > 0) {
		dev_err(ctrl->dev, "Could not set queue count (%d)\n", status);
		*count = 0;
	} else {
		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
		*count = min(*count, nr_io_queues);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);

static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	if (io.flags)
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		break;
	default:
		return -EINVAL;
	}

	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;
	metadata = (void __user *)(uintptr_t)io.metadata;

	if (ns->ext) {
		length += meta_len;
		meta_len = 0;
	} else if (meta_len) {
		if ((io.metadata & 3) || !io.metadata)
			return -EINVAL;
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	return __nvme_submit_user_cmd(ns->queue, &c,
			(void __user *)(uintptr_t)io.addr, length,
			metadata, meta_len, io.slba, NULL, 0);
}

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
			struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
			&cmd.result, timeout);
	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

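/*
 * Illustrative sketch only (user-space view; buf and fd are placeholders):
 * the passthrough path handled by nvme_user_cmd() above is reached through
 * the NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_IO_CMD ioctls, roughly like this:
 *
 *	struct nvme_passthru_cmd cmd = {
 *		.opcode   = nvme_admin_identify,
 *		.addr     = (__u64)(uintptr_t)buf,
 *		.data_len = 4096,
 *		.cdw10    = 1,		(CNS 1: Identify Controller)
 *	};
 *
 *	ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 */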
static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
#ifdef CONFIG_BLK_DEV_NVME_SCSI
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
#endif
	default:
		return -ENOTTY;
	}
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case SG_IO:
		return -ENOIOCTLCMD;
	}
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif

static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
}

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	struct nvme_ns *ns = disk->private_data;

	module_put(ns->ctrl->ops->module);
	nvme_put_ns(ns);
}

static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	/* some standard values */
	geo->heads = 1 << 6;
	geo->sectors = 1 << 5;
	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
	return 0;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_init_integrity(struct nvme_ns *ns)
{
	struct blk_integrity integrity;

	memset(&integrity, 0, sizeof(integrity));
	switch (ns->pi_type) {
	case NVME_NS_DPS_PI_TYPE3:
		integrity.profile = &t10_pi_type3_crc;
		integrity.tag_size = sizeof(u16) + sizeof(u32);
		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
		break;
	case NVME_NS_DPS_PI_TYPE1:
	case NVME_NS_DPS_PI_TYPE2:
		integrity.profile = &t10_pi_type1_crc;
		integrity.tag_size = sizeof(u16);
		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
		break;
	default:
		integrity.profile = NULL;
		break;
	}
	integrity.tuple_size = ns->ms;
	blk_integrity_register(ns->disk, &integrity);
	blk_queue_max_integrity_segments(ns->queue, 1);
}
#else
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

static void nvme_config_discard(struct nvme_ns *ns)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	u32 logical_block_size = queue_logical_block_size(ns->queue);

	if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
		ns->queue->limits.discard_zeroes_data = 1;
	else
		ns->queue->limits.discard_zeroes_data = 0;

	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}

static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
{
	if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) {
		dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__);
		return -ENODEV;
	}

	if ((*id)->ncap == 0) {
		kfree(*id);
		return -ENODEV;
	}

	if (ns->ctrl->vs >= NVME_VS(1, 1))
		memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui));
	if (ns->ctrl->vs >= NVME_VS(1, 2))
		memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid));

	return 0;
}

static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
	struct nvme_ns *ns = disk->private_data;
	u8 lbaf, pi_type;
	u16 old_ms;
	unsigned short bs;

	old_ms = ns->ms;
	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);

	/*
	 * If identify namespace failed, use default 512 byte block size so
	 * block layer can use before failing read/write for 0 capacity.
	 */
	if (ns->lba_shift == 0)
		ns->lba_shift = 9;
	bs = 1 << ns->lba_shift;
	/* XXX: PI implementation requires metadata equal t10 pi tuple size */
	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
					id->dps & NVME_NS_DPS_PI_MASK : 0;

	blk_mq_freeze_queue(disk->queue);
	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
				ns->ms != old_ms ||
				bs != queue_logical_block_size(disk->queue) ||
				(ns->ms && ns->ext)))
		blk_integrity_unregister(disk);

	ns->pi_type = pi_type;
	blk_queue_logical_block_size(ns->queue, bs);

	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
		nvme_init_integrity(ns);
	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
		set_capacity(disk, 0);
	else
		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);
	if (ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES)
		blk_queue_max_write_zeroes_sectors(ns->queue,
				((u32)(USHRT_MAX + 1) * bs) >> 9);

	blk_mq_unfreeze_queue(disk->queue);
}

static int nvme_revalidate_disk(struct gendisk *disk)
{
	struct nvme_ns *ns = disk->private_data;
	struct nvme_id_ns *id = NULL;
	int ret;

	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
		set_capacity(disk, 0);
		return -ENODEV;
	}

	ret = nvme_revalidate_ns(ns, &id);
	if (ret)
		return ret;

	__nvme_revalidate_disk(disk, id);
	kfree(id);

	return 0;
}

static char nvme_pr_type(enum pr_type type)
{
	switch (type) {
	case PR_WRITE_EXCLUSIVE:
		return 1;
	case PR_EXCLUSIVE_ACCESS:
		return 2;
	case PR_WRITE_EXCLUSIVE_REG_ONLY:
		return 3;
	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
		return 4;
	case PR_WRITE_EXCLUSIVE_ALL_REGS:
		return 5;
	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
		return 6;
	default:
		return 0;
	}
}

static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
				u64 key, u64 sa_key, u8 op)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	struct nvme_command c;
	u8 data[16] = { 0, };

	put_unaligned_le64(key, &data[0]);
	put_unaligned_le64(sa_key, &data[8]);

	memset(&c, 0, sizeof(c));
	c.common.opcode = op;
	c.common.nsid = cpu_to_le32(ns->ns_id);
	c.common.cdw10[0] = cpu_to_le32(cdw10);

	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
}

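/*
 * Note on the cdw10 values built by the helpers below (descriptive only,
 * per the NVMe reservation commands): bits 2:0 carry the action
 * (register/acquire/release variant), bit 3 is IEKEY ("ignore existing
 * key"), bits 15:8 carry the reservation type from nvme_pr_type(), and for
 * Reservation Register bits 31:30 select the persist-through-power-loss
 * behaviour (PTPL).
 */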
static int nvme_pr_register(struct block_device *bdev, u64 old,
		u64 new, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = old ? 2 : 0;
	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}

static int nvme_pr_reserve(struct block_device *bdev, u64 key,
		enum pr_type type, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = nvme_pr_type(type) << 8;
	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
}

static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
		enum pr_type type, bool abort)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}

static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
}

static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}

static const struct pr_ops nvme_pr_ops = {
	.pr_register	= nvme_pr_register,
	.pr_reserve	= nvme_pr_reserve,
	.pr_release	= nvme_pr_release,
	.pr_preempt	= nvme_pr_preempt,
	.pr_clear	= nvme_pr_clear,
};

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_compat_ioctl,
	.open		= nvme_open,
	.release	= nvme_release,
	.getgeo		= nvme_getgeo,
	.revalidate_disk= nvme_revalidate_disk,
	.pr_ops		= &nvme_pr_ops,
};

static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
	unsigned long timeout =
		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_RDY) == bit)
			break;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->device,
				"Device not ready; aborting %s\n", enabled ?
						"initialisation" : "reset");
			return -ENODEV;
		}
	}

	return ret;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config &= ~NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	/* Checking for ctrl->tagset is a trick to avoid sleeping on module
	 * load, since we only need the quirk on reset_controller. Notice
	 * that the HGST device needs this delay only in firmware activation
	 * procedure; unfortunately we have no (easy) way to verify this.
	 */
	if ((ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) && ctrl->tagset)
		msleep(NVME_QUIRK_DELAY_AMOUNT);

	return nvme_wait_ready(ctrl, cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);

int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	/*
	 * Default to a 4K page size, with the intention to update this
	 * path in the future to accommodate architectures with differing
	 * kernel and IO page sizes.
	 */
	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
	int ret;

	if (page_shift < dev_page_min) {
		dev_err(ctrl->device,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << page_shift);
		return -ENODEV;
	}

	ctrl->page_size = 1 << page_shift;

	ctrl->ctrl_config = NVME_CC_CSS_NVM;
	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	ctrl->ctrl_config |= NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, cap, true);
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);

int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
{
	unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
	u32 csts;
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
			break;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->device,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);

static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
		struct request_queue *q)
{
	bool vwc = false;

	if (ctrl->max_hw_sectors) {
		u32 max_segments =
			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;

		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
	}
	if (ctrl->stripe_size)
		blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
	blk_queue_virt_boundary(q, ctrl->page_size - 1);
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(q, vwc, vwc);
}

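/*
 * Rough bring-up order used by the transport drivers (PCIe and the fabrics
 * transports); sketch only, the individual drivers interleave their own
 * transport-specific steps:
 *
 *	nvme_init_ctrl()	register the controller with the core
 *	nvme_disable_ctrl()	clear CC.EN before setting up the admin queue
 *	nvme_enable_ctrl()	program CC and wait for CSTS.RDY
 *	nvme_init_identify()	cache the Identify Controller data below
 *	nvme_queue_scan()	discover and attach the namespaces
 */
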
/*
 * Initialize the cached copies of the Identify data and various controller
 * registers in our nvme_ctrl structure.  This should be called as soon as
 * the admin queue is fully up and running.
 */
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u64 cap;
	int ret, page_shift;
	u32 max_hw_sectors;

	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
	if (ret) {
		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
		return ret;
	}

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
	if (ret) {
		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
		return ret;
	}
	page_shift = NVME_CAP_MPSMIN(cap) + 12;

	if (ctrl->vs >= NVME_VS(1, 1))
		ctrl->subsystem = NVME_CAP_NSSRC(cap);

	ret = nvme_identify_ctrl(ctrl, &id);
	if (ret) {
		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
		return -EIO;
	}

	ctrl->vid = le16_to_cpu(id->vid);
	ctrl->oncs = le16_to_cpup(&id->oncs);
	atomic_set(&ctrl->abort_limit, id->acl + 1);
	ctrl->vwc = id->vwc;
	ctrl->cntlid = le16_to_cpup(&id->cntlid);
	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
	memcpy(ctrl->model, id->mn, sizeof(id->mn));
	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
	if (id->mdts)
		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
	else
		max_hw_sectors = UINT_MAX;
	ctrl->max_hw_sectors =
		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);

	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
		unsigned int max_hw_sectors;

		ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
		max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
		if (ctrl->max_hw_sectors) {
			ctrl->max_hw_sectors = min(max_hw_sectors,
							ctrl->max_hw_sectors);
		} else {
			ctrl->max_hw_sectors = max_hw_sectors;
		}
	}

	nvme_set_queue_limits(ctrl, ctrl->admin_q);
	ctrl->sgls = le32_to_cpu(id->sgls);
	ctrl->kas = le16_to_cpu(id->kas);

	if (ctrl->ops->is_fabrics) {
		ctrl->icdoff = le16_to_cpu(id->icdoff);
		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
		ctrl->maxcmd = le16_to_cpu(id->maxcmd);

		/*
		 * In fabrics we need to verify the cntlid matches the
		 * admin connect
		 */
		if (ctrl->cntlid != le16_to_cpu(id->cntlid))
			ret = -EINVAL;

		if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
			dev_err(ctrl->dev,
				"keep-alive support is mandatory for fabrics\n");
			ret = -EINVAL;
		}
	} else {
		ctrl->cntlid = le16_to_cpu(id->cntlid);
	}

	kfree(id);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_identify);

static int nvme_dev_open(struct inode *inode, struct file *file)
{
	struct nvme_ctrl *ctrl;
	int instance = iminor(inode);
	int ret = -ENODEV;

	spin_lock(&dev_list_lock);
	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
		if (ctrl->instance != instance)
			continue;

		if (!ctrl->admin_q) {
			ret = -EWOULDBLOCK;
			break;
		}
		if (!kref_get_unless_zero(&ctrl->kref))
			break;
		file->private_data = ctrl;
		ret = 0;
		break;
	}
	spin_unlock(&dev_list_lock);

	return ret;
}

static int nvme_dev_release(struct inode *inode, struct file *file)
{
	nvme_put_ctrl(file->private_data);
	return 0;
}

static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
	struct nvme_ns *ns;
	int ret;

	mutex_lock(&ctrl->namespaces_mutex);
	if (list_empty(&ctrl->namespaces)) {
		ret = -ENOTTY;
		goto out_unlock;
	}

	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
		dev_warn(ctrl->device,
			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	dev_warn(ctrl->device,
		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
	kref_get(&ns->kref);
	mutex_unlock(&ctrl->namespaces_mutex);

	ret = nvme_user_cmd(ctrl, ns, argp);
	nvme_put_ns(ns);
	return ret;

out_unlock:
	mutex_unlock(&ctrl->namespaces_mutex);
	return ret;
}

static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	struct nvme_ctrl *ctrl = file->private_data;
	void __user *argp = (void __user *)arg;

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp);
	case NVME_IOCTL_IO_CMD:
		return nvme_dev_user_cmd(ctrl, argp);
	case NVME_IOCTL_RESET:
		dev_warn(ctrl->device, "resetting controller\n");
		return ctrl->ops->reset_ctrl(ctrl);
	case NVME_IOCTL_SUBSYS_RESET:
		return nvme_reset_subsystem(ctrl);
	case NVME_IOCTL_RESCAN:
		nvme_queue_scan(ctrl);
		return 0;
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

static ssize_t nvme_sysfs_reset(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	int ret;

	ret = ctrl->ops->reset_ctrl(ctrl);
	if (ret < 0)
		return ret;
	return count;
}
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);

static ssize_t nvme_sysfs_rescan(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	nvme_queue_scan(ctrl);
	return count;
}
static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);

static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	struct nvme_ctrl *ctrl = ns->ctrl;
	int serial_len = sizeof(ctrl->serial);
	int model_len = sizeof(ctrl->model);

	if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
		return sprintf(buf, "eui.%16phN\n", ns->uuid);

	if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
		return sprintf(buf, "eui.%8phN\n", ns->eui);

	while (ctrl->serial[serial_len - 1] == ' ')
		serial_len--;
	while (ctrl->model[model_len - 1] == ' ')
		model_len--;

	return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
		serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
}
static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);

static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	return sprintf(buf, "%pU\n", ns->uuid);
}
static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);

static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	return sprintf(buf, "%8phd\n", ns->eui);
}
static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);

static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	return sprintf(buf, "%d\n", ns->ns_id);
}
static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);

static struct attribute *nvme_ns_attrs[] = {
	&dev_attr_wwid.attr,
	&dev_attr_uuid.attr,
	&dev_attr_eui.attr,
	&dev_attr_nsid.attr,
	NULL,
};

static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	if (a == &dev_attr_uuid.attr) {
		if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
			return 0;
	}
	if (a == &dev_attr_eui.attr) {
		if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
			return 0;
	}
	return a->mode;
}

static const struct attribute_group nvme_ns_attr_group = {
	.attrs		= nvme_ns_attrs,
	.is_visible	= nvme_ns_attrs_are_visible,
};

#define nvme_show_str_function(field)						\
static ssize_t field##_show(struct device *dev,					\
			    struct device_attribute *attr, char *buf)		\
{										\
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
	return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field);	\
}										\
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);

#define nvme_show_int_function(field)						\
static ssize_t field##_show(struct device *dev,					\
			    struct device_attribute *attr, char *buf)		\
{										\
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
	return sprintf(buf, "%d\n", ctrl->field);				\
}										\
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);

nvme_show_str_function(model);
nvme_show_str_function(serial);
nvme_show_str_function(firmware_rev);
nvme_show_int_function(cntlid);

static ssize_t nvme_sysfs_delete(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	if (device_remove_file_self(dev, attr))
		ctrl->ops->delete_ctrl(ctrl);
	return count;
}
static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);

static ssize_t nvme_sysfs_show_transport(struct device *dev,
					 struct device_attribute *attr,
					 char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
}
static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);

static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
					 struct device_attribute *attr,
					 char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	return snprintf(buf, PAGE_SIZE, "%s\n",
			ctrl->ops->get_subsysnqn(ctrl));
}
static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);

static ssize_t nvme_sysfs_show_address(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);

static struct attribute *nvme_dev_attrs[] = {
	&dev_attr_reset_controller.attr,
	&dev_attr_rescan_controller.attr,
	&dev_attr_model.attr,
	&dev_attr_serial.attr,
	&dev_attr_firmware_rev.attr,
	&dev_attr_cntlid.attr,
	&dev_attr_delete_controller.attr,
	&dev_attr_transport.attr,
	&dev_attr_subsysnqn.attr,
	&dev_attr_address.attr,
	NULL
};

#define CHECK_ATTR(ctrl, a, name)		\
	if ((a) == &dev_attr_##name.attr &&	\
	    !(ctrl)->ops->get_##name)		\
		return 0

static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	if (a == &dev_attr_delete_controller.attr) {
		if (!ctrl->ops->delete_ctrl)
			return 0;
	}

	CHECK_ATTR(ctrl, a, subsysnqn);
	CHECK_ATTR(ctrl, a, address);

	return a->mode;
}

static struct attribute_group nvme_dev_attrs_group = {
	.attrs		= nvme_dev_attrs,
	.is_visible	= nvme_dev_attrs_are_visible,
};

static const struct attribute_group *nvme_dev_attr_groups[] = {
	&nvme_dev_attrs_group,
	NULL,
};

static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);

	return nsa->ns_id - nsb->ns_id;
}

static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns, *ret = NULL;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->ns_id == nsid) {
			kref_get(&ns->kref);
			ret = ns;
			break;
		}
		if (ns->ns_id > nsid)
			break;
	}
	mutex_unlock(&ctrl->namespaces_mutex);
	return ret;
}

static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	struct nvme_id_ns *id;
	char disk_name[DISK_NAME_LEN];
	int node = dev_to_node(ctrl->dev);

	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
	if (!ns)
		return;

	ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
	if (ns->instance < 0)
		goto out_free_ns;

	ns->queue = blk_mq_init_queue(ctrl->tagset);
	if (IS_ERR(ns->queue))
		goto out_release_instance;
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	ns->queue->queuedata = ns;
	ns->ctrl = ctrl;

	kref_init(&ns->kref);
	ns->ns_id = nsid;
	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */

	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	nvme_set_queue_limits(ctrl, ns->queue);

	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);

	if (nvme_revalidate_ns(ns, &id))
		goto out_free_queue;

	if (nvme_nvm_ns_supported(ns, id) &&
				nvme_nvm_register(ns, disk_name, node)) {
		dev_warn(ctrl->dev, "%s: LightNVM init failure\n", __func__);
		goto out_free_id;
	}

	disk = alloc_disk_node(0, node);
	if (!disk)
		goto out_free_id;

	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->flags = GENHD_FL_EXT_DEVT;
	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
	ns->disk = disk;

	__nvme_revalidate_disk(disk, id);

	mutex_lock(&ctrl->namespaces_mutex);
	list_add_tail(&ns->list, &ctrl->namespaces);
	mutex_unlock(&ctrl->namespaces_mutex);

	kref_get(&ctrl->kref);

	kfree(id);

	device_add_disk(ctrl->device, ns->disk);
	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
					&nvme_ns_attr_group))
		pr_warn("%s: failed to create sysfs group for identification\n",
			ns->disk->disk_name);
	if (ns->ndev && nvme_nvm_register_sysfs(ns))
		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
			ns->disk->disk_name);
	return;
out_free_id:
	kfree(id);
out_free_queue:
	blk_cleanup_queue(ns->queue);
out_release_instance:
	ida_simple_remove(&ctrl->ns_ida, ns->instance);
out_free_ns:
	kfree(ns);
}

static void nvme_ns_remove(struct nvme_ns *ns)
{
	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
		return;

	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
		if (blk_get_integrity(ns->disk))
			blk_integrity_unregister(ns->disk);
		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
					&nvme_ns_attr_group);
		if (ns->ndev)
			nvme_nvm_unregister_sysfs(ns);
		del_gendisk(ns->disk);
		blk_mq_abort_requeue_list(ns->queue);
		blk_cleanup_queue(ns->queue);
	}

	mutex_lock(&ns->ctrl->namespaces_mutex);
	list_del_init(&ns->list);
	mutex_unlock(&ns->ctrl->namespaces_mutex);

	nvme_put_ns(ns);
}

static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;

	ns = nvme_find_get_ns(ctrl, nsid);
	if (ns) {
		if (ns->disk && revalidate_disk(ns->disk))
			nvme_ns_remove(ns);
		nvme_put_ns(ns);
	} else
		nvme_alloc_ns(ctrl, nsid);
}

static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
					unsigned nsid)
{
	struct nvme_ns *ns, *next;

	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
		if (ns->ns_id > nsid)
			nvme_ns_remove(ns);
	}
}

static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
{
	struct nvme_ns *ns;
	__le32 *ns_list;
	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
	int ret = 0;

	ns_list = kzalloc(0x1000, GFP_KERNEL);
	if (!ns_list)
		return -ENOMEM;

	for (i = 0; i < num_lists; i++) {
		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
		if (ret)
			goto free;

		for (j = 0; j < min(nn, 1024U); j++) {
			nsid = le32_to_cpu(ns_list[j]);
			if (!nsid)
				goto out;

			nvme_validate_ns(ctrl, nsid);

			while (++prev < nsid) {
				ns = nvme_find_get_ns(ctrl, prev);
				if (ns) {
					nvme_ns_remove(ns);
					nvme_put_ns(ns);
				}
			}
		}
		nn -= j;
	}
out:
	nvme_remove_invalid_namespaces(ctrl, prev);
free:
	kfree(ns_list);
	return ret;
}

static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
{
	unsigned i;

	for (i = 1; i <= nn; i++)
		nvme_validate_ns(ctrl, i);

	nvme_remove_invalid_namespaces(ctrl, nn);
}

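/*
 * Namespace scanning (descriptive summary of nvme_scan_work() below):
 * controllers reporting NVMe 1.1 or later, and not carrying the
 * NVME_QUIRK_IDENTIFY_CNS quirk, are scanned through the Identify
 * "namespace list" (CNS 0x02) pages via nvme_scan_ns_list(); everything
 * else falls back to probing namespace IDs 1..NN sequentially with
 * nvme_scan_ns_sequential().
 */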
static void nvme_scan_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, scan_work);
	struct nvme_id_ctrl *id;
	unsigned nn;

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	if (nvme_identify_ctrl(ctrl, &id))
		return;

	nn = le32_to_cpu(id->nn);
	if (ctrl->vs >= NVME_VS(1, 1) &&
	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
		if (!nvme_scan_ns_list(ctrl, nn))
			goto done;
	}
	nvme_scan_ns_sequential(ctrl, nn);
done:
	mutex_lock(&ctrl->namespaces_mutex);
	list_sort(NULL, &ctrl->namespaces, ns_cmp);
	mutex_unlock(&ctrl->namespaces_mutex);
	kfree(id);
}

void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
	/*
	 * Do not queue new scan work when a controller is reset during
	 * removal.
	 */
	if (ctrl->state == NVME_CTRL_LIVE)
		schedule_work(&ctrl->scan_work);
}
EXPORT_SYMBOL_GPL(nvme_queue_scan);

/*
 * This function iterates the namespace list unlocked to allow recovery from
 * controller failure. It is up to the caller to ensure the namespace list is
 * not modified by scan work while this function is executing.
 */
void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns, *next;

	/*
	 * The dead state indicates the controller was not gracefully
	 * disconnected. In that case, we won't be able to flush any data while
	 * removing the namespaces' disks; fail all the queues now to avoid
	 * potentially having to clean up the failed sync later.
	 */
	if (ctrl->state == NVME_CTRL_DEAD)
		nvme_kill_queues(ctrl);

	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
		nvme_ns_remove(ns);
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);

static void nvme_async_event_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, async_event_work);

	spin_lock_irq(&ctrl->lock);
	while (ctrl->event_limit > 0) {
		int aer_idx = --ctrl->event_limit;

		spin_unlock_irq(&ctrl->lock);
		ctrl->ops->submit_async_event(ctrl, aer_idx);
		spin_lock_irq(&ctrl->lock);
	}
	spin_unlock_irq(&ctrl->lock);
}

void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
		union nvme_result *res)
{
	u32 result = le32_to_cpu(res->u32);
	bool done = true;

	switch (le16_to_cpu(status) >> 1) {
	case NVME_SC_SUCCESS:
		done = false;
		/*FALLTHRU*/
	case NVME_SC_ABORT_REQ:
		++ctrl->event_limit;
		schedule_work(&ctrl->async_event_work);
		break;
	default:
		break;
	}

	if (done)
		return;

	switch (result & 0xff07) {
	case NVME_AER_NOTICE_NS_CHANGED:
		dev_info(ctrl->device, "rescanning\n");
		nvme_queue_scan(ctrl);
		break;
	default:
		dev_warn(ctrl->device, "async event result %08x\n", result);
	}
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);

void nvme_queue_async_events(struct nvme_ctrl *ctrl)
{
	ctrl->event_limit = NVME_NR_AERS;
	schedule_work(&ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_queue_async_events);

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_ctrl *ctrl)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	ctrl->instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_ctrl *ctrl)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, ctrl->instance);
	spin_unlock(&dev_list_lock);
}

void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
	flush_work(&ctrl->async_event_work);
	flush_work(&ctrl->scan_work);
	nvme_remove_namespaces(ctrl);

	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));

	spin_lock(&dev_list_lock);
	list_del(&ctrl->node);
	spin_unlock(&dev_list_lock);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);

static void nvme_free_ctrl(struct kref *kref)
{
	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);

	put_device(ctrl->device);
	nvme_release_instance(ctrl);
	ida_destroy(&ctrl->ns_ida);

	ctrl->ops->free_ctrl(ctrl);
}

void nvme_put_ctrl(struct nvme_ctrl *ctrl)
{
	kref_put(&ctrl->kref, nvme_free_ctrl);
}
EXPORT_SYMBOL_GPL(nvme_put_ctrl);

/*
 * Initialize an NVMe controller structure.  This needs to be called during
 * the earliest initialization so that we have the initialized structure
 * around during probing.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
		const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
	int ret;

	ctrl->state = NVME_CTRL_NEW;
	spin_lock_init(&ctrl->lock);
	INIT_LIST_HEAD(&ctrl->namespaces);
	mutex_init(&ctrl->namespaces_mutex);
	kref_init(&ctrl->kref);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;
	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);

	ret = nvme_set_instance(ctrl);
	if (ret)
		goto out;

	ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
				MKDEV(nvme_char_major, ctrl->instance),
				ctrl, nvme_dev_attr_groups,
				"nvme%d", ctrl->instance);
	if (IS_ERR(ctrl->device)) {
		ret = PTR_ERR(ctrl->device);
		goto out_release_instance;
	}
	get_device(ctrl->device);
	ida_init(&ctrl->ns_ida);

	spin_lock(&dev_list_lock);
	list_add_tail(&ctrl->node, &nvme_ctrl_list);
	spin_unlock(&dev_list_lock);

	return 0;
out_release_instance:
	nvme_release_instance(ctrl);
out:
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);

/**
 * nvme_kill_queues(): Ends all namespace queues
 * @ctrl: the dead controller that needs to end
 *
 * Call this function when the driver determines it is unable to get the
 * controller in a state capable of servicing IO.
 */
void nvme_kill_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		/*
		 * Revalidating a dead namespace sets capacity to 0. This will
		 * end buffered writers dirtying pages that can't be synced.
		 */
		if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags))
			revalidate_disk(ns->disk);

		blk_set_queue_dying(ns->queue);
		blk_mq_abort_requeue_list(ns->queue);
		blk_mq_start_stopped_hw_queues(ns->queue, true);
	}
	mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_kill_queues);

void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_quiesce_queue(ns->queue);
	mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);

void nvme_start_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		blk_mq_start_stopped_hw_queues(ns->queue, true);
		blk_mq_kick_requeue_list(ns->queue);
	}
	mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_start_queues);

int __init nvme_core_init(void)
{
	int result;

	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
							&nvme_dev_fops);
	if (result < 0)
		return result;
	else if (result > 0)
		nvme_char_major = result;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}

	return 0;

unregister_chrdev:
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
	return result;
}

void nvme_core_exit(void)
{
	class_destroy(nvme_class);
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
}

MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);