/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
#include <scsi/sg.h>
#include <asm/unaligned.h>

#include "nvme.h"

#define NVME_MINORS		(1U << MINORBITS)

unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);

unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int nvme_major;
module_param(nvme_major, int, 0);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);

static struct class *nvme_class;

bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state)
{
	enum nvme_ctrl_state old_state;
	bool changed = false;

	/* both the old state read and the update must happen under the lock */
	spin_lock_irq(&ctrl->lock);
	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_RESETTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_RESETTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	default:
		break;
	}

	if (changed)
		ctrl->state = new_state;
	spin_unlock_irq(&ctrl->lock);

	return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);

static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	if (ns->type == NVME_NS_LIGHTNVM)
		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);

	spin_lock(&dev_list_lock);
	ns->disk->private_data = NULL;
	spin_unlock(&dev_list_lock);

	put_disk(ns->disk);
	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
	nvme_put_ctrl(ns->ctrl);
	kfree(ns);
}

static void nvme_put_ns(struct nvme_ns *ns)
{
	kref_put(&ns->kref, nvme_free_ns);
}

static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
{
	struct nvme_ns *ns;

	spin_lock(&dev_list_lock);
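	/*
	 * dev_list_lock serializes this lookup against nvme_free_ns(),
	 * which clears disk->private_data under the same lock, so the ns
	 * pointer read below cannot be freed before we have taken our own
	 * reference on it.
	 */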
ns = disk->private_data; 138 if (ns) { 139 if (!kref_get_unless_zero(&ns->kref)) 140 goto fail; 141 if (!try_module_get(ns->ctrl->ops->module)) 142 goto fail_put_ns; 143 } 144 spin_unlock(&dev_list_lock); 145 146 return ns; 147 148 fail_put_ns: 149 kref_put(&ns->kref, nvme_free_ns); 150 fail: 151 spin_unlock(&dev_list_lock); 152 return NULL; 153 } 154 155 void nvme_requeue_req(struct request *req) 156 { 157 unsigned long flags; 158 159 blk_mq_requeue_request(req); 160 spin_lock_irqsave(req->q->queue_lock, flags); 161 if (!blk_queue_stopped(req->q)) 162 blk_mq_kick_requeue_list(req->q); 163 spin_unlock_irqrestore(req->q->queue_lock, flags); 164 } 165 EXPORT_SYMBOL_GPL(nvme_requeue_req); 166 167 struct request *nvme_alloc_request(struct request_queue *q, 168 struct nvme_command *cmd, unsigned int flags) 169 { 170 bool write = cmd->common.opcode & 1; 171 struct request *req; 172 173 req = blk_mq_alloc_request(q, write, flags); 174 if (IS_ERR(req)) 175 return req; 176 177 req->cmd_type = REQ_TYPE_DRV_PRIV; 178 req->cmd_flags |= REQ_FAILFAST_DRIVER; 179 req->__data_len = 0; 180 req->__sector = (sector_t) -1; 181 req->bio = req->biotail = NULL; 182 183 req->cmd = (unsigned char *)cmd; 184 req->cmd_len = sizeof(struct nvme_command); 185 186 return req; 187 } 188 EXPORT_SYMBOL_GPL(nvme_alloc_request); 189 190 static inline void nvme_setup_flush(struct nvme_ns *ns, 191 struct nvme_command *cmnd) 192 { 193 memset(cmnd, 0, sizeof(*cmnd)); 194 cmnd->common.opcode = nvme_cmd_flush; 195 cmnd->common.nsid = cpu_to_le32(ns->ns_id); 196 } 197 198 static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, 199 struct nvme_command *cmnd) 200 { 201 struct nvme_dsm_range *range; 202 struct page *page; 203 int offset; 204 unsigned int nr_bytes = blk_rq_bytes(req); 205 206 range = kmalloc(sizeof(*range), GFP_ATOMIC); 207 if (!range) 208 return BLK_MQ_RQ_QUEUE_BUSY; 209 210 range->cattr = cpu_to_le32(0); 211 range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); 212 range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 213 214 memset(cmnd, 0, sizeof(*cmnd)); 215 cmnd->dsm.opcode = nvme_cmd_dsm; 216 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 217 cmnd->dsm.nr = 0; 218 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 219 220 req->completion_data = range; 221 page = virt_to_page(range); 222 offset = offset_in_page(range); 223 blk_add_request_payload(req, page, offset, sizeof(*range)); 224 225 /* 226 * we set __data_len back to the size of the area to be discarded 227 * on disk. This allows us to report completion on the full amount 228 * of blocks described by the request. 229 */ 230 req->__data_len = nr_bytes; 231 232 return 0; 233 } 234 235 static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, 236 struct nvme_command *cmnd) 237 { 238 u16 control = 0; 239 u32 dsmgmt = 0; 240 241 if (req->cmd_flags & REQ_FUA) 242 control |= NVME_RW_FUA; 243 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 244 control |= NVME_RW_LR; 245 246 if (req->cmd_flags & REQ_RAHEAD) 247 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 248 249 memset(cmnd, 0, sizeof(*cmnd)); 250 cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); 251 cmnd->rw.command_id = req->tag; 252 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 253 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 254 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 255 256 if (ns->ms) { 257 switch (ns->pi_type) { 258 case NVME_NS_DPS_PI_TYPE3: 259 control |= NVME_RW_PRINFO_PRCHK_GUARD; 260 break; 261 case NVME_NS_DPS_PI_TYPE1: 262 case NVME_NS_DPS_PI_TYPE2: 263 control |= NVME_RW_PRINFO_PRCHK_GUARD | 264 NVME_RW_PRINFO_PRCHK_REF; 265 cmnd->rw.reftag = cpu_to_le32( 266 nvme_block_nr(ns, blk_rq_pos(req))); 267 break; 268 } 269 if (!blk_integrity_rq(req)) 270 control |= NVME_RW_PRINFO_PRACT; 271 } 272 273 cmnd->rw.control = cpu_to_le16(control); 274 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 275 } 276 277 int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 278 struct nvme_command *cmd) 279 { 280 int ret = 0; 281 282 if (req->cmd_type == REQ_TYPE_DRV_PRIV) 283 memcpy(cmd, req->cmd, sizeof(*cmd)); 284 else if (req->cmd_flags & REQ_FLUSH) 285 nvme_setup_flush(ns, cmd); 286 else if (req->cmd_flags & REQ_DISCARD) 287 ret = nvme_setup_discard(ns, req, cmd); 288 else 289 nvme_setup_rw(ns, req, cmd); 290 291 return ret; 292 } 293 EXPORT_SYMBOL_GPL(nvme_setup_cmd); 294 295 /* 296 * Returns 0 on success. If the result is negative, it's a Linux error code; 297 * if the result is positive, it's an NVM Express status code 298 */ 299 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 300 struct nvme_completion *cqe, void *buffer, unsigned bufflen, 301 unsigned timeout) 302 { 303 struct request *req; 304 int ret; 305 306 req = nvme_alloc_request(q, cmd, 0); 307 if (IS_ERR(req)) 308 return PTR_ERR(req); 309 310 req->timeout = timeout ? timeout : ADMIN_TIMEOUT; 311 req->special = cqe; 312 313 if (buffer && bufflen) { 314 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); 315 if (ret) 316 goto out; 317 } 318 319 blk_execute_rq(req->q, NULL, req, 0); 320 ret = req->errors; 321 out: 322 blk_mq_free_request(req); 323 return ret; 324 } 325 326 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 327 void *buffer, unsigned bufflen) 328 { 329 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0); 330 } 331 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); 332 333 int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, 334 void __user *ubuffer, unsigned bufflen, 335 void __user *meta_buffer, unsigned meta_len, u32 meta_seed, 336 u32 *result, unsigned timeout) 337 { 338 bool write = cmd->common.opcode & 1; 339 struct nvme_completion cqe; 340 struct nvme_ns *ns = q->queuedata; 341 struct gendisk *disk = ns ? ns->disk : NULL; 342 struct request *req; 343 struct bio *bio = NULL; 344 void *meta = NULL; 345 int ret; 346 347 req = nvme_alloc_request(q, cmd, 0); 348 if (IS_ERR(req)) 349 return PTR_ERR(req); 350 351 req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; 352 req->special = &cqe; 353 354 if (ubuffer && bufflen) { 355 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, 356 GFP_KERNEL); 357 if (ret) 358 goto out; 359 bio = req->bio; 360 361 if (!disk) 362 goto submit; 363 bio->bi_bdev = bdget_disk(disk, 0); 364 if (!bio->bi_bdev) { 365 ret = -ENODEV; 366 goto out_unmap; 367 } 368 369 if (meta_buffer && meta_len) { 370 struct bio_integrity_payload *bip; 371 372 meta = kmalloc(meta_len, GFP_KERNEL); 373 if (!meta) { 374 ret = -ENOMEM; 375 goto out_unmap; 376 } 377 378 if (write) { 379 if (copy_from_user(meta, meta_buffer, 380 meta_len)) { 381 ret = -EFAULT; 382 goto out_free_meta; 383 } 384 } 385 386 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 387 if (IS_ERR(bip)) { 388 ret = PTR_ERR(bip); 389 goto out_free_meta; 390 } 391 392 bip->bip_iter.bi_size = meta_len; 393 bip->bip_iter.bi_sector = meta_seed; 394 395 ret = bio_integrity_add_page(bio, virt_to_page(meta), 396 meta_len, offset_in_page(meta)); 397 if (ret != meta_len) { 398 ret = -ENOMEM; 399 goto out_free_meta; 400 } 401 } 402 } 403 submit: 404 blk_execute_rq(req->q, disk, req, 0); 405 ret = req->errors; 406 if (result) 407 *result = le32_to_cpu(cqe.result); 408 if (meta && !ret && !write) { 409 if (copy_to_user(meta_buffer, meta, meta_len)) 410 ret = -EFAULT; 411 } 412 out_free_meta: 413 kfree(meta); 414 out_unmap: 415 if (bio) { 416 if (disk && bio->bi_bdev) 417 bdput(bio->bi_bdev); 418 blk_rq_unmap_user(bio); 419 } 420 out: 421 blk_mq_free_request(req); 422 return ret; 423 } 424 425 int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, 426 void __user *ubuffer, unsigned bufflen, u32 *result, 427 unsigned timeout) 428 { 429 return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, 430 result, timeout); 431 } 432 433 int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) 434 { 435 struct nvme_command c = { }; 436 int error; 437 438 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 439 c.identify.opcode = nvme_admin_identify; 440 c.identify.cns = cpu_to_le32(1); 441 442 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); 443 if (!*id) 444 return -ENOMEM; 445 446 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 447 sizeof(struct nvme_id_ctrl)); 448 if (error) 449 kfree(*id); 450 return error; 451 } 452 453 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) 454 { 455 struct nvme_command c = { }; 456 457 c.identify.opcode = nvme_admin_identify; 458 c.identify.cns = cpu_to_le32(2); 459 c.identify.nsid = cpu_to_le32(nsid); 460 return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); 461 } 462 463 int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, 464 struct nvme_id_ns **id) 465 { 466 struct nvme_command c = { }; 467 int error; 468 469 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 470 c.identify.opcode = nvme_admin_identify, 471 c.identify.nsid = cpu_to_le32(nsid), 472 473 *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); 474 if (!*id) 475 return -ENOMEM; 476 477 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 478 sizeof(struct nvme_id_ns)); 479 if (error) 480 kfree(*id); 481 return error; 482 } 483 484 int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, 485 dma_addr_t dma_addr, u32 *result) 486 { 487 struct nvme_command c; 488 struct nvme_completion cqe; 489 int ret; 490 491 memset(&c, 0, sizeof(c)); 492 c.features.opcode = nvme_admin_get_features; 493 c.features.nsid = 
cpu_to_le32(nsid); 494 c.features.prp1 = cpu_to_le64(dma_addr); 495 c.features.fid = cpu_to_le32(fid); 496 497 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); 498 if (ret >= 0) 499 *result = le32_to_cpu(cqe.result); 500 return ret; 501 } 502 503 int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, 504 dma_addr_t dma_addr, u32 *result) 505 { 506 struct nvme_command c; 507 struct nvme_completion cqe; 508 int ret; 509 510 memset(&c, 0, sizeof(c)); 511 c.features.opcode = nvme_admin_set_features; 512 c.features.prp1 = cpu_to_le64(dma_addr); 513 c.features.fid = cpu_to_le32(fid); 514 c.features.dword11 = cpu_to_le32(dword11); 515 516 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); 517 if (ret >= 0) 518 *result = le32_to_cpu(cqe.result); 519 return ret; 520 } 521 522 int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) 523 { 524 struct nvme_command c = { }; 525 int error; 526 527 c.common.opcode = nvme_admin_get_log_page, 528 c.common.nsid = cpu_to_le32(0xFFFFFFFF), 529 c.common.cdw10[0] = cpu_to_le32( 530 (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | 531 NVME_LOG_SMART), 532 533 *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); 534 if (!*log) 535 return -ENOMEM; 536 537 error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, 538 sizeof(struct nvme_smart_log)); 539 if (error) 540 kfree(*log); 541 return error; 542 } 543 544 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) 545 { 546 u32 q_count = (*count - 1) | ((*count - 1) << 16); 547 u32 result; 548 int status, nr_io_queues; 549 550 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, 551 &result); 552 if (status) 553 return status; 554 555 nr_io_queues = min(result & 0xffff, result >> 16) + 1; 556 *count = min(*count, nr_io_queues); 557 return 0; 558 } 559 EXPORT_SYMBOL_GPL(nvme_set_queue_count); 560 561 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 562 { 563 struct nvme_user_io io; 564 struct nvme_command c; 565 unsigned length, meta_len; 566 void __user *metadata; 567 568 if (copy_from_user(&io, uio, sizeof(io))) 569 return -EFAULT; 570 if (io.flags) 571 return -EINVAL; 572 573 switch (io.opcode) { 574 case nvme_cmd_write: 575 case nvme_cmd_read: 576 case nvme_cmd_compare: 577 break; 578 default: 579 return -EINVAL; 580 } 581 582 length = (io.nblocks + 1) << ns->lba_shift; 583 meta_len = (io.nblocks + 1) * ns->ms; 584 metadata = (void __user *)(uintptr_t)io.metadata; 585 586 if (ns->ext) { 587 length += meta_len; 588 meta_len = 0; 589 } else if (meta_len) { 590 if ((io.metadata & 3) || !io.metadata) 591 return -EINVAL; 592 } 593 594 memset(&c, 0, sizeof(c)); 595 c.rw.opcode = io.opcode; 596 c.rw.flags = io.flags; 597 c.rw.nsid = cpu_to_le32(ns->ns_id); 598 c.rw.slba = cpu_to_le64(io.slba); 599 c.rw.length = cpu_to_le16(io.nblocks); 600 c.rw.control = cpu_to_le16(io.control); 601 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 602 c.rw.reftag = cpu_to_le32(io.reftag); 603 c.rw.apptag = cpu_to_le16(io.apptag); 604 c.rw.appmask = cpu_to_le16(io.appmask); 605 606 return __nvme_submit_user_cmd(ns->queue, &c, 607 (void __user *)(uintptr_t)io.addr, length, 608 metadata, meta_len, io.slba, NULL, 0); 609 } 610 611 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 612 struct nvme_passthru_cmd __user *ucmd) 613 { 614 struct nvme_passthru_cmd cmd; 615 struct nvme_command c; 616 unsigned timeout = 0; 617 int status; 618 619 if (!capable(CAP_SYS_ADMIN)) 620 return -EACCES; 621 if 
(copy_from_user(&cmd, ucmd, sizeof(cmd))) 622 return -EFAULT; 623 if (cmd.flags) 624 return -EINVAL; 625 626 memset(&c, 0, sizeof(c)); 627 c.common.opcode = cmd.opcode; 628 c.common.flags = cmd.flags; 629 c.common.nsid = cpu_to_le32(cmd.nsid); 630 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 631 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 632 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 633 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 634 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 635 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 636 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 637 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 638 639 if (cmd.timeout_ms) 640 timeout = msecs_to_jiffies(cmd.timeout_ms); 641 642 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 643 (void __user *)(uintptr_t)cmd.addr, cmd.data_len, 644 &cmd.result, timeout); 645 if (status >= 0) { 646 if (put_user(cmd.result, &ucmd->result)) 647 return -EFAULT; 648 } 649 650 return status; 651 } 652 653 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, 654 unsigned int cmd, unsigned long arg) 655 { 656 struct nvme_ns *ns = bdev->bd_disk->private_data; 657 658 switch (cmd) { 659 case NVME_IOCTL_ID: 660 force_successful_syscall_return(); 661 return ns->ns_id; 662 case NVME_IOCTL_ADMIN_CMD: 663 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); 664 case NVME_IOCTL_IO_CMD: 665 return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); 666 case NVME_IOCTL_SUBMIT_IO: 667 return nvme_submit_io(ns, (void __user *)arg); 668 #ifdef CONFIG_BLK_DEV_NVME_SCSI 669 case SG_GET_VERSION_NUM: 670 return nvme_sg_get_version_num((void __user *)arg); 671 case SG_IO: 672 return nvme_sg_io(ns, (void __user *)arg); 673 #endif 674 default: 675 return -ENOTTY; 676 } 677 } 678 679 #ifdef CONFIG_COMPAT 680 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, 681 unsigned int cmd, unsigned long arg) 682 { 683 switch (cmd) { 684 case SG_IO: 685 return -ENOIOCTLCMD; 686 } 687 return nvme_ioctl(bdev, mode, cmd, arg); 688 } 689 #else 690 #define nvme_compat_ioctl NULL 691 #endif 692 693 static int nvme_open(struct block_device *bdev, fmode_t mode) 694 { 695 return nvme_get_ns_from_disk(bdev->bd_disk) ? 
0 : -ENXIO; 696 } 697 698 static void nvme_release(struct gendisk *disk, fmode_t mode) 699 { 700 struct nvme_ns *ns = disk->private_data; 701 702 module_put(ns->ctrl->ops->module); 703 nvme_put_ns(ns); 704 } 705 706 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 707 { 708 /* some standard values */ 709 geo->heads = 1 << 6; 710 geo->sectors = 1 << 5; 711 geo->cylinders = get_capacity(bdev->bd_disk) >> 11; 712 return 0; 713 } 714 715 #ifdef CONFIG_BLK_DEV_INTEGRITY 716 static void nvme_init_integrity(struct nvme_ns *ns) 717 { 718 struct blk_integrity integrity; 719 720 switch (ns->pi_type) { 721 case NVME_NS_DPS_PI_TYPE3: 722 integrity.profile = &t10_pi_type3_crc; 723 break; 724 case NVME_NS_DPS_PI_TYPE1: 725 case NVME_NS_DPS_PI_TYPE2: 726 integrity.profile = &t10_pi_type1_crc; 727 break; 728 default: 729 integrity.profile = NULL; 730 break; 731 } 732 integrity.tuple_size = ns->ms; 733 blk_integrity_register(ns->disk, &integrity); 734 blk_queue_max_integrity_segments(ns->queue, 1); 735 } 736 #else 737 static void nvme_init_integrity(struct nvme_ns *ns) 738 { 739 } 740 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 741 742 static void nvme_config_discard(struct nvme_ns *ns) 743 { 744 struct nvme_ctrl *ctrl = ns->ctrl; 745 u32 logical_block_size = queue_logical_block_size(ns->queue); 746 747 if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES) 748 ns->queue->limits.discard_zeroes_data = 1; 749 else 750 ns->queue->limits.discard_zeroes_data = 0; 751 752 ns->queue->limits.discard_alignment = logical_block_size; 753 ns->queue->limits.discard_granularity = logical_block_size; 754 blk_queue_max_discard_sectors(ns->queue, 0xffffffff); 755 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); 756 } 757 758 static int nvme_revalidate_disk(struct gendisk *disk) 759 { 760 struct nvme_ns *ns = disk->private_data; 761 struct nvme_id_ns *id; 762 u8 lbaf, pi_type; 763 u16 old_ms; 764 unsigned short bs; 765 766 if (test_bit(NVME_NS_DEAD, &ns->flags)) { 767 set_capacity(disk, 0); 768 return -ENODEV; 769 } 770 if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { 771 dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", 772 __func__); 773 return -ENODEV; 774 } 775 if (id->ncap == 0) { 776 kfree(id); 777 return -ENODEV; 778 } 779 780 if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { 781 if (nvme_nvm_register(ns->queue, disk->disk_name)) { 782 dev_warn(disk_to_dev(ns->disk), 783 "%s: LightNVM init failure\n", __func__); 784 kfree(id); 785 return -ENODEV; 786 } 787 ns->type = NVME_NS_LIGHTNVM; 788 } 789 790 if (ns->ctrl->vs >= NVME_VS(1, 1)) 791 memcpy(ns->eui, id->eui64, sizeof(ns->eui)); 792 if (ns->ctrl->vs >= NVME_VS(1, 2)) 793 memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); 794 795 old_ms = ns->ms; 796 lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; 797 ns->lba_shift = id->lbaf[lbaf].ds; 798 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); 799 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); 800 801 /* 802 * If identify namespace failed, use default 512 byte block size so 803 * block layer can use before failing read/write for 0 capacity. 804 */ 805 if (ns->lba_shift == 0) 806 ns->lba_shift = 9; 807 bs = 1 << ns->lba_shift; 808 /* XXX: PI implementation requires metadata equal t10 pi tuple size */ 809 pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 
					id->dps & NVME_NS_DPS_PI_MASK : 0;

	blk_mq_freeze_queue(disk->queue);
	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
				ns->ms != old_ms ||
				bs != queue_logical_block_size(disk->queue) ||
				(ns->ms && ns->ext)))
		blk_integrity_unregister(disk);

	ns->pi_type = pi_type;
	blk_queue_logical_block_size(ns->queue, bs);

	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
		nvme_init_integrity(ns);
	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
		set_capacity(disk, 0);
	else
		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);
	blk_mq_unfreeze_queue(disk->queue);

	kfree(id);
	return 0;
}

static char nvme_pr_type(enum pr_type type)
{
	switch (type) {
	case PR_WRITE_EXCLUSIVE:
		return 1;
	case PR_EXCLUSIVE_ACCESS:
		return 2;
	case PR_WRITE_EXCLUSIVE_REG_ONLY:
		return 3;
	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
		return 4;
	case PR_WRITE_EXCLUSIVE_ALL_REGS:
		return 5;
	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
		return 6;
	default:
		return 0;
	}
}

static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
				u64 key, u64 sa_key, u8 op)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	struct nvme_command c;
	u8 data[16] = { 0, };

	put_unaligned_le64(key, &data[0]);
	put_unaligned_le64(sa_key, &data[8]);

	memset(&c, 0, sizeof(c));
	c.common.opcode = op;
	c.common.nsid = cpu_to_le32(ns->ns_id);
	c.common.cdw10[0] = cpu_to_le32(cdw10);

	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
}

static int nvme_pr_register(struct block_device *bdev, u64 old,
		u64 new, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = old ? 2 : 0;
	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}

static int nvme_pr_reserve(struct block_device *bdev, u64 key,
		enum pr_type type, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = nvme_pr_type(type) << 8;
	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
}

static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
		enum pr_type type, bool abort)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}

static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
}

static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}

static const struct pr_ops nvme_pr_ops = {
	.pr_register	= nvme_pr_register,
	.pr_reserve	= nvme_pr_reserve,
	.pr_release	= nvme_pr_release,
	.pr_preempt	= nvme_pr_preempt,
	.pr_clear	= nvme_pr_clear,
};

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_compat_ioctl,
	.open		= nvme_open,
	.release	= nvme_release,
	.getgeo		= nvme_getgeo,
	.revalidate_disk = nvme_revalidate_disk,
	.pr_ops		= &nvme_pr_ops,
};

static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
	unsigned long timeout =
		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_RDY) == bit)
			break;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->device,
				"Device not ready; aborting %s\n", enabled ?
						"initialisation" : "reset");
			return -ENODEV;
		}
	}

	return ret;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config &= ~NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);

int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	/*
	 * Default to a 4K page size, with the intention to update this
	 * path in the future to accommodate architectures with differing
	 * kernel and IO page sizes.
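	 *
	 * Note that CAP.MPSMIN is an exponent relative to the 4K base page
	 * size (the minimum device page size is 2^(12 + MPSMIN) bytes), so
	 * dev_page_min below is a page shift, not a byte count.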
991 */ 992 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; 993 int ret; 994 995 if (page_shift < dev_page_min) { 996 dev_err(ctrl->device, 997 "Minimum device page size %u too large for host (%u)\n", 998 1 << dev_page_min, 1 << page_shift); 999 return -ENODEV; 1000 } 1001 1002 ctrl->page_size = 1 << page_shift; 1003 1004 ctrl->ctrl_config = NVME_CC_CSS_NVM; 1005 ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; 1006 ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; 1007 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1008 ctrl->ctrl_config |= NVME_CC_ENABLE; 1009 1010 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 1011 if (ret) 1012 return ret; 1013 return nvme_wait_ready(ctrl, cap, true); 1014 } 1015 EXPORT_SYMBOL_GPL(nvme_enable_ctrl); 1016 1017 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) 1018 { 1019 unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; 1020 u32 csts; 1021 int ret; 1022 1023 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 1024 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; 1025 1026 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 1027 if (ret) 1028 return ret; 1029 1030 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 1031 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) 1032 break; 1033 1034 msleep(100); 1035 if (fatal_signal_pending(current)) 1036 return -EINTR; 1037 if (time_after(jiffies, timeout)) { 1038 dev_err(ctrl->device, 1039 "Device shutdown incomplete; abort shutdown\n"); 1040 return -ENODEV; 1041 } 1042 } 1043 1044 return ret; 1045 } 1046 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); 1047 1048 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, 1049 struct request_queue *q) 1050 { 1051 bool vwc = false; 1052 1053 if (ctrl->max_hw_sectors) { 1054 u32 max_segments = 1055 (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; 1056 1057 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); 1058 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); 1059 } 1060 if (ctrl->stripe_size) 1061 blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); 1062 blk_queue_virt_boundary(q, ctrl->page_size - 1); 1063 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 1064 vwc = true; 1065 blk_queue_write_cache(q, vwc, vwc); 1066 } 1067 1068 /* 1069 * Initialize the cached copies of the Identify data and various controller 1070 * register in our nvme_ctrl structure. This should be called as soon as 1071 * the admin queue is fully up and running. 
1072 */ 1073 int nvme_init_identify(struct nvme_ctrl *ctrl) 1074 { 1075 struct nvme_id_ctrl *id; 1076 u64 cap; 1077 int ret, page_shift; 1078 1079 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); 1080 if (ret) { 1081 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); 1082 return ret; 1083 } 1084 1085 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); 1086 if (ret) { 1087 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); 1088 return ret; 1089 } 1090 page_shift = NVME_CAP_MPSMIN(cap) + 12; 1091 1092 if (ctrl->vs >= NVME_VS(1, 1)) 1093 ctrl->subsystem = NVME_CAP_NSSRC(cap); 1094 1095 ret = nvme_identify_ctrl(ctrl, &id); 1096 if (ret) { 1097 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); 1098 return -EIO; 1099 } 1100 1101 ctrl->vid = le16_to_cpu(id->vid); 1102 ctrl->oncs = le16_to_cpup(&id->oncs); 1103 atomic_set(&ctrl->abort_limit, id->acl + 1); 1104 ctrl->vwc = id->vwc; 1105 ctrl->cntlid = le16_to_cpup(&id->cntlid); 1106 memcpy(ctrl->serial, id->sn, sizeof(id->sn)); 1107 memcpy(ctrl->model, id->mn, sizeof(id->mn)); 1108 memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); 1109 if (id->mdts) 1110 ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9); 1111 else 1112 ctrl->max_hw_sectors = UINT_MAX; 1113 1114 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { 1115 unsigned int max_hw_sectors; 1116 1117 ctrl->stripe_size = 1 << (id->vs[3] + page_shift); 1118 max_hw_sectors = ctrl->stripe_size >> (page_shift - 9); 1119 if (ctrl->max_hw_sectors) { 1120 ctrl->max_hw_sectors = min(max_hw_sectors, 1121 ctrl->max_hw_sectors); 1122 } else { 1123 ctrl->max_hw_sectors = max_hw_sectors; 1124 } 1125 } 1126 1127 nvme_set_queue_limits(ctrl, ctrl->admin_q); 1128 1129 kfree(id); 1130 return 0; 1131 } 1132 EXPORT_SYMBOL_GPL(nvme_init_identify); 1133 1134 static int nvme_dev_open(struct inode *inode, struct file *file) 1135 { 1136 struct nvme_ctrl *ctrl; 1137 int instance = iminor(inode); 1138 int ret = -ENODEV; 1139 1140 spin_lock(&dev_list_lock); 1141 list_for_each_entry(ctrl, &nvme_ctrl_list, node) { 1142 if (ctrl->instance != instance) 1143 continue; 1144 1145 if (!ctrl->admin_q) { 1146 ret = -EWOULDBLOCK; 1147 break; 1148 } 1149 if (!kref_get_unless_zero(&ctrl->kref)) 1150 break; 1151 file->private_data = ctrl; 1152 ret = 0; 1153 break; 1154 } 1155 spin_unlock(&dev_list_lock); 1156 1157 return ret; 1158 } 1159 1160 static int nvme_dev_release(struct inode *inode, struct file *file) 1161 { 1162 nvme_put_ctrl(file->private_data); 1163 return 0; 1164 } 1165 1166 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) 1167 { 1168 struct nvme_ns *ns; 1169 int ret; 1170 1171 mutex_lock(&ctrl->namespaces_mutex); 1172 if (list_empty(&ctrl->namespaces)) { 1173 ret = -ENOTTY; 1174 goto out_unlock; 1175 } 1176 1177 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); 1178 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 1179 dev_warn(ctrl->device, 1180 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 1181 ret = -EINVAL; 1182 goto out_unlock; 1183 } 1184 1185 dev_warn(ctrl->device, 1186 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 1187 kref_get(&ns->kref); 1188 mutex_unlock(&ctrl->namespaces_mutex); 1189 1190 ret = nvme_user_cmd(ctrl, ns, argp); 1191 nvme_put_ns(ns); 1192 return ret; 1193 1194 out_unlock: 1195 mutex_unlock(&ctrl->namespaces_mutex); 1196 return ret; 1197 } 1198 1199 static long nvme_dev_ioctl(struct file *file, unsigned int cmd, 1200 unsigned long arg) 1201 { 
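	/*
	 * Ioctl path for the controller character device (/dev/nvmeX)
	 * created in nvme_init_ctrl(); per-namespace block devices are
	 * handled by nvme_ioctl() above.  NVME_IOCTL_IO_CMD is only
	 * accepted here when a single namespace is attached, see
	 * nvme_dev_user_cmd().
	 */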
1202 struct nvme_ctrl *ctrl = file->private_data; 1203 void __user *argp = (void __user *)arg; 1204 1205 switch (cmd) { 1206 case NVME_IOCTL_ADMIN_CMD: 1207 return nvme_user_cmd(ctrl, NULL, argp); 1208 case NVME_IOCTL_IO_CMD: 1209 return nvme_dev_user_cmd(ctrl, argp); 1210 case NVME_IOCTL_RESET: 1211 dev_warn(ctrl->device, "resetting controller\n"); 1212 return ctrl->ops->reset_ctrl(ctrl); 1213 case NVME_IOCTL_SUBSYS_RESET: 1214 return nvme_reset_subsystem(ctrl); 1215 default: 1216 return -ENOTTY; 1217 } 1218 } 1219 1220 static const struct file_operations nvme_dev_fops = { 1221 .owner = THIS_MODULE, 1222 .open = nvme_dev_open, 1223 .release = nvme_dev_release, 1224 .unlocked_ioctl = nvme_dev_ioctl, 1225 .compat_ioctl = nvme_dev_ioctl, 1226 }; 1227 1228 static ssize_t nvme_sysfs_reset(struct device *dev, 1229 struct device_attribute *attr, const char *buf, 1230 size_t count) 1231 { 1232 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 1233 int ret; 1234 1235 ret = ctrl->ops->reset_ctrl(ctrl); 1236 if (ret < 0) 1237 return ret; 1238 return count; 1239 } 1240 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 1241 1242 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, 1243 char *buf) 1244 { 1245 struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1246 struct nvme_ctrl *ctrl = ns->ctrl; 1247 int serial_len = sizeof(ctrl->serial); 1248 int model_len = sizeof(ctrl->model); 1249 1250 if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) 1251 return sprintf(buf, "eui.%16phN\n", ns->uuid); 1252 1253 if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) 1254 return sprintf(buf, "eui.%8phN\n", ns->eui); 1255 1256 while (ctrl->serial[serial_len - 1] == ' ') 1257 serial_len--; 1258 while (ctrl->model[model_len - 1] == ' ') 1259 model_len--; 1260 1261 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, 1262 serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); 1263 } 1264 static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); 1265 1266 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, 1267 char *buf) 1268 { 1269 struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1270 return sprintf(buf, "%pU\n", ns->uuid); 1271 } 1272 static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); 1273 1274 static ssize_t eui_show(struct device *dev, struct device_attribute *attr, 1275 char *buf) 1276 { 1277 struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1278 return sprintf(buf, "%8phd\n", ns->eui); 1279 } 1280 static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); 1281 1282 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, 1283 char *buf) 1284 { 1285 struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1286 return sprintf(buf, "%d\n", ns->ns_id); 1287 } 1288 static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); 1289 1290 static struct attribute *nvme_ns_attrs[] = { 1291 &dev_attr_wwid.attr, 1292 &dev_attr_uuid.attr, 1293 &dev_attr_eui.attr, 1294 &dev_attr_nsid.attr, 1295 NULL, 1296 }; 1297 1298 static umode_t nvme_attrs_are_visible(struct kobject *kobj, 1299 struct attribute *a, int n) 1300 { 1301 struct device *dev = container_of(kobj, struct device, kobj); 1302 struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1303 1304 if (a == &dev_attr_uuid.attr) { 1305 if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) 1306 return 0; 1307 } 1308 if (a == &dev_attr_eui.attr) { 1309 if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) 1310 return 0; 1311 } 1312 return a->mode; 1313 } 1314 1315 static const struct attribute_group nvme_ns_attr_group = { 
1316 .attrs = nvme_ns_attrs, 1317 .is_visible = nvme_attrs_are_visible, 1318 }; 1319 1320 #define nvme_show_str_function(field) \ 1321 static ssize_t field##_show(struct device *dev, \ 1322 struct device_attribute *attr, char *buf) \ 1323 { \ 1324 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 1325 return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ 1326 } \ 1327 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 1328 1329 #define nvme_show_int_function(field) \ 1330 static ssize_t field##_show(struct device *dev, \ 1331 struct device_attribute *attr, char *buf) \ 1332 { \ 1333 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 1334 return sprintf(buf, "%d\n", ctrl->field); \ 1335 } \ 1336 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 1337 1338 nvme_show_str_function(model); 1339 nvme_show_str_function(serial); 1340 nvme_show_str_function(firmware_rev); 1341 nvme_show_int_function(cntlid); 1342 1343 static struct attribute *nvme_dev_attrs[] = { 1344 &dev_attr_reset_controller.attr, 1345 &dev_attr_model.attr, 1346 &dev_attr_serial.attr, 1347 &dev_attr_firmware_rev.attr, 1348 &dev_attr_cntlid.attr, 1349 NULL 1350 }; 1351 1352 static struct attribute_group nvme_dev_attrs_group = { 1353 .attrs = nvme_dev_attrs, 1354 }; 1355 1356 static const struct attribute_group *nvme_dev_attr_groups[] = { 1357 &nvme_dev_attrs_group, 1358 NULL, 1359 }; 1360 1361 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 1362 { 1363 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 1364 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 1365 1366 return nsa->ns_id - nsb->ns_id; 1367 } 1368 1369 static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid) 1370 { 1371 struct nvme_ns *ns; 1372 1373 lockdep_assert_held(&ctrl->namespaces_mutex); 1374 1375 list_for_each_entry(ns, &ctrl->namespaces, list) { 1376 if (ns->ns_id == nsid) 1377 return ns; 1378 if (ns->ns_id > nsid) 1379 break; 1380 } 1381 return NULL; 1382 } 1383 1384 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) 1385 { 1386 struct nvme_ns *ns; 1387 struct gendisk *disk; 1388 int node = dev_to_node(ctrl->dev); 1389 1390 lockdep_assert_held(&ctrl->namespaces_mutex); 1391 1392 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 1393 if (!ns) 1394 return; 1395 1396 ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); 1397 if (ns->instance < 0) 1398 goto out_free_ns; 1399 1400 ns->queue = blk_mq_init_queue(ctrl->tagset); 1401 if (IS_ERR(ns->queue)) 1402 goto out_release_instance; 1403 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1404 ns->queue->queuedata = ns; 1405 ns->ctrl = ctrl; 1406 1407 disk = alloc_disk_node(0, node); 1408 if (!disk) 1409 goto out_free_queue; 1410 1411 kref_init(&ns->kref); 1412 ns->ns_id = nsid; 1413 ns->disk = disk; 1414 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 1415 1416 1417 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1418 nvme_set_queue_limits(ctrl, ns->queue); 1419 1420 disk->major = nvme_major; 1421 disk->first_minor = 0; 1422 disk->fops = &nvme_fops; 1423 disk->private_data = ns; 1424 disk->queue = ns->queue; 1425 disk->driverfs_dev = ctrl->device; 1426 disk->flags = GENHD_FL_EXT_DEVT; 1427 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); 1428 1429 if (nvme_revalidate_disk(ns->disk)) 1430 goto out_free_disk; 1431 1432 list_add_tail_rcu(&ns->list, &ctrl->namespaces); 1433 kref_get(&ctrl->kref); 1434 if (ns->type == NVME_NS_LIGHTNVM) 
1435 return; 1436 1437 add_disk(ns->disk); 1438 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, 1439 &nvme_ns_attr_group)) 1440 pr_warn("%s: failed to create sysfs group for identification\n", 1441 ns->disk->disk_name); 1442 return; 1443 out_free_disk: 1444 kfree(disk); 1445 out_free_queue: 1446 blk_cleanup_queue(ns->queue); 1447 out_release_instance: 1448 ida_simple_remove(&ctrl->ns_ida, ns->instance); 1449 out_free_ns: 1450 kfree(ns); 1451 } 1452 1453 static void nvme_ns_remove(struct nvme_ns *ns) 1454 { 1455 lockdep_assert_held(&ns->ctrl->namespaces_mutex); 1456 1457 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 1458 return; 1459 1460 if (ns->disk->flags & GENHD_FL_UP) { 1461 if (blk_get_integrity(ns->disk)) 1462 blk_integrity_unregister(ns->disk); 1463 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, 1464 &nvme_ns_attr_group); 1465 del_gendisk(ns->disk); 1466 blk_mq_abort_requeue_list(ns->queue); 1467 blk_cleanup_queue(ns->queue); 1468 } 1469 list_del_init(&ns->list); 1470 synchronize_rcu(); 1471 nvme_put_ns(ns); 1472 } 1473 1474 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) 1475 { 1476 struct nvme_ns *ns; 1477 1478 ns = nvme_find_ns(ctrl, nsid); 1479 if (ns) { 1480 if (revalidate_disk(ns->disk)) 1481 nvme_ns_remove(ns); 1482 } else 1483 nvme_alloc_ns(ctrl, nsid); 1484 } 1485 1486 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) 1487 { 1488 struct nvme_ns *ns; 1489 __le32 *ns_list; 1490 unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); 1491 int ret = 0; 1492 1493 ns_list = kzalloc(0x1000, GFP_KERNEL); 1494 if (!ns_list) 1495 return -ENOMEM; 1496 1497 for (i = 0; i < num_lists; i++) { 1498 ret = nvme_identify_ns_list(ctrl, prev, ns_list); 1499 if (ret) 1500 goto out; 1501 1502 for (j = 0; j < min(nn, 1024U); j++) { 1503 nsid = le32_to_cpu(ns_list[j]); 1504 if (!nsid) 1505 goto out; 1506 1507 nvme_validate_ns(ctrl, nsid); 1508 1509 while (++prev < nsid) { 1510 ns = nvme_find_ns(ctrl, prev); 1511 if (ns) 1512 nvme_ns_remove(ns); 1513 } 1514 } 1515 nn -= j; 1516 } 1517 out: 1518 kfree(ns_list); 1519 return ret; 1520 } 1521 1522 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) 1523 { 1524 struct nvme_ns *ns, *next; 1525 unsigned i; 1526 1527 lockdep_assert_held(&ctrl->namespaces_mutex); 1528 1529 for (i = 1; i <= nn; i++) 1530 nvme_validate_ns(ctrl, i); 1531 1532 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { 1533 if (ns->ns_id > nn) 1534 nvme_ns_remove(ns); 1535 } 1536 } 1537 1538 static void nvme_scan_work(struct work_struct *work) 1539 { 1540 struct nvme_ctrl *ctrl = 1541 container_of(work, struct nvme_ctrl, scan_work); 1542 struct nvme_id_ctrl *id; 1543 unsigned nn; 1544 1545 if (ctrl->state != NVME_CTRL_LIVE) 1546 return; 1547 1548 if (nvme_identify_ctrl(ctrl, &id)) 1549 return; 1550 1551 mutex_lock(&ctrl->namespaces_mutex); 1552 nn = le32_to_cpu(id->nn); 1553 if (ctrl->vs >= NVME_VS(1, 1) && 1554 !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { 1555 if (!nvme_scan_ns_list(ctrl, nn)) 1556 goto done; 1557 } 1558 nvme_scan_ns_sequential(ctrl, nn); 1559 done: 1560 list_sort(NULL, &ctrl->namespaces, ns_cmp); 1561 mutex_unlock(&ctrl->namespaces_mutex); 1562 kfree(id); 1563 1564 if (ctrl->ops->post_scan) 1565 ctrl->ops->post_scan(ctrl); 1566 } 1567 1568 void nvme_queue_scan(struct nvme_ctrl *ctrl) 1569 { 1570 /* 1571 * Do not queue new scan work when a controller is reset during 1572 * removal. 
1573 */ 1574 if (ctrl->state == NVME_CTRL_LIVE) 1575 schedule_work(&ctrl->scan_work); 1576 } 1577 EXPORT_SYMBOL_GPL(nvme_queue_scan); 1578 1579 void nvme_remove_namespaces(struct nvme_ctrl *ctrl) 1580 { 1581 struct nvme_ns *ns, *next; 1582 1583 mutex_lock(&ctrl->namespaces_mutex); 1584 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) 1585 nvme_ns_remove(ns); 1586 mutex_unlock(&ctrl->namespaces_mutex); 1587 } 1588 EXPORT_SYMBOL_GPL(nvme_remove_namespaces); 1589 1590 static void nvme_async_event_work(struct work_struct *work) 1591 { 1592 struct nvme_ctrl *ctrl = 1593 container_of(work, struct nvme_ctrl, async_event_work); 1594 1595 spin_lock_irq(&ctrl->lock); 1596 while (ctrl->event_limit > 0) { 1597 int aer_idx = --ctrl->event_limit; 1598 1599 spin_unlock_irq(&ctrl->lock); 1600 ctrl->ops->submit_async_event(ctrl, aer_idx); 1601 spin_lock_irq(&ctrl->lock); 1602 } 1603 spin_unlock_irq(&ctrl->lock); 1604 } 1605 1606 void nvme_complete_async_event(struct nvme_ctrl *ctrl, 1607 struct nvme_completion *cqe) 1608 { 1609 u16 status = le16_to_cpu(cqe->status) >> 1; 1610 u32 result = le32_to_cpu(cqe->result); 1611 1612 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { 1613 ++ctrl->event_limit; 1614 schedule_work(&ctrl->async_event_work); 1615 } 1616 1617 if (status != NVME_SC_SUCCESS) 1618 return; 1619 1620 switch (result & 0xff07) { 1621 case NVME_AER_NOTICE_NS_CHANGED: 1622 dev_info(ctrl->device, "rescanning\n"); 1623 nvme_queue_scan(ctrl); 1624 break; 1625 default: 1626 dev_warn(ctrl->device, "async event result %08x\n", result); 1627 } 1628 } 1629 EXPORT_SYMBOL_GPL(nvme_complete_async_event); 1630 1631 void nvme_queue_async_events(struct nvme_ctrl *ctrl) 1632 { 1633 ctrl->event_limit = NVME_NR_AERS; 1634 schedule_work(&ctrl->async_event_work); 1635 } 1636 EXPORT_SYMBOL_GPL(nvme_queue_async_events); 1637 1638 static DEFINE_IDA(nvme_instance_ida); 1639 1640 static int nvme_set_instance(struct nvme_ctrl *ctrl) 1641 { 1642 int instance, error; 1643 1644 do { 1645 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 1646 return -ENODEV; 1647 1648 spin_lock(&dev_list_lock); 1649 error = ida_get_new(&nvme_instance_ida, &instance); 1650 spin_unlock(&dev_list_lock); 1651 } while (error == -EAGAIN); 1652 1653 if (error) 1654 return -ENODEV; 1655 1656 ctrl->instance = instance; 1657 return 0; 1658 } 1659 1660 static void nvme_release_instance(struct nvme_ctrl *ctrl) 1661 { 1662 spin_lock(&dev_list_lock); 1663 ida_remove(&nvme_instance_ida, ctrl->instance); 1664 spin_unlock(&dev_list_lock); 1665 } 1666 1667 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 1668 { 1669 flush_work(&ctrl->async_event_work); 1670 flush_work(&ctrl->scan_work); 1671 nvme_remove_namespaces(ctrl); 1672 1673 device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); 1674 1675 spin_lock(&dev_list_lock); 1676 list_del(&ctrl->node); 1677 spin_unlock(&dev_list_lock); 1678 } 1679 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); 1680 1681 static void nvme_free_ctrl(struct kref *kref) 1682 { 1683 struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); 1684 1685 put_device(ctrl->device); 1686 nvme_release_instance(ctrl); 1687 ida_destroy(&ctrl->ns_ida); 1688 1689 ctrl->ops->free_ctrl(ctrl); 1690 } 1691 1692 void nvme_put_ctrl(struct nvme_ctrl *ctrl) 1693 { 1694 kref_put(&ctrl->kref, nvme_free_ctrl); 1695 } 1696 EXPORT_SYMBOL_GPL(nvme_put_ctrl); 1697 1698 /* 1699 * Initialize a NVMe controller structures. 
This needs to be called during 1700 * earliest initialization so that we have the initialized structured around 1701 * during probing. 1702 */ 1703 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, 1704 const struct nvme_ctrl_ops *ops, unsigned long quirks) 1705 { 1706 int ret; 1707 1708 ctrl->state = NVME_CTRL_NEW; 1709 spin_lock_init(&ctrl->lock); 1710 INIT_LIST_HEAD(&ctrl->namespaces); 1711 mutex_init(&ctrl->namespaces_mutex); 1712 kref_init(&ctrl->kref); 1713 ctrl->dev = dev; 1714 ctrl->ops = ops; 1715 ctrl->quirks = quirks; 1716 INIT_WORK(&ctrl->scan_work, nvme_scan_work); 1717 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); 1718 1719 ret = nvme_set_instance(ctrl); 1720 if (ret) 1721 goto out; 1722 1723 ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, 1724 MKDEV(nvme_char_major, ctrl->instance), 1725 ctrl, nvme_dev_attr_groups, 1726 "nvme%d", ctrl->instance); 1727 if (IS_ERR(ctrl->device)) { 1728 ret = PTR_ERR(ctrl->device); 1729 goto out_release_instance; 1730 } 1731 get_device(ctrl->device); 1732 ida_init(&ctrl->ns_ida); 1733 1734 spin_lock(&dev_list_lock); 1735 list_add_tail(&ctrl->node, &nvme_ctrl_list); 1736 spin_unlock(&dev_list_lock); 1737 1738 return 0; 1739 out_release_instance: 1740 nvme_release_instance(ctrl); 1741 out: 1742 return ret; 1743 } 1744 EXPORT_SYMBOL_GPL(nvme_init_ctrl); 1745 1746 /** 1747 * nvme_kill_queues(): Ends all namespace queues 1748 * @ctrl: the dead controller that needs to end 1749 * 1750 * Call this function when the driver determines it is unable to get the 1751 * controller in a state capable of servicing IO. 1752 */ 1753 void nvme_kill_queues(struct nvme_ctrl *ctrl) 1754 { 1755 struct nvme_ns *ns; 1756 1757 rcu_read_lock(); 1758 list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { 1759 if (!kref_get_unless_zero(&ns->kref)) 1760 continue; 1761 1762 /* 1763 * Revalidating a dead namespace sets capacity to 0. This will 1764 * end buffered writers dirtying pages that can't be synced. 
		 */
		if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
			revalidate_disk(ns->disk);

		blk_set_queue_dying(ns->queue);
		blk_mq_abort_requeue_list(ns->queue);
		blk_mq_start_stopped_hw_queues(ns->queue, true);

		nvme_put_ns(ns);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nvme_kill_queues);

void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	rcu_read_lock();
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		spin_lock_irq(ns->queue->queue_lock);
		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
		spin_unlock_irq(ns->queue->queue_lock);

		blk_mq_cancel_requeue_work(ns->queue);
		blk_mq_stop_hw_queues(ns->queue);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);

void nvme_start_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	rcu_read_lock();
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
		blk_mq_start_stopped_hw_queues(ns->queue, true);
		blk_mq_kick_requeue_list(ns->queue);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nvme_start_queues);

int __init nvme_core_init(void)
{
	int result;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		return result;
	else if (result > 0)
		nvme_major = result;

	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
							&nvme_dev_fops);
	if (result < 0)
		goto unregister_blkdev;
	else if (result > 0)
		nvme_char_major = result;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}

	return 0;

unregister_chrdev:
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
	return result;
}

void nvme_core_exit(void)
{
	class_destroy(nvme_class);
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
	unregister_blkdev(nvme_major, "nvme");
}

MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);
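
/*
 * Illustrative sketch (not part of the driver): the intended calling
 * sequence for a transport driver built on the core API exported above.
 * The foo_* names and foo_ctrl_ops are hypothetical; only the nvme_*
 * functions shown are defined in this file.
 *
 *	static int foo_probe(struct device *dev)
 *	{
 *		struct nvme_ctrl *ctrl = &to_foo_dev(dev)->ctrl;
 *		int ret;
 *
 *		ret = nvme_init_ctrl(ctrl, dev, &foo_ctrl_ops, 0);
 *		if (ret)
 *			return ret;
 *
 *		... transport-specific admin queue setup, nvme_enable_ctrl() ...
 *
 *		ret = nvme_init_identify(ctrl);
 *		if (ret)
 *			goto out;
 *
 *		nvme_queue_scan(ctrl);
 *		return 0;
 *	out:
 *		nvme_uninit_ctrl(ctrl);
 *		nvme_put_ctrl(ctrl);
 *		return ret;
 *	}
 *
 * Teardown mirrors this: nvme_stop_queues(), transport-specific queue
 * teardown, nvme_uninit_ctrl() (which also removes the namespaces), and a
 * final nvme_put_ctrl() to drop the reference taken at probe time.
 */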