/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
#include <scsi/sg.h>
#include <asm/unaligned.h>

#include "nvme.h"

#define NVME_MINORS		(1U << MINORBITS)

static int nvme_major;
module_param(nvme_major, int, 0);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static LIST_HEAD(nvme_ctrl_list);
DEFINE_SPINLOCK(dev_list_lock);

static struct class *nvme_class;

static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	if (ns->type == NVME_NS_LIGHTNVM)
		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);

	spin_lock(&dev_list_lock);
	ns->disk->private_data = NULL;
	spin_unlock(&dev_list_lock);

	nvme_put_ctrl(ns->ctrl);
	put_disk(ns->disk);
	kfree(ns);
}

static void nvme_put_ns(struct nvme_ns *ns)
{
	kref_put(&ns->kref, nvme_free_ns);
}

static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
{
	struct nvme_ns *ns;

	spin_lock(&dev_list_lock);
	ns = disk->private_data;
	if (ns && !kref_get_unless_zero(&ns->kref))
		ns = NULL;
	spin_unlock(&dev_list_lock);

	return ns;
}

void nvme_requeue_req(struct request *req)
{
	unsigned long flags;

	blk_mq_requeue_request(req);
	spin_lock_irqsave(req->q->queue_lock, flags);
	if (!blk_queue_stopped(req->q))
		blk_mq_kick_requeue_list(req->q);
	spin_unlock_irqrestore(req->q->queue_lock, flags);
}
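
/*
 * Allocate a block layer request carrying an NVMe command.  The command is
 * attached as a driver-private payload; no data or bio is set up here, the
 * caller maps kernel or user buffers onto the request afterwards if needed.
 */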
struct request *nvme_alloc_request(struct request_queue *q,
		struct nvme_command *cmd, unsigned int flags)
{
	bool write = cmd->common.opcode & 1;
	struct request *req;

	req = blk_mq_alloc_request(q, write, flags);
	if (IS_ERR(req))
		return req;

	req->cmd_type = REQ_TYPE_DRV_PRIV;
	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	req->__data_len = 0;
	req->__sector = (sector_t) -1;
	req->bio = req->biotail = NULL;

	req->cmd = (unsigned char *)cmd;
	req->cmd_len = sizeof(struct nvme_command);
	req->special = (void *)0;

	return req;
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
{
	struct request *req;
	int ret;

	req = nvme_alloc_request(q, cmd, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (buffer && bufflen) {
		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
		if (ret)
			goto out;
	}

	blk_execute_rq(req->q, NULL, req, 0);
	if (result)
		*result = (u32)(uintptr_t)req->special;
	ret = req->errors;
out:
	blk_mq_free_request(req);
	return ret;
}

int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen)
{
	return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
}

int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
		u32 *result, unsigned timeout)
{
	bool write = cmd->common.opcode & 1;
	struct nvme_ns *ns = q->queuedata;
	struct gendisk *disk = ns ? ns->disk : NULL;
	struct request *req;
	struct bio *bio = NULL;
	void *meta = NULL;
	int ret;

	req = nvme_alloc_request(q, cmd, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (ubuffer && bufflen) {
		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
				GFP_KERNEL);
		if (ret)
			goto out;
		bio = req->bio;

		if (!disk)
			goto submit;
		bio->bi_bdev = bdget_disk(disk, 0);
		if (!bio->bi_bdev) {
			ret = -ENODEV;
			goto out_unmap;
		}

		if (meta_buffer) {
			struct bio_integrity_payload *bip;

			meta = kmalloc(meta_len, GFP_KERNEL);
			if (!meta) {
				ret = -ENOMEM;
				goto out_unmap;
			}

			if (write) {
				if (copy_from_user(meta, meta_buffer,
						meta_len)) {
					ret = -EFAULT;
					goto out_free_meta;
				}
			}

			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
			if (IS_ERR(bip)) {
				ret = PTR_ERR(bip);
				goto out_free_meta;
			}

			bip->bip_iter.bi_size = meta_len;
			bip->bip_iter.bi_sector = meta_seed;

			ret = bio_integrity_add_page(bio, virt_to_page(meta),
					meta_len, offset_in_page(meta));
			if (ret != meta_len) {
				ret = -ENOMEM;
				goto out_free_meta;
			}
		}
	}
submit:
	blk_execute_rq(req->q, disk, req, 0);
	ret = req->errors;
	if (result)
		*result = (u32)(uintptr_t)req->special;
	if (meta && !ret && !write) {
		if (copy_to_user(meta_buffer, meta, meta_len))
			ret = -EFAULT;
	}
out_free_meta:
	kfree(meta);
out_unmap:
	if (bio) {
		if (disk && bio->bi_bdev)
			bdput(bio->bi_bdev);
		blk_rq_unmap_user(bio);
	}
out:
	blk_mq_free_request(req);
	return ret;
}

int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen, u32 *result,
		unsigned timeout)
{
	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
			result, timeout);
}

int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = cpu_to_le32(1);

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}
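
/*
 * Identify with CNS=2: retrieve one 4k page (up to 1024 entries) of active
 * namespace IDs following the given nsid.
 */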
static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid,
		__le32 *ns_list)
{
	struct nvme_command c = { };

	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = cpu_to_le32(2);
	c.identify.nsid = cpu_to_le32(nsid);
	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
}

int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
		struct nvme_id_ns **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);

	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ns));
	if (error)
		kfree(*id);
	return error;
}

int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
		dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
}

int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
		dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
}

int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
{
	struct nvme_command c = { };
	int error;

	c.common.opcode = nvme_admin_get_log_page;
	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
	c.common.cdw10[0] = cpu_to_le32(
			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
			NVME_LOG_SMART);

	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
	if (!*log)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
			sizeof(struct nvme_smart_log));
	if (error)
		kfree(*log);
	return error;
}

int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
	u32 q_count = (*count - 1) | ((*count - 1) << 16);
	u32 result;
	int status, nr_io_queues;

	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
			&result);
	if (status)
		return status;

	nr_io_queues = min(result & 0xffff, result >> 16) + 1;
	*count = min(*count, nr_io_queues);
	return 0;
}
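
/*
 * NVME_IOCTL_SUBMIT_IO: translate a user supplied nvme_user_io into a
 * read/write/compare command and execute it, mapping the data buffer and
 * (when not using extended LBAs) the separate metadata buffer from user
 * space.
 */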
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		break;
	default:
		return -EINVAL;
	}

	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;
	metadata = (void __user *)(uintptr_t)io.metadata;

	if (ns->ext) {
		length += meta_len;
		meta_len = 0;
	} else if (meta_len) {
		if ((io.metadata & 3) || !io.metadata)
			return -EINVAL;
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	return __nvme_submit_user_cmd(ns->queue, &c,
			(void __user *)(uintptr_t)io.addr, length,
			metadata, meta_len, io.slba, NULL, 0);
}

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
			struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
			&cmd.result, timeout);
	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
#ifdef CONFIG_BLK_DEV_NVME_SCSI
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
#endif
	default:
		return -ENOTTY;
	}
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case SG_IO:
		return -ENOIOCTLCMD;
	}
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif
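
/*
 * Block device open/release: take and drop a reference on the namespace so
 * it cannot be freed while the device node is in use.
 */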
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
}

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns(disk->private_data);
}

static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	/* some standard values */
	geo->heads = 1 << 6;
	geo->sectors = 1 << 5;
	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
	return 0;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_init_integrity(struct nvme_ns *ns)
{
	struct blk_integrity integrity;

	switch (ns->pi_type) {
	case NVME_NS_DPS_PI_TYPE3:
		integrity.profile = &t10_pi_type3_crc;
		break;
	case NVME_NS_DPS_PI_TYPE1:
	case NVME_NS_DPS_PI_TYPE2:
		integrity.profile = &t10_pi_type1_crc;
		break;
	default:
		integrity.profile = NULL;
		break;
	}
	integrity.tuple_size = ns->ms;
	blk_integrity_register(ns->disk, &integrity);
	blk_queue_max_integrity_segments(ns->queue, 1);
}
#else
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);

	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}
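
/*
 * Re-read the Identify Namespace data and update block size, capacity,
 * protection information and discard settings accordingly.
 */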
static int nvme_revalidate_disk(struct gendisk *disk)
{
	struct nvme_ns *ns = disk->private_data;
	struct nvme_id_ns *id;
	u8 lbaf, pi_type;
	u16 old_ms;
	unsigned short bs;

	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
				__func__, ns->ctrl->instance, ns->ns_id);
		return -ENODEV;
	}
	if (id->ncap == 0) {
		kfree(id);
		return -ENODEV;
	}

	if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
		if (nvme_nvm_register(ns->queue, disk->disk_name)) {
			dev_warn(ns->ctrl->dev,
				"%s: LightNVM init failure\n", __func__);
			kfree(id);
			return -ENODEV;
		}
		ns->type = NVME_NS_LIGHTNVM;
	}

	if (ns->ctrl->vs >= NVME_VS(1, 1))
		memcpy(ns->eui, id->eui64, sizeof(ns->eui));
	if (ns->ctrl->vs >= NVME_VS(1, 2))
		memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));

	old_ms = ns->ms;
	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);

	/*
	 * If identify namespace failed, use default 512 byte block size so
	 * block layer can use before failing read/write for 0 capacity.
	 */
	if (ns->lba_shift == 0)
		ns->lba_shift = 9;
	bs = 1 << ns->lba_shift;
	/* XXX: PI implementation requires metadata equal t10 pi tuple size */
	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
				id->dps & NVME_NS_DPS_PI_MASK : 0;

	blk_mq_freeze_queue(disk->queue);
	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
				ns->ms != old_ms ||
				bs != queue_logical_block_size(disk->queue) ||
				(ns->ms && ns->ext)))
		blk_integrity_unregister(disk);

	ns->pi_type = pi_type;
	blk_queue_logical_block_size(ns->queue, bs);

	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
		nvme_init_integrity(ns);
	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
		set_capacity(disk, 0);
	else
		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);
	blk_mq_unfreeze_queue(disk->queue);

	kfree(id);
	return 0;
}

static char nvme_pr_type(enum pr_type type)
{
	switch (type) {
	case PR_WRITE_EXCLUSIVE:
		return 1;
	case PR_EXCLUSIVE_ACCESS:
		return 2;
	case PR_WRITE_EXCLUSIVE_REG_ONLY:
		return 3;
	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
		return 4;
	case PR_WRITE_EXCLUSIVE_ALL_REGS:
		return 5;
	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
		return 6;
	default:
		return 0;
	}
}

static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
				u64 key, u64 sa_key, u8 op)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	struct nvme_command c;
	u8 data[16] = { 0, };

	put_unaligned_le64(key, &data[0]);
	put_unaligned_le64(sa_key, &data[8]);

	memset(&c, 0, sizeof(c));
	c.common.opcode = op;
	c.common.nsid = cpu_to_le32(ns->ns_id);
	c.common.cdw10[0] = cpu_to_le32(cdw10);

	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
}

static int nvme_pr_register(struct block_device *bdev, u64 old,
		u64 new, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = old ? 2 : 0;
	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}

static int nvme_pr_reserve(struct block_device *bdev, u64 key,
		enum pr_type type, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = nvme_pr_type(type) << 8;
	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
}

static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
		enum pr_type type, bool abort)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}
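
/*
 * For the reservation release/clear paths, cdw10 bits 0-2 select the
 * release action, bit 3 ignores the supplied registration key and bits
 * 8-15 carry the reservation type.
 */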
static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
}

static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}

static const struct pr_ops nvme_pr_ops = {
	.pr_register	= nvme_pr_register,
	.pr_reserve	= nvme_pr_reserve,
	.pr_release	= nvme_pr_release,
	.pr_preempt	= nvme_pr_preempt,
	.pr_clear	= nvme_pr_clear,
};

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_compat_ioctl,
	.open		= nvme_open,
	.release	= nvme_release,
	.getgeo		= nvme_getgeo,
	.revalidate_disk= nvme_revalidate_disk,
	.pr_ops		= &nvme_pr_ops,
};

static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
	unsigned long timeout =
		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_RDY) == bit)
			break;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->dev,
				"Device not ready; aborting %s\n", enabled ?
						"initialisation" : "reset");
			return -ENODEV;
		}
	}

	return ret;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config &= ~NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, cap, false);
}
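
/*
 * Program CC (page size, arbitration, queue entry sizes), set the enable
 * bit and wait for the controller to report ready in CSTS.
 */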
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	/*
	 * Default to a 4K page size, with the intention to update this
	 * path in the future to accommodate architectures with differing
	 * kernel and IO page sizes.
	 */
	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
	int ret;

	if (page_shift < dev_page_min) {
		dev_err(ctrl->dev,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << page_shift);
		return -ENODEV;
	}

	ctrl->page_size = 1 << page_shift;

	ctrl->ctrl_config = NVME_CC_CSS_NVM;
	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	ctrl->ctrl_config |= NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, cap, true);
}

int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
{
	unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
	u32 csts;
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
			break;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->dev,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return ret;
}

/*
 * Initialize the cached copies of the Identify data and various controller
 * registers in our nvme_ctrl structure.  This should be called as soon as
 * the admin queue is fully up and running.
 */
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u64 cap;
	int ret, page_shift;

	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
	if (ret) {
		dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
		return ret;
	}

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
	if (ret) {
		dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
		return ret;
	}
	page_shift = NVME_CAP_MPSMIN(cap) + 12;

	if (ctrl->vs >= NVME_VS(1, 1))
		ctrl->subsystem = NVME_CAP_NSSRC(cap);

	ret = nvme_identify_ctrl(ctrl, &id);
	if (ret) {
		dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
		return -EIO;
	}

	ctrl->oncs = le16_to_cpup(&id->oncs);
	atomic_set(&ctrl->abort_limit, id->acl + 1);
	ctrl->vwc = id->vwc;
	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
	memcpy(ctrl->model, id->mn, sizeof(id->mn));
	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
	if (id->mdts)
		ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
	else
		ctrl->max_hw_sectors = UINT_MAX;

	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
		unsigned int max_hw_sectors;

		ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
		max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
		if (ctrl->max_hw_sectors) {
			ctrl->max_hw_sectors = min(max_hw_sectors,
							ctrl->max_hw_sectors);
		} else {
			ctrl->max_hw_sectors = max_hw_sectors;
		}
	}

	kfree(id);
	return 0;
}
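
/*
 * Open of the controller character device: look up the controller by its
 * minor number (instance) and take a reference on it for the lifetime of
 * the file.
 */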
static int nvme_dev_open(struct inode *inode, struct file *file)
{
	struct nvme_ctrl *ctrl;
	int instance = iminor(inode);
	int ret = -ENODEV;

	spin_lock(&dev_list_lock);
	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
		if (ctrl->instance != instance)
			continue;

		if (!ctrl->admin_q) {
			ret = -EWOULDBLOCK;
			break;
		}
		if (!kref_get_unless_zero(&ctrl->kref))
			break;
		file->private_data = ctrl;
		ret = 0;
		break;
	}
	spin_unlock(&dev_list_lock);

	return ret;
}

static int nvme_dev_release(struct inode *inode, struct file *file)
{
	nvme_put_ctrl(file->private_data);
	return 0;
}

static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
	struct nvme_ns *ns;
	int ret;

	mutex_lock(&ctrl->namespaces_mutex);
	if (list_empty(&ctrl->namespaces)) {
		ret = -ENOTTY;
		goto out_unlock;
	}

	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
		dev_warn(ctrl->dev,
			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	dev_warn(ctrl->dev,
		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
	kref_get(&ns->kref);
	mutex_unlock(&ctrl->namespaces_mutex);

	ret = nvme_user_cmd(ctrl, ns, argp);
	nvme_put_ns(ns);
	return ret;

out_unlock:
	mutex_unlock(&ctrl->namespaces_mutex);
	return ret;
}

static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	struct nvme_ctrl *ctrl = file->private_data;
	void __user *argp = (void __user *)arg;

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp);
	case NVME_IOCTL_IO_CMD:
		return nvme_dev_user_cmd(ctrl, argp);
	case NVME_IOCTL_RESET:
		dev_warn(ctrl->dev, "resetting controller\n");
		return ctrl->ops->reset_ctrl(ctrl);
	case NVME_IOCTL_SUBSYS_RESET:
		return nvme_reset_subsystem(ctrl);
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

static ssize_t nvme_sysfs_reset(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	int ret;

	ret = ctrl->ops->reset_ctrl(ctrl);
	if (ret < 0)
		return ret;
	return count;
}
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);

static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
	return sprintf(buf, "%pU\n", ns->uuid);
}
static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);

static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
	return sprintf(buf, "%8ph\n", ns->eui);
}
static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);

static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
	return sprintf(buf, "%d\n", ns->ns_id);
}
static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);

static struct attribute *nvme_ns_attrs[] = {
	&dev_attr_uuid.attr,
	&dev_attr_eui.attr,
	&dev_attr_nsid.attr,
	NULL,
};
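
/*
 * Only expose the uuid/eui attributes if the namespace actually reported a
 * non-zero identifier for them.
 */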
static umode_t nvme_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct nvme_ns *ns = dev_to_disk(dev)->private_data;

	if (a == &dev_attr_uuid.attr) {
		if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
			return 0;
	}
	if (a == &dev_attr_eui.attr) {
		if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
			return 0;
	}
	return a->mode;
}

static const struct attribute_group nvme_ns_attr_group = {
	.attrs		= nvme_ns_attrs,
	.is_visible	= nvme_attrs_are_visible,
};

#define nvme_show_function(field)					\
static ssize_t field##_show(struct device *dev,				\
		struct device_attribute *attr, char *buf)		\
{									\
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);			\
	return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \
}									\
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);

nvme_show_function(model);
nvme_show_function(serial);
nvme_show_function(firmware_rev);

static struct attribute *nvme_dev_attrs[] = {
	&dev_attr_reset_controller.attr,
	&dev_attr_model.attr,
	&dev_attr_serial.attr,
	&dev_attr_firmware_rev.attr,
	NULL
};

static struct attribute_group nvme_dev_attrs_group = {
	.attrs = nvme_dev_attrs,
};

static const struct attribute_group *nvme_dev_attr_groups[] = {
	&nvme_dev_attrs_group,
	NULL,
};

static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);

	return nsa->ns_id - nsb->ns_id;
}

static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;

	lockdep_assert_held(&ctrl->namespaces_mutex);

	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->ns_id == nsid)
			return ns;
		if (ns->ns_id > nsid)
			break;
	}
	return NULL;
}
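
/*
 * Allocate a namespace: set up its request queue and gendisk, apply the
 * controller's transfer limits, validate it against the Identify data and
 * register the disk plus its identification sysfs group.
 */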
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int node = dev_to_node(ctrl->dev);

	lockdep_assert_held(&ctrl->namespaces_mutex);

	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
	if (!ns)
		return;

	ns->queue = blk_mq_init_queue(ctrl->tagset);
	if (IS_ERR(ns->queue))
		goto out_free_ns;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	ns->queue->queuedata = ns;
	ns->ctrl = ctrl;

	disk = alloc_disk_node(0, node);
	if (!disk)
		goto out_free_queue;

	kref_init(&ns->kref);
	ns->ns_id = nsid;
	ns->disk = disk;
	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */

	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (ctrl->max_hw_sectors) {
		blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
		blk_queue_max_segments(ns->queue,
			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
	}
	if (ctrl->stripe_size)
		blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
	blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);

	disk->major = nvme_major;
	disk->first_minor = 0;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = ctrl->device;
	disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);

	if (nvme_revalidate_disk(ns->disk))
		goto out_free_disk;

	list_add_tail(&ns->list, &ctrl->namespaces);
	kref_get(&ctrl->kref);
	if (ns->type == NVME_NS_LIGHTNVM)
		return;

	add_disk(ns->disk);
	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
					&nvme_ns_attr_group))
		pr_warn("%s: failed to create sysfs group for identification\n",
			ns->disk->disk_name);
	return;
out_free_disk:
	kfree(disk);
out_free_queue:
	blk_cleanup_queue(ns->queue);
out_free_ns:
	kfree(ns);
}

static void nvme_ns_remove(struct nvme_ns *ns)
{
	bool kill = nvme_io_incapable(ns->ctrl) &&
			!blk_queue_dying(ns->queue);

	lockdep_assert_held(&ns->ctrl->namespaces_mutex);

	if (kill) {
		blk_set_queue_dying(ns->queue);

		/*
		 * The controller was shutdown first if we got here through
		 * device removal. The shutdown may requeue outstanding
		 * requests. These need to be aborted immediately so
		 * del_gendisk doesn't block indefinitely for their completion.
		 */
		blk_mq_abort_requeue_list(ns->queue);
	}
	if (ns->disk->flags & GENHD_FL_UP) {
		if (blk_get_integrity(ns->disk))
			blk_integrity_unregister(ns->disk);
		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
					&nvme_ns_attr_group);
		del_gendisk(ns->disk);
	}
	if (kill || !blk_queue_dying(ns->queue)) {
		blk_mq_abort_requeue_list(ns->queue);
		blk_cleanup_queue(ns->queue);
	}
	list_del_init(&ns->list);
	nvme_put_ns(ns);
}

static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;

	ns = nvme_find_ns(ctrl, nsid);
	if (ns) {
		if (revalidate_disk(ns->disk))
			nvme_ns_remove(ns);
	} else
		nvme_alloc_ns(ctrl, nsid);
}

static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
{
	struct nvme_ns *ns;
	__le32 *ns_list;
	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
	int ret = 0;

	ns_list = kzalloc(0x1000, GFP_KERNEL);
	if (!ns_list)
		return -ENOMEM;

	for (i = 0; i < num_lists; i++) {
		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
		if (ret)
			goto out;

		for (j = 0; j < min(nn, 1024U); j++) {
			nsid = le32_to_cpu(ns_list[j]);
			if (!nsid)
				goto out;

			nvme_validate_ns(ctrl, nsid);

			while (++prev < nsid) {
				ns = nvme_find_ns(ctrl, prev);
				if (ns)
					nvme_ns_remove(ns);
			}
		}
		nn -= j;
	}
out:
	kfree(ns_list);
	return ret;
}

static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
{
	struct nvme_ns *ns, *next;
	unsigned i;

	lockdep_assert_held(&ctrl->namespaces_mutex);

	for (i = 1; i <= nn; i++)
		nvme_validate_ns(ctrl, i);

	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
		if (ns->ns_id > nn)
			nvme_ns_remove(ns);
	}
}
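
/*
 * Scan for namespaces: use the Identify namespace list on NVMe 1.1+
 * controllers (unless quirked off) and fall back to probing every
 * namespace ID up to the controller's reported count sequentially.
 */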
void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	unsigned nn;

	if (nvme_identify_ctrl(ctrl, &id))
		return;

	mutex_lock(&ctrl->namespaces_mutex);
	nn = le32_to_cpu(id->nn);
	if (ctrl->vs >= NVME_VS(1, 1) &&
	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
		if (!nvme_scan_ns_list(ctrl, nn))
			goto done;
	}
	__nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
done:
	list_sort(NULL, &ctrl->namespaces, ns_cmp);
	mutex_unlock(&ctrl->namespaces_mutex);
	kfree(id);
}

void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns, *next;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
		nvme_ns_remove(ns);
	mutex_unlock(&ctrl->namespaces_mutex);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_ctrl *ctrl)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	ctrl->instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_ctrl *ctrl)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, ctrl->instance);
	spin_unlock(&dev_list_lock);
}

void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));

	spin_lock(&dev_list_lock);
	list_del(&ctrl->node);
	spin_unlock(&dev_list_lock);
}

static void nvme_free_ctrl(struct kref *kref)
{
	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);

	put_device(ctrl->device);
	nvme_release_instance(ctrl);

	ctrl->ops->free_ctrl(ctrl);
}

void nvme_put_ctrl(struct nvme_ctrl *ctrl)
{
	kref_put(&ctrl->kref, nvme_free_ctrl);
}

/*
 * Initialize an NVMe controller structure.  This needs to be called during
 * earliest initialization so that we have the initialized structure around
 * during probing.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
		const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
	int ret;

	INIT_LIST_HEAD(&ctrl->namespaces);
	mutex_init(&ctrl->namespaces_mutex);
	kref_init(&ctrl->kref);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;

	ret = nvme_set_instance(ctrl);
	if (ret)
		goto out;

	ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
				MKDEV(nvme_char_major, ctrl->instance),
				dev, nvme_dev_attr_groups,
				"nvme%d", ctrl->instance);
	if (IS_ERR(ctrl->device)) {
		ret = PTR_ERR(ctrl->device);
		goto out_release_instance;
	}
	get_device(ctrl->device);
	dev_set_drvdata(ctrl->device, ctrl);

	spin_lock(&dev_list_lock);
	list_add_tail(&ctrl->node, &nvme_ctrl_list);
	spin_unlock(&dev_list_lock);

	return 0;
out_release_instance:
	nvme_release_instance(ctrl);
out:
	return ret;
}
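
/*
 * Quiesce all namespace queues: mark them stopped so the requeue list is
 * not kicked, cancel pending requeue work and stop the hardware queues.
 * nvme_start_queues() undoes this after a reset.
 */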
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		spin_lock_irq(ns->queue->queue_lock);
		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
		spin_unlock_irq(ns->queue->queue_lock);

		blk_mq_cancel_requeue_work(ns->queue);
		blk_mq_stop_hw_queues(ns->queue);
	}
	mutex_unlock(&ctrl->namespaces_mutex);
}

void nvme_start_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
		blk_mq_start_stopped_hw_queues(ns->queue, true);
		blk_mq_kick_requeue_list(ns->queue);
	}
	mutex_unlock(&ctrl->namespaces_mutex);
}

int __init nvme_core_init(void)
{
	int result;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		return result;
	else if (result > 0)
		nvme_major = result;

	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
							&nvme_dev_fops);
	if (result < 0)
		goto unregister_blkdev;
	else if (result > 0)
		nvme_char_major = result;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}

	return 0;

unregister_chrdev:
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
	return result;
}

void nvme_core_exit(void)
{
	unregister_blkdev(nvme_major, "nvme");
	class_destroy(nvme_class);
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
}