/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/aer.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/unaligned.h>

#include "nvme.h"

#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))

/*
 * We handle AEN commands ourselves and don't even let the
 * block layer know about them.
 */
#define NVME_NR_AEN_COMMANDS	1
#define NVME_AQ_BLKMQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)

unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static LIST_HEAD(dev_list);
/* Protects dev_list and nvme_thread; needed by the list add/remove paths below. */
static DEFINE_SPINLOCK(dev_list_lock);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

struct nvme_dev;
struct nvme_queue;

static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned queue_count;
	unsigned online_queues;
	unsigned max_qid;
	int q_depth;
	u32 db_stride;
	struct msix_entry *entry;
	void __iomem *bar;
	struct work_struct reset_work;
	struct work_struct scan_work;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	void __iomem *cmb;
	dma_addr_t cmb_dma_addr;
	u64 cmb_size;
	u32 cmbsz;
	unsigned long flags;

#define NVME_CTRL_RESETTING	0
#define NVME_CTRL_REMOVING	1

	struct nvme_ctrl ctrl;
	struct completion ioq_wait;
};

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	struct nvme_command __iomem *sq_cmds_io;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
};

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_init_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	struct nvme_queue *nvmeq;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
	struct scatterlist *sg;
	struct scatterlist inline_sg[0];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev,
		unsigned int size, unsigned int nseg)
{
	return sizeof(__le64 *) * nvme_npages(size, dev) +
			sizeof(struct scatterlist) * nseg;
}

static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
	return sizeof(struct nvme_iod) +
		nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_admin_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[0];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;
	return 0;
}

static void nvme_queue_scan(struct nvme_dev *dev)
{
	/*
	 * Do not queue new scan work when a controller is reset during
	 * removal.
	 */
	if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
		return;
	queue_work(nvme_workq, &dev->scan_work);
}

static void nvme_complete_async_event(struct nvme_dev *dev,
		struct nvme_completion *cqe)
{
	u16 status = le16_to_cpu(cqe->status) >> 1;
	u32 result = le32_to_cpu(cqe->result);

	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
		++dev->ctrl.event_limit;
	if (status != NVME_SC_SUCCESS)
		return;

	switch (result & 0xff07) {
	case NVME_AER_NOTICE_NS_CHANGED:
		dev_info(dev->dev, "rescanning\n");
		nvme_queue_scan(dev);
	default:
		dev_warn(dev->dev, "async event result %08x\n", result);
	}
}

/**
 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
						struct nvme_command *cmd)
{
	u16 tail = nvmeq->sq_tail;

	if (nvmeq->sq_cmds_io)
		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
	else
		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
}

static __le64 **iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	return (__le64 **)(iod->sg + req->nr_phys_segments);
}

static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
	int nseg = rq->nr_phys_segments;
	unsigned size;

	if (rq->cmd_flags & REQ_DISCARD)
		size = sizeof(struct nvme_dsm_range);
	else
		size = blk_rq_bytes(rq);

	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
		iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
		if (!iod->sg)
			return BLK_MQ_RQ_QUEUE_BUSY;
	} else {
		iod->sg = iod->inline_sg;
	}

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;
	iod->length = size;
	return 0;
}

static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	const int last_prp = dev->ctrl.page_size / 8 - 1;
	int i;
	__le64 **list = iod_list(req);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}

	if (iod->sg != iod->inline_sg)
		kfree(iod->sg);
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == v)
		pi->ref_tag = cpu_to_be32(p);
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == p)
		pi->ref_tag = cpu_to_be32(v);
}

/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 *
 * The virtual start sector is the one that was originally submitted by the
 * block layer.	Due to partitioning, MD/DM cloning, etc. the actual physical
 * start sector may be different.  Remap protection information to match the
 * physical LBA on writes, and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	struct nvme_ns *ns = req->rq_disk->private_data;
	struct bio_integrity_payload *bip;
	struct t10_pi_tuple *pi;
	void *p, *pmap;
	u32 i, nlb, ts, phys, virt;

	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
		return;

	bip = bio_integrity(req->bio);
	if (!bip)
		return;

	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

	p = pmap;
	virt = bip_get_seed(bip);
	phys = nvme_block_nr(ns, blk_rq_pos(req));
	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
	ts = ns->disk->queue->integrity.tuple_size;

	for (i = 0; i < nlb; i++, virt++, phys++) {
		pi = (struct t10_pi_tuple *)p;
		dif_swap(phys, virt, pi);
		p += ts;
	}
	kunmap_atomic(pmap);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
#endif

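/*
 * Build the PRP entries for a request.  The first PRP may start at any offset
 * within a controller page; every later entry must be page aligned.  Lists
 * longer than one page are chained: the last slot of each PRP page holds the
 * DMA address of the next PRP page rather than a data pointer.
 */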
static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req,
		int total_len)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->ctrl.page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	__le64 **list = iod_list(req);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0)
		return true;

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		return true;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return false;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return false;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return true;
}

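/*
 * Map the request's scatterlist for DMA and fill in the PRP fields of the
 * command.  For namespaces formatted with protection information the single
 * integrity segment is mapped separately, and its reference tags are remapped
 * before a write is handed to the device.
 */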
static int nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
	int ret = BLK_MQ_RQ_QUEUE_ERROR;

	sg_init_table(iod->sg, req->nr_phys_segments);
	iod->nents = blk_rq_map_sg(q, req, iod->sg);
	if (!iod->nents)
		goto out;

	ret = BLK_MQ_RQ_QUEUE_BUSY;
	if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
		goto out;

	if (!nvme_setup_prps(dev, req, blk_rq_bytes(req)))
		goto out_unmap;

	ret = BLK_MQ_RQ_QUEUE_ERROR;
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;

		sg_init_table(&iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
			goto out_unmap;

		if (rq_data_dir(req))
			nvme_dif_remap(req, nvme_dif_prep);

		if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
			goto out_unmap;
	}

	cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
	if (blk_integrity_rq(req))
		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
	return BLK_MQ_RQ_QUEUE_OK;

out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
}

static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
		if (blk_integrity_rq(req)) {
			if (!rq_data_dir(req))
				nvme_dif_remap(req, nvme_dif_complete);
			dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
		}
	}

	nvme_free_iod(dev, req);
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dsm_range *range;

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
						&iod->first_dma);
	if (!range)
		return BLK_MQ_RQ_QUEUE_BUSY;
	iod_list(req)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
	return BLK_MQ_RQ_QUEUE_OK;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_command cmnd;
	int ret = BLK_MQ_RQ_QUEUE_OK;

	/*
	 * If formatted with metadata, require that the block layer provide a
	 * buffer unless this namespace is formatted such that the metadata can
	 * be stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
					req->cmd_type != REQ_TYPE_DRV_PRIV) {
			blk_mq_end_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	ret = nvme_init_iod(req, dev);
	if (ret)
		return ret;

	if (req->cmd_flags & REQ_DISCARD) {
		ret = nvme_setup_discard(nvmeq, ns, req, &cmnd);
	} else {
		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
			memcpy(&cmnd, req->cmd, sizeof(cmnd));
		else if (req->cmd_flags & REQ_FLUSH)
			nvme_setup_flush(ns, &cmnd);
		else
			nvme_setup_rw(ns, req, &cmnd);

		if (req->nr_phys_segments)
			ret = nvme_map_data(dev, req, &cmnd);
	}

	if (ret)
		goto out;

	cmnd.common.command_id = req->tag;
	blk_mq_start_request(req);

	spin_lock_irq(&nvmeq->q_lock);
	if (unlikely(nvmeq->cq_vector < 0)) {
		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
			ret = BLK_MQ_RQ_QUEUE_BUSY;
		else
			ret = BLK_MQ_RQ_QUEUE_ERROR;
		spin_unlock_irq(&nvmeq->q_lock);
		goto out;
	}
	__nvme_submit_cmd(nvmeq, &cmnd);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;
out:
	nvme_free_iod(dev, req);
	return ret;
}

static void nvme_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dev *dev = iod->nvmeq->dev;
	int error = 0;

	nvme_unmap_data(dev, req);

	if (unlikely(req->errors)) {
		if (nvme_req_needs_retry(req, req->errors)) {
			nvme_requeue_req(req);
			return;
		}

		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
			error = req->errors;
		else
			error = nvme_error_status(req->errors);
	}

	if (unlikely(iod->aborted)) {
		dev_warn(dev->dev,
			"completing aborted command with status: %04x\n",
			req->errors);
	}

	blk_mq_end_request(req, error);
}

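/*
 * Reap completions.  A completion entry is valid when its phase tag matches
 * the queue's current phase; the phase flips every time the head index wraps
 * around the queue.  Once no more valid entries are found, the new head is
 * written to the CQ head doorbell.
 */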
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		struct nvme_completion cqe = nvmeq->cqes[head];
		u16 status = le16_to_cpu(cqe.status);
		struct request *req;

		if ((status & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		if (tag && *tag == cqe.command_id)
			*tag = -1;

		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
			dev_warn(nvmeq->q_dmadev,
				"invalid id %d completed on queue %d\n",
				cqe.command_id, le16_to_cpu(cqe.sq_id));
			continue;
		}

		/*
		 * AEN requests are special as they don't time out and can
		 * survive any kind of queue freeze and often don't respond to
		 * aborts.  We don't even bother to allocate a struct request
		 * for them but rather special case them here.
		 */
		if (unlikely(nvmeq->qid == 0 &&
				cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
			nvme_complete_async_event(nvmeq->dev, &cqe);
			continue;
		}

		req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
			u32 result = le32_to_cpu(cqe.result);
			req->special = (void *)(uintptr_t)result;
		}
		blk_mq_complete_request(req, status >> 1);

	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return;

	if (likely(nvmeq->cq_vector >= 0))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
}

static void nvme_process_cq(struct nvme_queue *nvmeq)
{
	__nvme_process_cq(nvmeq, NULL);
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
	    nvmeq->cq_phase) {
		spin_lock_irq(&nvmeq->q_lock);
		__nvme_process_cq(nvmeq, &tag);
		spin_unlock_irq(&nvmeq->q_lock);

		if (tag == -1)
			return 1;
	}

	return 0;
}

static void nvme_submit_async_event(struct nvme_dev *dev)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit;

	__nvme_submit_cmd(dev->queues[0], &c);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void abort_endio(struct request *req, int error)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	u32 result = (u32)(uintptr_t)req->special;
	u16 status = req->errors;

	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);

	blk_mq_free_request(req);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_command cmd;

	/*
	 * Shutdown immediately if controller times out while starting.  The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error.  All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_HANDLED.
	 */
	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
		dev_warn(dev->dev,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		req->errors = NVME_SC_CANCELLED;
		return BLK_EH_HANDLED;
	}

	/*
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->dev,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		queue_work(nvme_workq, &dev->reset_work);

		/*
		 * Mark the request as handled, since the inline shutdown
		 * forces all outstanding requests to complete.
		 */
		req->errors = NVME_SC_CANCELLED;
		return BLK_EH_HANDLED;
	}

	iod->aborted = 1;

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n",
				 req->tag, nvmeq->qid);

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
			BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->timeout = ADMIN_TIMEOUT;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again.  If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
	struct nvme_queue *nvmeq = data;
	int status;

	if (!blk_mq_request_started(req))
		return;

	dev_dbg_ratelimited(nvmeq->q_dmadev,
		 "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);

	status = NVME_SC_ABORT_REQ;
	if (blk_queue_dying(req->q))
		status |= NVME_SC_DNR;
	blk_mq_complete_request(req, status);
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (nvmeq->sq_cmds)
		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];
		dev->queue_count--;
		dev->queues[i] = NULL;
		nvme_free_queue(nvmeq);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq: queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	int vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->cq_vector == -1) {
		spin_unlock_irq(&nvmeq->q_lock);
		return 1;
	}
	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
	nvmeq->dev->online_queues--;
	nvmeq->cq_vector = -1;
	spin_unlock_irq(&nvmeq->q_lock);

	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->tags && *nvmeq->tags)
		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *nvmeq = dev->queues[0];

	if (!nvmeq)
		return;
	if (nvme_suspend_queue(nvmeq))
		return;

	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
		nvme_disable_ctrl(&dev->ctrl, lo_hi_readq(
						dev->bar + NVME_REG_CAP));

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  dev->ctrl.page_size);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
		mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

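/*
 * I/O submission queues are carved out of the controller memory buffer when
 * one was mapped and use_cmb_sqes is set; otherwise they come from coherent
 * host memory.  The admin queue (qid 0) always lives in host memory.
 */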
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
		unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
						      dev->ctrl.page_size);
		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
		nvmeq->sq_cmds_io = dev->cmb + offset;
	} else {
		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
		if (!nvmeq->sq_cmds)
			return -ENOMEM;
	}

	return 0;
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth)
{
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
					  &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->q_dmadev = dev->dev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->ctrl.instance, qid);
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	nvmeq->cq_vector = -1;
	dev->queues[qid] = nvmeq;

	/* make sure queue descriptor is set before queue count, for kthread */
	mb();
	dev->queue_count++;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;
	spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	nvmeq->cq_vector = qid - 1;
	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result < 0)
		goto release_sq;

	nvme_init_queue(nvmeq, qid);
	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_complete_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_admin_init_hctx,
	.exit_hctx	= nvme_admin_exit_hctx,
	.init_request	= nvme_admin_init_request,
	.timeout	= nvme_timeout,
};

static struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_complete_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue.  Start the
		 * queue to flush these to completion.
		 */
		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;

		/*
		 * Subtract one to leave an empty queue entry for 'Full Queue'
		 * condition.  See NVM-Express 1.2 specification, section 4.1.2.
		 */
		dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);

	return 0;
}

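/*
 * Set up the admin queue: disable the controller, program the admin queue
 * attributes (AQA) and the admin SQ/CQ base addresses (ASQ/ACQ), then
 * re-enable the controller and request the admin interrupt.
 */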
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
	struct nvme_queue *nvmeq;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
						NVME_CAP_NSSRC(cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl, cap);
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl, cap);
	if (result)
		goto free_nvmeq;

	nvmeq->cq_vector = 0;
	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result) {
		nvmeq->cq_vector = -1;
		goto free_nvmeq;
	}

	return result;

 free_nvmeq:
	nvme_free_queues(dev, 0);
	return result;
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			u32 csts = readl(dev->bar + NVME_REG_CSTS);

			/*
			 * Skip controllers currently under reset.
			 */
			if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
				continue;

			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
							csts & NVME_CSTS_CFS) {
				if (queue_work(nvme_workq, &dev->reset_work)) {
					dev_warn(dev->dev,
						"Failed status: %x, reset controller\n",
						readl(dev->bar + NVME_REG_CSTS));
				}
				continue;
			}
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				nvme_process_cq(nvmeq);

				while (i == 0 && dev->ctrl.event_limit > 0)
					nvme_submit_async_event(dev);
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i;
	int ret = 0;

	for (i = dev->queue_count; i <= dev->max_qid; i++) {
		if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	for (i = dev->online_queues; i <= dev->queue_count - 1; i++) {
		ret = nvme_create_queue(dev->queues[i], i);
		if (ret) {
			nvme_free_queues(dev, i);
			break;
		}
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

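/*
 * The optional controller memory buffer is described by the CMBSZ and CMBLOC
 * registers: CMBSZ reports the size and its granularity, CMBLOC the BAR the
 * buffer lives in and the offset within that BAR.  Map it write-combined so
 * that submission queue writes into it stay cheap.
 */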
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
{
	u64 szu, size, offset;
	u32 cmbloc;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	void __iomem *cmb;
	dma_addr_t dma_addr;

	if (!use_cmb_sqes)
		return NULL;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!(NVME_CMB_SZ(dev->cmbsz)))
		return NULL;

	cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
	size = szu * NVME_CMB_SZ(dev->cmbsz);
	offset = szu * NVME_CMB_OFST(cmbloc);
	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));

	if (offset > bar_size)
		return NULL;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge.  Reduce the CMB to
	 * the reported size of the BAR.
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
	cmb = ioremap_wc(dma_addr, size);
	if (!cmb)
		return NULL;

	dev->cmb_dma_addr = dma_addr;
	dev->cmb_size = size;
	return cmb;
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb) {
		iounmap(dev->cmb);
		dev->cmb = NULL;
	}
}

static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

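/*
 * The doorbell registers start 4096 bytes into the BAR.  Each queue owns a
 * submission tail doorbell and a completion head doorbell spaced
 * (4 << CAP.DSTRD) bytes apart, which is why the BAR may have to be remapped
 * once the final number of I/O queues is known.
 */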
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, i, vecs, nr_io_queues, size;

	nr_io_queues = num_possible_cpus();
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	/*
	 * Degraded controllers might return an error when setting the queue
	 * count.  We still want to be able to bring them online and offer
	 * access to the admin queue, as that might be the only way to fix
	 * them up.
	 */
	if (result > 0) {
		dev_err(dev->dev, "Could not set queue count (%d)\n", result);
		nr_io_queues = 0;
		result = 0;
	}

	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			nvme_release_cmb(dev);
	}

	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {
		iounmap(dev->bar);
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
			if (dev->bar)
				break;
			if (!--nr_io_queues)
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);
		} while (1);
		dev->dbs = dev->bar + 4096;
		adminq->q_db = dev->dbs;
	}

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, adminq);

	/*
	 * If we enabled MSI-X early because the device has no INTx, disable
	 * it again before setting up the full range we need.
	 */
	if (!pdev->irq)
		pci_disable_msix(pdev);

	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
	if (vecs < 0) {
		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
		if (vecs < 0) {
			vecs = 1;
		} else {
			for (i = 0; i < vecs; i++)
				dev->entry[i].vector = i + pdev->irq;
		}
	}

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	nr_io_queues = vecs;
	dev->max_qid = nr_io_queues;

	result = queue_request_irq(dev, adminq, adminq->irqname);
	if (result) {
		adminq->cq_vector = -1;
		goto free_queues;
	}

	/* Free previously allocated queues that are no longer usable */
	nvme_free_queues(dev, nr_io_queues + 1);
	return nvme_create_io_queues(dev);

 free_queues:
	nvme_free_queues(dev, 1);
	return result;
}

static void nvme_set_irq_hints(struct nvme_dev *dev)
{
	struct nvme_queue *nvmeq;
	int i;

	for (i = 0; i < dev->online_queues; i++) {
		nvmeq = dev->queues[i];

		if (!nvmeq->tags || !(*nvmeq->tags))
			continue;

		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
					blk_mq_tags_cpumask(*nvmeq->tags));
	}
}

static void nvme_dev_scan(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);

	if (!dev->tagset.tags)
		return;
	nvme_scan_namespaces(&dev->ctrl);
	nvme_set_irq_hints(dev);
}

static void nvme_del_queue_end(struct request *req, int error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	blk_mq_free_request(req);
	complete(&nvmeq->dev->ioq_wait);
}

static void nvme_del_cq_end(struct request *req, int error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	if (!error) {
		unsigned long flags;

		spin_lock_irqsave(&nvmeq->q_lock, flags);
		nvme_process_cq(nvmeq);
		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
	}

	nvme_del_queue_end(req, error);
}

static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = ADMIN_TIMEOUT;
	req->end_io_data = nvmeq;

	blk_execute_rq_nowait(q, NULL, req, false,
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
}

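/*
 * Tear the I/O queues down in two passes: first delete every submission
 * queue, then every completion queue.  The delete commands are issued
 * asynchronously and ioq_wait is used to wait for their completions, bounded
 * by the admin timeout.
 */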
static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	int pass;
	unsigned long timeout;
	u8 opcode = nvme_admin_delete_sq;

	for (pass = 0; pass < 2; pass++) {
		int sent = 0, i = dev->queue_count - 1;

		reinit_completion(&dev->ioq_wait);
 retry:
		timeout = ADMIN_TIMEOUT;
		for (; i > 0; i--) {
			struct nvme_queue *nvmeq = dev->queues[i];

			if (!pass)
				nvme_suspend_queue(nvmeq);
			if (nvme_delete_queue(nvmeq, opcode))
				break;
			++sent;
		}
		while (sent--) {
			timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
			if (timeout == 0)
				return;
			if (i)
				goto retry;
		}
		opcode = nvme_admin_delete_cq;
	}
}

/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Device.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_cmd_size(dev);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->tagset))
			return 0;
		dev->ctrl.tagset = &dev->tagset;
	}
	nvme_queue_scan(dev);
	return 0;
}

static int nvme_pci_enable(struct nvme_dev *dev)
{
	u64 cap;
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices don't advertise INTx interrupts, pre-enable a single
	 * MSIX vec for setup.  We'll adjust this later.
	 */
	if (!pdev->irq) {
		result = pci_enable_msix(pdev, dev->entry, 1);
		if (result < 0)
			goto disable;
	}

	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->dev, "detected Apple NVMe controller, set "
			"queue depth=%u to work around controller resets\n",
			dev->q_depth);
	}

	if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
		dev->cmb = nvme_map_cmb(dev);

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->bar)
		iounmap(dev->bar);
	pci_release_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pdev->msi_enabled)
		pci_disable_msi(pdev);
	else if (pdev->msix_enabled)
		pci_disable_msix(pdev);

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
}

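/*
 * All controllers share a single polling kthread.  It is started when the
 * first device is added to dev_list and stopped again once the list goes
 * empty in nvme_dev_list_remove().
 */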
static int nvme_dev_list_add(struct nvme_dev *dev)
{
	bool start_thread = false;

	spin_lock(&dev_list_lock);
	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
		start_thread = true;
		nvme_thread = NULL;
	}
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	if (start_thread) {
		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
		wake_up_all(&nvme_kthread_wait);
	} else
		wait_event_killable(nvme_kthread_wait, nvme_thread);

	if (IS_ERR_OR_NULL(nvme_thread))
		return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;

	return 0;
}

/*
 * Remove the node from the device list and check
 * for whether or not we need to stop the nvme_thread.
 */
static void nvme_dev_list_remove(struct nvme_dev *dev)
{
	struct task_struct *tmp = NULL;

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
		tmp = nvme_thread;
		nvme_thread = NULL;
	}
	spin_unlock(&dev_list_lock);

	if (tmp)
		kthread_stop(tmp);
}

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	int i;
	u32 csts = -1;

	nvme_dev_list_remove(dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(to_pci_dev(dev->dev))) {
		nvme_stop_queues(&dev->ctrl);
		csts = readl(dev->bar + NVME_REG_CSTS);
	}
	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
		for (i = dev->queue_count - 1; i >= 0; i--) {
			struct nvme_queue *nvmeq = dev->queues[i];
			nvme_suspend_queue(nvmeq);
		}
	} else {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_pci_disable(dev);

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_clear_queue(dev->queues[i]);
	mutex_unlock(&dev->shutdown_lock);
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	put_device(dev->dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
{
	dev_warn(dev->dev, "Removing after probe failure status: %d\n", status);

	kref_get(&dev->ctrl.kref);
	nvme_dev_disable(dev, false);
	if (!schedule_work(&dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
	int result = -ENODEV;

	if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
		goto out;

	/*
	 * If we're called to reset a live controller, first shut it down
	 * before moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);

	set_bit(NVME_CTRL_RESETTING, &dev->flags);

	result = nvme_pci_enable(dev);
	if (result)
		goto out;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto out;

	nvme_init_queue(dev->queues[0], 0);
	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto out;

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto out;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;

	result = nvme_dev_list_add(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't
	 * have any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->dev, "IO queues not created\n");
		nvme_remove_namespaces(&dev->ctrl);
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_dev_add(dev);
	}

	clear_bit(NVME_CTRL_RESETTING, &dev->flags);
	return;

 out:
	nvme_remove_dead_ctrl(dev, result);
}

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	nvme_kill_queues(&dev->ctrl);
	if (pci_get_drvdata(pdev))
		pci_stop_and_remove_bus_device_locked(pdev);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_reset(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
		return -ENODEV;

	if (!queue_work(nvme_workq, &dev->reset_work))
		return -EBUSY;

	flush_work(&dev->reset_work);
	return 0;
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	return !dev->bar || dev->online_queues < 2;
}

static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
{
	return nvme_reset(to_nvme_dev(ctrl));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.io_incapable		= nvme_pci_io_incapable,
	.reset_ctrl		= nvme_pci_reset_ctrl,
	.free_ctrl		= nvme_pci_free_ctrl,
};

static int nvme_dev_map(struct nvme_dev *dev)
{
	int bars;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (!bars)
		return -ENODEV;
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		return -ENODEV;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto release;

	return 0;
 release:
	pci_release_regions(pdev);
	return -ENODEV;
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, 0);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;
	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
							GFP_KERNEL, node);
	if (!dev->entry)
		goto free;
	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
							GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto free;

	INIT_LIST_HEAD(&dev->node);
	INIT_WORK(&dev->scan_work, nvme_dev_scan);
	INIT_WORK(&dev->reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);
	init_completion(&dev->ioq_wait);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto put_pci;

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			id->driver_data);
	if (result)
		goto release_pools;

	queue_work(nvme_workq, &dev->reset_work);
	return 0;

 release_pools:
	nvme_release_prp_pools(dev);
 put_pci:
	put_device(dev->dev);
	nvme_dev_unmap(dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	if (prepare)
		nvme_dev_disable(dev, false);
	else
		queue_work(nvme_workq, &dev->reset_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_dev_disable(dev, true);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	set_bit(NVME_CTRL_REMOVING, &dev->flags);
	pci_set_drvdata(pdev, NULL);
	flush_work(&dev->scan_work);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_uninit_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, true);
	flush_work(&dev->reset_work);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_release_cmb(dev);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_put_ctrl(&dev->ctrl);
}

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_disable(ndev, true);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	queue_work(nvme_workq, &ndev->reset_work);
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
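/*
 * PCI AER error recovery: error_detected() quiesces and disables the
 * controller when the channel is frozen, slot_reset() restores PCI state
 * and schedules reset_work to bring the controller back up, and resume()
 * clears the uncorrectable error status.  reset_notify() applies the same
 * disable/re-init pair before and after a PCI function reset.
 */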
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shut down the controller to quiesce it. The controller will be
	 * restarted after the slot reset through the driver's slot_reset
	 * callback.
	 */
	dev_warn(&pdev->dev, "error detected: state:%d\n", state);
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	dev_info(&pdev->dev, "restart after slot reset\n");
	pci_restore_state(pdev);
	queue_work(nvme_workq, &dev->reset_work);
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
	pci_cleanup_aer_uncorrect_error_status(pdev);
}

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_notify	= nvme_reset_notify,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static const struct pci_device_id nvme_id_table[] = {
	{ PCI_VDEVICE(INTEL, 0x0953),
		.driver_data = NVME_QUIRK_STRIPE_SIZE, },
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	init_waitqueue_head(&nvme_kthread_wait);

	nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!nvme_workq)
		return -ENOMEM;

	result = nvme_core_init();
	if (result < 0)
		goto kill_workq;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto core_exit;
	return 0;

 core_exit:
	nvme_core_exit();
 kill_workq:
	destroy_workqueue(nvme_workq);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	nvme_core_exit();
	destroy_workqueue(nvme_workq);
	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
	_nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);