// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */

#include <linux/acpi.h>
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq-dma.h>
#include <linux/blk-integrity.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kstrtox.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/sed-opal.h>

#include "trace.h"
#include "nvme.h"

#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))

/* Optimisation for I/Os between 4k and 128k */
#define NVME_SMALL_POOL_SIZE	256

/*
 * Arbitrary upper bound.
 */
#define NVME_MAX_BYTES		SZ_8M
#define NVME_MAX_NR_DESCRIPTORS	5

/*
 * For data SGLs we support a single descriptor's worth of SGL entries.
 * For PRPs, segments don't matter at all.
 */
#define NVME_MAX_SEGS \
	(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * For metadata SGLs, only the small descriptor is supported, and the first
 * entry is the segment descriptor, which for the data pointer sits in the SQE.
 */
#define NVME_MAX_META_SEGS \
	((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)

/*
 * The last entry is used to link to the next descriptor.
 */
#define PRPS_PER_PAGE \
	(((NVME_CTRL_PAGE_SIZE / sizeof(__le64))) - 1)

/*
 * I/O could be non-aligned both at the beginning and end.
 */
#define MAX_PRP_RANGE \
	(NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))

static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <=
	(1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE));

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0444);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
	"Use SGLs when average request segment size is larger or equal to "
	"this size. Use 0 to disable SGLs.");

#define NVME_PCI_MIN_QUEUE_SIZE 2
#define NVME_PCI_MAX_QUEUE_SIZE 4095
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
	.get = param_get_uint,
};

static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096");

static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
	unsigned int n;
	int ret;

	ret = kstrtouint(val, 10, &n);
	if (ret != 0 || n > blk_mq_num_possible_queues(0))
		return -EINVAL;
	return param_set_uint(val, kp);
}

static const struct kernel_param_ops io_queue_count_ops = {
	.set = io_queue_count_set,
	.get = param_get_uint,
};

static unsigned int write_queues;
module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

static unsigned int poll_queues;
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");

struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);

struct nvme_descriptor_pools {
	struct dma_pool *large;
	struct dma_pool *small;
};

/*
 * Represents an NVM Express device. Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue *queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	unsigned online_queues;
	unsigned max_qid;
	unsigned io_queues[HCTX_MAX_TYPES];
	unsigned int num_vecs;
	u32 q_depth;
	int io_sqes;
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct mutex shutdown_lock;
	bool subsystem;
	u64 cmb_size;
	bool cmb_use_sqes;
	u32 cmbsz;
	u32 cmbloc;
	struct nvme_ctrl ctrl;
	u32 last_ps;
	bool hmb;
	struct sg_table *hmb_sgt;

	mempool_t *iod_meta_mempool;

	/* shadow doorbell buffer support: */
	__le32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	__le32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
	u32 host_mem_descs_size;
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;
	struct nvme_descriptor_pools descriptor_pools[];
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
			NVME_PCI_MAX_QUEUE_SIZE);
}

static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue. Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct nvme_dev *dev;
	struct nvme_descriptor_pools descriptor_pools;
	spinlock_t sq_lock;
	void *sq_cmds;
	/* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
	struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u32 q_depth;
	u16 cq_vector;
	u16 sq_tail;
	u16 last_sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 sqes;
	unsigned long flags;
#define NVMEQ_ENABLED		0
#define NVMEQ_SQ_CMB		1
#define NVMEQ_DELETE_ERROR	2
#define NVMEQ_POLLED		3
	__le32 *dbbuf_sq_db;
	__le32 *dbbuf_cq_db;
	__le32 *dbbuf_sq_ei;
	__le32 *dbbuf_cq_ei;
	struct completion delete_done;
};

/* bits for iod->flags */
enum nvme_iod_flags {
	/* this command has been aborted by the timeout handler */
	IOD_ABORTED		= 1U << 0,

	/* uses the small descriptor pool */
	IOD_SMALL_DESCRIPTOR	= 1U << 1,

	/* single segment dma mapping */
	IOD_SINGLE_SEGMENT	= 1U << 2,
};

/*
 * The nvme_iod describes the data in an I/O.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_command cmd;
	u8 flags;
	u8 nr_descriptors;

	unsigned int total_len;
	struct dma_iova_state dma_state;
	void *descriptors[NVME_MAX_NR_DESCRIPTORS];

	dma_addr_t meta_dma;
	struct sg_table meta_sgt;
	struct nvme_sgl_desc *meta_descriptor;
};

static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
{
	return dev->nr_allocated_queues * 8 * dev->db_stride;
}

static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev);

	if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP))
		return;

	if (dev->dbbuf_dbs) {
		/*
		 * Clear the dbbuf memory so the driver doesn't observe stale
		 * values from the previous instantiation.
		 */
		memset(dev->dbbuf_dbs, 0, mem_size);
		memset(dev->dbbuf_eis, 0, mem_size);
		return;
	}

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		goto fail;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis)
		goto fail_free_dbbuf_dbs;
	return;

fail_free_dbbuf_dbs:
	dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs,
			  dev->dbbuf_dbs_dma_addr);
	dev->dbbuf_dbs = NULL;
fail:
	dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev);

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return;

	nvmeq->dbbuf_sq_db = NULL;
	nvmeq->dbbuf_cq_db = NULL;
	nvmeq->dbbuf_sq_ei = NULL;
	nvmeq->dbbuf_cq_ei = NULL;
}

static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c = { };
	unsigned int i;

	if (!dev->dbbuf_dbs)
		return;

	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);

		for (i = 1; i <= dev->online_queues; i++)
			nvme_dbbuf_free(&dev->queues[i]);
	}
}

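/*
 * Shadow doorbell helper: using 16-bit wrap-around arithmetic, return true
 * only if the event index was passed while the doorbell moved from 'old' to
 * 'new_idx', i.e. the controller asked for an MMIO doorbell write for this
 * update.
 */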
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
					      volatile __le32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value, event_idx;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = le32_to_cpu(*dbbuf_db);
		*dbbuf_db = cpu_to_le32(value);

		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory. The controller needs to provide similar
		 * ordering to ensure the event index is updated before reading
		 * the doorbell.
		 */
		mb();

		event_idx = le32_to_cpu(*dbbuf_ei);
		if (!nvme_dbbuf_need_event(event_idx, value, old_value))
			return false;
	}

	return true;
}

static struct nvme_descriptor_pools *
nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
{
	struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
	size_t small_align = NVME_SMALL_POOL_SIZE;

	if (pools->small)
		return pools; /* already initialized */

	pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
			NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
	if (!pools->large)
		return ERR_PTR(-ENOMEM);

	if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
		small_align = 512;

	pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
			NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
	if (!pools->small) {
		dma_pool_destroy(pools->large);
		pools->large = NULL;
		return ERR_PTR(-ENOMEM);
	}

	return pools;
}

static void nvme_release_descriptor_pools(struct nvme_dev *dev)
{
	unsigned i;

	for (i = 0; i < nr_node_ids; i++) {
		struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];

		dma_pool_destroy(pools->large);
		dma_pool_destroy(pools->small);
	}
}

static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned qid)
{
	struct nvme_dev *dev = to_nvme_dev(data);
	struct nvme_queue *nvmeq = &dev->queues[qid];
	struct nvme_descriptor_pools *pools;
	struct blk_mq_tags *tags;

	tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
	WARN_ON(tags != hctx->tags);
	pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
	if (IS_ERR(pools))
		return PTR_ERR(pools);

	nvmeq->descriptor_pools = *pools;
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	WARN_ON(hctx_idx != 0);
	return nvme_init_hctx_common(hctx, data, 0);
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
}

static int nvme_pci_init_request(struct blk_mq_tag_set *set,
		struct request *req, unsigned int hctx_idx,
		unsigned int numa_node)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	nvme_req(req)->ctrl = set->driver_data;
	nvme_req(req)->cmd = &iod->cmd;
	return 0;
}

static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}

static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = to_nvme_dev(set->driver_data);
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
			BUG_ON(i == HCTX_TYPE_DEFAULT);
			continue;
		}

		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
		map->queue_offset = qoff;
		if (i != HCTX_TYPE_POLL && offset)
			blk_mq_map_hw_queues(map, dev->dev, offset);
		else
			blk_mq_map_queues(map);
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}
}

/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}

static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq,
				    struct nvme_command *cmd)
{
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
		absolute_pointer(cmd), sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
	spin_unlock(&nvmeq->sq_lock);
}

enum nvme_use_sgl {
	SGL_UNSUPPORTED,
	SGL_SUPPORTED,
	SGL_FORCED,
};

static inline bool nvme_pci_metadata_use_sgls(struct request *req)
{
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;

	if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
		return false;
	return req->nr_integrity_segments > 1 ||
		nvme_req(req)->flags & NVME_REQ_USERCMD;
}

static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
		struct request *req)
{
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;

	if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) {
		if (nvme_req(req)->flags & NVME_REQ_USERCMD)
			return SGL_FORCED;
		if (req->nr_integrity_segments > 1)
			return SGL_FORCED;
		return SGL_SUPPORTED;
	}

	return SGL_UNSUPPORTED;
}

static unsigned int nvme_pci_avg_seg_size(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	unsigned int nseg;

	if (blk_rq_dma_map_coalesce(&iod->dma_state))
		nseg = 1;
	else
		nseg = blk_rq_nr_phys_segments(req);
	return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
}

static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
		struct nvme_iod *iod)
{
	if (iod->flags & IOD_SMALL_DESCRIPTOR)
		return nvmeq->descriptor_pools.small;
	return nvmeq->descriptor_pools.large;
}

static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
{
	return cmd->common.flags &
		(NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG);
}

static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
{
	if (nvme_pci_cmd_use_sgl(cmd))
		return le64_to_cpu(cmd->common.dptr.sgl.addr);
	return le64_to_cpu(cmd->common.dptr.prp2);
}

static void nvme_free_descriptors(struct request *req)
{
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
	int i;

	if (iod->nr_descriptors == 1) {
		dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
				dma_addr);
		return;
	}

	for (i = 0; i < iod->nr_descriptors; i++) {
		__le64 *prp_list = iod->descriptors[i];
		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);

		dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
				dma_addr);
		dma_addr = next_dma_addr;
	}
}

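/*
 * Undo the page-by-page DMA mapping of a PRP request: walk prp1 and the
 * chained PRP lists built at setup time and unmap each range.  Only needed
 * when blk_rq_dma_unmap() could not tear the mapping down in one go.
 */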
static void nvme_free_prps(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct device *dma_dev = nvmeq->dev->dev;
	enum dma_data_direction dir = rq_dma_dir(req);
	int length = iod->total_len;
	dma_addr_t dma_addr;
	int i, desc;
	__le64 *prp_list;
	u32 dma_len;

	dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
	dma_len = min_t(u32, length,
		NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
	length -= dma_len;
	if (!length) {
		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
		return;
	}

	if (length <= NVME_CTRL_PAGE_SIZE) {
		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
		dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
		dma_unmap_page(dma_dev, dma_addr, length, dir);
		return;
	}

	i = 0;
	desc = 0;
	prp_list = iod->descriptors[desc];
	do {
		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
			prp_list = iod->descriptors[++desc];
			i = 0;
		}

		dma_addr = le64_to_cpu(prp_list[i++]);
		dma_len = min(length, NVME_CTRL_PAGE_SIZE);
		length -= dma_len;
	} while (length);
}

static void nvme_free_sgls(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct device *dma_dev = nvmeq->dev->dev;
	dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
	unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
	struct nvme_sgl_desc *sg_list = iod->descriptors[0];
	enum dma_data_direction dir = rq_dma_dir(req);

	if (iod->nr_descriptors) {
		unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;

		for (i = 0; i < nr_entries; i++)
			dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
				le32_to_cpu(sg_list[i].length), dir);
	} else {
		dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
	}
}

static void nvme_unmap_data(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct device *dma_dev = nvmeq->dev->dev;

	if (iod->flags & IOD_SINGLE_SEGMENT) {
		static_assert(offsetof(union nvme_data_ptr, prp1) ==
				offsetof(union nvme_data_ptr, sgl.addr));
		dma_unmap_page(dma_dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
				iod->total_len, rq_dma_dir(req));
		return;
	}

	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
		if (nvme_pci_cmd_use_sgl(&iod->cmd))
			nvme_free_sgls(req);
		else
			nvme_free_prps(req);
	}

	if (iod->nr_descriptors)
		nvme_free_descriptors(req);
}

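/*
 * Build the PRP data pointer for a request: PRP1 covers the (possibly
 * unaligned) start of the transfer, and PRP2 either points at the second
 * page or at a chained PRP list, where the last slot of each full
 * descriptor links to the next one.
 */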
static blk_status_t nvme_pci_setup_data_prp(struct request *req,
		struct blk_dma_iter *iter)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	unsigned int length = blk_rq_payload_bytes(req);
	dma_addr_t prp1_dma, prp2_dma = 0;
	unsigned int prp_len, i;
	__le64 *prp_list;

	/*
	 * PRP1 always points to the start of the DMA transfers.
	 *
	 * This is the only PRP (except for the list entries) that could be
	 * non-aligned.
	 */
	prp1_dma = iter->addr;
	prp_len = min(length, NVME_CTRL_PAGE_SIZE -
			(iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
	iod->total_len += prp_len;
	iter->addr += prp_len;
	iter->len -= prp_len;
	length -= prp_len;
	if (!length)
		goto done;

	if (!iter->len) {
		if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
				&iod->dma_state, iter)) {
			if (WARN_ON_ONCE(!iter->status))
				goto bad_sgl;
			goto done;
		}
	}

	/*
	 * PRP2 is usually a list, but can point to data if all data to be
	 * transferred fits into PRP1 + PRP2:
	 */
	if (length <= NVME_CTRL_PAGE_SIZE) {
		prp2_dma = iter->addr;
		iod->total_len += length;
		goto done;
	}

	if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
	    NVME_SMALL_POOL_SIZE / sizeof(__le64))
		iod->flags |= IOD_SMALL_DESCRIPTOR;

	prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
			&prp2_dma);
	if (!prp_list) {
		iter->status = BLK_STS_RESOURCE;
		goto done;
	}
	iod->descriptors[iod->nr_descriptors++] = prp_list;

	i = 0;
	for (;;) {
		prp_list[i++] = cpu_to_le64(iter->addr);
		prp_len = min(length, NVME_CTRL_PAGE_SIZE);
		if (WARN_ON_ONCE(iter->len < prp_len))
			goto bad_sgl;

		iod->total_len += prp_len;
		iter->addr += prp_len;
		iter->len -= prp_len;
		length -= prp_len;
		if (!length)
			break;

		if (iter->len == 0) {
			if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
					&iod->dma_state, iter)) {
				if (WARN_ON_ONCE(!iter->status))
					goto bad_sgl;
				goto done;
			}
		}

		/*
		 * If we've filled the entire descriptor, allocate a new one
		 * that is pointed to by the last entry in the previous PRP
		 * list. To accommodate that, move the last actual entry to
		 * the new descriptor.
		 */
		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
			__le64 *old_prp_list = prp_list;
			dma_addr_t prp_list_dma;

			prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
					GFP_ATOMIC, &prp_list_dma);
			if (!prp_list) {
				iter->status = BLK_STS_RESOURCE;
				goto done;
			}
			iod->descriptors[iod->nr_descriptors++] = prp_list;

			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
			i = 1;
		}
	}

done:
	/*
	 * nvme_unmap_data uses the DPTR field in the SQE to tear down the
	 * mapping, so initialize it even for failures.
	 */
	iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
	iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
	if (unlikely(iter->status))
		nvme_unmap_data(req);
	return iter->status;

bad_sgl:
	dev_err_once(nvmeq->dev->dev,
		"Incorrectly formed request for payload:%d nents:%d\n",
		blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
	return BLK_STS_IOERR;
}

static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct blk_dma_iter *iter)
{
	sge->addr = cpu_to_le64(iter->addr);
	sge->length = cpu_to_le32(iter->len);
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	sge->length = cpu_to_le32(entries * sizeof(*sge));
	sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}

static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
		struct blk_dma_iter *iter)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	unsigned int entries = blk_rq_nr_phys_segments(req);
	struct nvme_sgl_desc *sg_list;
	dma_addr_t sgl_dma;
	unsigned int mapped = 0;

	/* set the transfer type as SGL */
	iod->cmd.common.flags = NVME_CMD_SGL_METABUF;

	if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
		nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter);
		iod->total_len += iter->len;
		return BLK_STS_OK;
	}

	if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
		iod->flags |= IOD_SMALL_DESCRIPTOR;

	sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
			&sgl_dma);
	if (!sg_list)
		return BLK_STS_RESOURCE;
	iod->descriptors[iod->nr_descriptors++] = sg_list;

	do {
		if (WARN_ON_ONCE(mapped == entries)) {
			iter->status = BLK_STS_IOERR;
			break;
		}
		nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
		iod->total_len += iter->len;
	} while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
				iter));

	nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
	if (unlikely(iter->status))
		nvme_free_sgls(req);
	return iter->status;
}

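/*
 * Fast path for single-segment requests: map the lone bvec directly and
 * encode it as either a single SGL entry or a PRP1/PRP2 pair.  Returns
 * BLK_STS_AGAIN when the segment doesn't fit this scheme, so that the
 * caller falls back to the full DMA iterator.
 */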
static blk_status_t nvme_pci_setup_data_simple(struct request *req,
		enum nvme_use_sgl use_sgl)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct bio_vec bv = req_bvec(req);
	unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
	bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2;
	dma_addr_t dma_addr;

	if (!use_sgl && !prp_possible)
		return BLK_STS_AGAIN;
	if (is_pci_p2pdma_page(bv.bv_page))
		return BLK_STS_AGAIN;

	dma_addr = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(nvmeq->dev->dev, dma_addr))
		return BLK_STS_RESOURCE;
	iod->total_len = bv.bv_len;
	iod->flags |= IOD_SINGLE_SEGMENT;

	if (use_sgl == SGL_FORCED || !prp_possible) {
		iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
		iod->cmd.common.dptr.sgl.addr = cpu_to_le64(dma_addr);
		iod->cmd.common.dptr.sgl.length = cpu_to_le32(bv.bv_len);
		iod->cmd.common.dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
	} else {
		unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset;

		iod->cmd.common.dptr.prp1 = cpu_to_le64(dma_addr);
		iod->cmd.common.dptr.prp2 = 0;
		if (bv.bv_len > first_prp_len)
			iod->cmd.common.dptr.prp2 =
				cpu_to_le64(dma_addr + first_prp_len);
	}

	return BLK_STS_OK;
}

static blk_status_t nvme_map_data(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req);
	struct blk_dma_iter iter;
	blk_status_t ret;

	/*
	 * Try to skip the DMA iterator for single segment requests, as that
	 * significantly improves performance for small I/O sizes.
	 */
	if (blk_rq_nr_phys_segments(req) == 1) {
		ret = nvme_pci_setup_data_simple(req, use_sgl);
		if (ret != BLK_STS_AGAIN)
			return ret;
	}

	if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
		return iter.status;

	if (use_sgl == SGL_FORCED ||
	    (use_sgl == SGL_SUPPORTED &&
	     (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
		return nvme_pci_setup_data_sgl(req, &iter);
	return nvme_pci_setup_data_prp(req, &iter);
}

static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
{
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sgl, *sg;
	unsigned int entries;
	dma_addr_t sgl_dma;
	int rc, i;

	iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
	if (!iod->meta_sgt.sgl)
		return BLK_STS_RESOURCE;

	sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
	iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
			iod->meta_sgt.sgl);
	if (!iod->meta_sgt.orig_nents)
		goto out_free_sg;

	rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
			DMA_ATTR_NO_WARN);
	if (rc)
		goto out_free_sg;

	sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
			&sgl_dma);
	if (!sg_list)
		goto out_unmap_sg;

	entries = iod->meta_sgt.nents;
	iod->meta_descriptor = sg_list;
	iod->meta_dma = sgl_dma;

	iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
	iod->cmd.common.metadata = cpu_to_le64(sgl_dma);

	sgl = iod->meta_sgt.sgl;
	if (entries == 1) {
		nvme_pci_sgl_set_data_sg(sg_list, sgl);
		return BLK_STS_OK;
	}

	sgl_dma += sizeof(*sg_list);
	nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
	for_each_sg(sgl, sg, entries, i)
		nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg);

	return BLK_STS_OK;

out_unmap_sg:
	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
out_free_sg:
	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
	return BLK_STS_RESOURCE;
}

static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct bio_vec bv = rq_integrity_vec(req);

	iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
		return BLK_STS_IOERR;
	iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
	return BLK_STS_OK;
}

static blk_status_t nvme_map_metadata(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
	     nvme_pci_metadata_use_sgls(req))
		return nvme_pci_setup_meta_sgls(req);
	return nvme_pci_setup_meta_mptr(req);
}

static blk_status_t nvme_prep_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	blk_status_t ret;

	iod->flags = 0;
	iod->nr_descriptors = 0;
	iod->total_len = 0;
	iod->meta_sgt.nents = 0;

	ret = nvme_setup_cmd(req->q->queuedata, req);
	if (ret)
		return ret;

	if (blk_rq_nr_phys_segments(req)) {
		ret = nvme_map_data(req);
		if (ret)
			goto out_free_cmd;
	}

	if (blk_integrity_rq(req)) {
		ret = nvme_map_metadata(req);
		if (ret)
			goto out_unmap_data;
	}

	nvme_start_request(req);
	return BLK_STS_OK;
out_unmap_data:
	if (blk_rq_nr_phys_segments(req))
		nvme_unmap_data(req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}

static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	blk_status_t ret;

	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
		return BLK_STS_IOERR;

	if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
		return nvme_fail_nonready_command(&dev->ctrl, req);

	ret = nvme_prep_rq(req);
	if (unlikely(ret))
		return ret;
	spin_lock(&nvmeq->sq_lock);
	nvme_sq_copy_cmd(nvmeq, &iod->cmd);
	nvme_write_sq_db(nvmeq, bd->last);
	spin_unlock(&nvmeq->sq_lock);
	return BLK_STS_OK;
}

static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist)
{
	struct request *req;

	if (rq_list_empty(rqlist))
		return;

	spin_lock(&nvmeq->sq_lock);
	while ((req = rq_list_pop(rqlist))) {
		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

		nvme_sq_copy_cmd(nvmeq, &iod->cmd);
	}
	nvme_write_sq_db(nvmeq, true);
	spin_unlock(&nvmeq->sq_lock);
}

static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
{
	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
		return false;
	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
		return false;

	return nvme_prep_rq(req) == BLK_STS_OK;
}

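/*
 * Batched submission entry point: requests on the plug list may belong to
 * different hardware queues, so group consecutive runs per nvme_queue and
 * ring the SQ doorbell once per run.  Requests that fail preparation are
 * handed back on the requeue list.
 */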
static void nvme_queue_rqs(struct rq_list *rqlist)
{
	struct rq_list submit_list = { };
	struct rq_list requeue_list = { };
	struct nvme_queue *nvmeq = NULL;
	struct request *req;

	while ((req = rq_list_pop(rqlist))) {
		if (nvmeq && nvmeq != req->mq_hctx->driver_data)
			nvme_submit_cmds(nvmeq, &submit_list);
		nvmeq = req->mq_hctx->driver_data;

		if (nvme_prep_rq_batch(nvmeq, req))
			rq_list_add_tail(&submit_list, req);
		else
			rq_list_add_tail(&requeue_list, req);
	}

	if (nvmeq)
		nvme_submit_cmds(nvmeq, &submit_list);
	*rqlist = requeue_list;
}

static __always_inline void nvme_unmap_metadata(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;

	if (!iod->meta_sgt.nents) {
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req).bv_len,
			       rq_dma_dir(req));
		return;
	}

	dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
			iod->meta_dma);
	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}

static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
	if (blk_integrity_rq(req))
		nvme_unmap_metadata(req);
	if (blk_rq_nr_phys_segments(req))
		nvme_unmap_data(req);
}

static void nvme_pci_complete_rq(struct request *req)
{
	nvme_pci_unmap_rq(req);
	nvme_complete_rq(req);
}

static void nvme_pci_complete_batch(struct io_comp_batch *iob)
{
	nvme_complete_batch(iob, nvme_pci_unmap_rq);
}

/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];

	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
}

static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return nvmeq->dev->admin_tagset.tags[0];
	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}

static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
				   struct io_comp_batch *iob, u16 idx)
{
	struct nvme_completion *cqe = &nvmeq->cqes[idx];
	__u16 command_id = READ_ONCE(cqe->command_id);
	struct request *req;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts. We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
	if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
	    !blk_mq_add_to_batch(req, iob,
				 nvme_req(req)->status != NVME_SC_SUCCESS,
				 nvme_pci_complete_batch))
		nvme_pci_complete_rq(req);
}

static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
	u32 tmp = nvmeq->cq_head + 1;

	if (tmp == nvmeq->q_depth) {
		nvmeq->cq_head = 0;
		nvmeq->cq_phase ^= 1;
	} else {
		nvmeq->cq_head = tmp;
	}
}

static inline bool nvme_poll_cq(struct nvme_queue *nvmeq,
				struct io_comp_batch *iob)
{
	bool found = false;

	while (nvme_cqe_pending(nvmeq)) {
		found = true;
		/*
		 * load-load control dependency between phase and the rest of
		 * the cqe requires a full read memory barrier
		 */
		dma_rmb();
		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
		nvme_update_cq_head(nvmeq);
	}

	if (found)
		nvme_ring_cq_doorbell(nvmeq);
	return found;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	DEFINE_IO_COMP_BATCH(iob);

	if (nvme_poll_cq(nvmeq, &iob)) {
		if (!rq_list_empty(&iob.req_list))
			nvme_pci_complete_batch(&iob);
		return IRQ_HANDLED;
	}
	return IRQ_NONE;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;

	if (nvme_cqe_pending(nvmeq))
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
}

/*
 * Poll for completions for any interrupt driven queue
 * Can be called from any context.
 */
static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);

	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));

	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	spin_lock(&nvmeq->cq_poll_lock);
	nvme_poll_cq(nvmeq, NULL);
	spin_unlock(&nvmeq->cq_poll_lock);
	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

	spin_lock(&nvmeq->cq_poll_lock);
	found = nvme_poll_cq(nvmeq, iob);
	spin_unlock(&nvmeq->cq_poll_lock);

	return found;
}

static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	struct nvme_queue *nvmeq = &dev->queues[0];
	struct nvme_command c = { };

	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;

	spin_lock(&nvmeq->sq_lock);
	nvme_sq_copy_cmd(nvmeq, &c);
	nvme_write_sq_db(nvmeq, true);
	spin_unlock(&nvmeq->sq_lock);
}

static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	int ret = 0;

	/*
	 * Taking the shutdown_lock ensures the BAR mapping is not being
	 * altered by reset_work. Holding this lock before the RESETTING state
	 * change, if successful, also ensures nvme_remove won't be able to
	 * proceed to iounmap until we're done.
	 */
	mutex_lock(&dev->shutdown_lock);
	if (!dev->bar_mapped_size) {
		ret = -ENODEV;
		goto unlock;
	}

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
		ret = -EBUSY;
		goto unlock;
	}

	writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
	nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE);

	/*
	 * Read controller status to flush the previous write and trigger a
	 * pcie read error.
	 */
	readl(dev->bar + NVME_REG_CSTS);
unlock:
	mutex_unlock(&dev->shutdown_lock);
	return ret;
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c = { };

	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command c = { };
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq)
{
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command c = { };
	int flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zero, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;

	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
	return RQ_END_IO_NONE;
}

static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (nvme_ctrl_state(&dev->ctrl)) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	default:
		break;
	}

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);

	if (csts != ~0)
		return;

	dev_warn(dev->ctrl.device,
		 "Does your device have a faulty power saving mode enabled?\n");
	dev_warn(dev->ctrl.device,
		 "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n");
}

static enum blk_eh_timer_return nvme_timeout(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_command cmd = { };
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	u32 csts = readl(dev->bar + NVME_REG_CSTS);
	u8 opcode;

	/*
	 * Shutdown the device immediately if we see it is disconnected. This
	 * unblocks PCIe error handling if the nvme driver is waiting in
	 * error_resume for a device that has been removed. We can't unbind the
	 * driver while the driver's error callback is waiting to complete, so
	 * we're relying on a timeout to break that deadlock if a removal
	 * occurs while reset work is running.
	 */
	if (pci_dev_is_disconnected(pdev))
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	if (nvme_state_terminal(&dev->ctrl))
		goto disable;

	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(pdev))
		return BLK_EH_RESET_TIMER;

	/*
	 * Reset immediately if the controller has failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		goto disable;
	}

	/*
	 * Did we miss an interrupt?
	 */
	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
		nvme_poll(req->mq_hctx, NULL);
	else
		nvme_poll_irqdisable(nvmeq);

	if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
		dev_warn(dev->ctrl.device,
			 "I/O tag %d (%04x) QID %d timeout, completion polled\n",
			 req->tag, nvme_cid(req), nvmeq->qid);
		return BLK_EH_DONE;
	}

	/*
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (nvme_ctrl_state(&dev->ctrl)) {
	case NVME_CTRL_CONNECTING:
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
		fallthrough;
	case NVME_CTRL_DELETING:
		dev_warn_ratelimited(dev->ctrl.device,
			 "I/O tag %d (%04x) QID %d timeout, disable controller\n",
			 req->tag, nvme_cid(req), nvmeq->qid);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		nvme_dev_disable(dev, true);
		return BLK_EH_DONE;
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
	default:
		break;
	}

	/*
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
	 */
	opcode = nvme_req(req)->cmd->common.opcode;
	if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
		dev_warn(dev->ctrl.device,
			 "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
			 req->tag, nvme_cid(req), opcode,
			 nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		goto disable;
	}

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	iod->flags |= IOD_ABORTED;

	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = nvme_cid(req);
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->dev->ctrl.device,
		"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
		req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
		nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
		blk_rq_bytes(req));

	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
					 BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	nvme_init_request(abort_req, &cmd);

	abort_req->end_io = abort_endio;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req, false);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;

disable:
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
		if (nvme_state_terminal(&dev->ctrl))
			nvme_dev_disable(dev, true);
		return BLK_EH_DONE;
	}

	nvme_dev_disable(dev, false);
	if (nvme_try_sched_reset(&dev->ctrl))
		nvme_unquiesce_io_queues(&dev->ctrl);
	return BLK_EH_DONE;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (!nvmeq->sq_cmds)
		return;

	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
	} else {
		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	}
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
		nvme_free_queue(&dev->queues[i]);
	}
}

static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
		return;

	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
	mb();

	nvmeq->dev->online_queues--;
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		nvme_quiesce_admin_queue(&nvmeq->dev->ctrl);
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
}

static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(dev, i);
}

/*
 * Called only on a device that has been disabled and after all other threads
 * that can check this device's completion queues have synced, except
 * nvme_poll(). This is the last chance for the driver to see a natural
 * completion before nvme_cancel_request() terminates all incomplete requests.
 */
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
		spin_lock(&dev->queues[i].cq_poll_lock);
		nvme_poll_cq(&dev->queues[i], NULL);
		spin_unlock(&dev->queues[i].cq_poll_lock);
	}
}

static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  NVME_CTRL_PAGE_SIZE);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);

		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
		if (nvmeq->sq_cmds) {
			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
							nvmeq->sq_cmds);
			if (nvmeq->sq_dma_addr) {
				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
				return 0;
			}

			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
		}
	}

	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
				&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
	return 0;
}

static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid)
		return 0;

	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
	nvmeq->q_depth = depth;
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
		goto free_cqdma;

	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->sq_lock);
	spin_lock_init(&nvmeq->cq_poll_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->qid = qid;
	dev->ctrl.queue_count++;

	return 0;

free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
free_nvmeq:
	return -ENOMEM;
}

static int queue_request_irq(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	nvmeq->sq_tail = 0;
	nvmeq->last_sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}

/*
 * Try getting shutdown_lock while setting up IO queues.
 */
static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
{
	/*
	 * Give up if the lock is being held by nvme_dev_disable.
	 */
	if (!mutex_trylock(&dev->shutdown_lock))
		return -ENODEV;

	/*
	 * Controller is in wrong state, fail early.
	 */
	if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) {
		mutex_unlock(&dev->shutdown_lock);
		return -ENODEV;
	}

	return 0;
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
	u16 vector = 0;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
0 : qid; 1948 else 1949 set_bit(NVMEQ_POLLED, &nvmeq->flags); 1950 1951 result = adapter_alloc_cq(dev, qid, nvmeq, vector); 1952 if (result) 1953 return result; 1954 1955 result = adapter_alloc_sq(dev, qid, nvmeq); 1956 if (result < 0) 1957 return result; 1958 if (result) 1959 goto release_cq; 1960 1961 nvmeq->cq_vector = vector; 1962 1963 result = nvme_setup_io_queues_trylock(dev); 1964 if (result) 1965 return result; 1966 nvme_init_queue(nvmeq, qid); 1967 if (!polled) { 1968 result = queue_request_irq(nvmeq); 1969 if (result < 0) 1970 goto release_sq; 1971 } 1972 1973 set_bit(NVMEQ_ENABLED, &nvmeq->flags); 1974 mutex_unlock(&dev->shutdown_lock); 1975 return result; 1976 1977 release_sq: 1978 dev->online_queues--; 1979 mutex_unlock(&dev->shutdown_lock); 1980 adapter_delete_sq(dev, qid); 1981 release_cq: 1982 adapter_delete_cq(dev, qid); 1983 return result; 1984 } 1985 1986 static const struct blk_mq_ops nvme_mq_admin_ops = { 1987 .queue_rq = nvme_queue_rq, 1988 .complete = nvme_pci_complete_rq, 1989 .init_hctx = nvme_admin_init_hctx, 1990 .init_request = nvme_pci_init_request, 1991 .timeout = nvme_timeout, 1992 }; 1993 1994 static const struct blk_mq_ops nvme_mq_ops = { 1995 .queue_rq = nvme_queue_rq, 1996 .queue_rqs = nvme_queue_rqs, 1997 .complete = nvme_pci_complete_rq, 1998 .commit_rqs = nvme_commit_rqs, 1999 .init_hctx = nvme_init_hctx, 2000 .init_request = nvme_pci_init_request, 2001 .map_queues = nvme_pci_map_queues, 2002 .timeout = nvme_timeout, 2003 .poll = nvme_poll, 2004 }; 2005 2006 static void nvme_dev_remove_admin(struct nvme_dev *dev) 2007 { 2008 if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { 2009 /* 2010 * If the controller was reset during removal, it's possible 2011 * user requests may be waiting on a stopped queue. Start the 2012 * queue to flush these to completion. 2013 */ 2014 nvme_unquiesce_admin_queue(&dev->ctrl); 2015 nvme_remove_admin_tag_set(&dev->ctrl); 2016 } 2017 } 2018 2019 static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 2020 { 2021 return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); 2022 } 2023 2024 static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) 2025 { 2026 struct pci_dev *pdev = to_pci_dev(dev->dev); 2027 2028 if (size <= dev->bar_mapped_size) 2029 return 0; 2030 if (size > pci_resource_len(pdev, 0)) 2031 return -ENOMEM; 2032 if (dev->bar) 2033 iounmap(dev->bar); 2034 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 2035 if (!dev->bar) { 2036 dev->bar_mapped_size = 0; 2037 return -ENOMEM; 2038 } 2039 dev->bar_mapped_size = size; 2040 dev->dbs = dev->bar + NVME_REG_DBS; 2041 2042 return 0; 2043 } 2044 2045 static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) 2046 { 2047 int result; 2048 u32 aqa; 2049 struct nvme_queue *nvmeq; 2050 2051 result = nvme_remap_bar(dev, db_bar_size(dev, 0)); 2052 if (result < 0) 2053 return result; 2054 2055 dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? 2056 NVME_CAP_NSSRC(dev->ctrl.cap) : 0; 2057 2058 if (dev->subsystem && 2059 (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) 2060 writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); 2061 2062 /* 2063 * If the device has been passed off to us in an enabled state, just 2064 * clear the enabled bit. The spec says we should set the 'shutdown 2065 * notification bits', but doing so may cause the device to complete 2066 * commands to the admin queue ... and we don't know what memory that 2067 * might be pointing at! 
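 * That is why the controller is disabled below via nvme_disable_ctrl() with shutdown=false, i.e. CC.EN is cleared without the shutdown notification.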
2068 */ 2069 result = nvme_disable_ctrl(&dev->ctrl, false); 2070 if (result < 0) 2071 return result; 2072 2073 result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); 2074 if (result) 2075 return result; 2076 2077 dev->ctrl.numa_node = dev_to_node(dev->dev); 2078 2079 nvmeq = &dev->queues[0]; 2080 aqa = nvmeq->q_depth - 1; 2081 aqa |= aqa << 16; 2082 2083 writel(aqa, dev->bar + NVME_REG_AQA); 2084 lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); 2085 lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); 2086 2087 result = nvme_enable_ctrl(&dev->ctrl); 2088 if (result) 2089 return result; 2090 2091 nvmeq->cq_vector = 0; 2092 nvme_init_queue(nvmeq, 0); 2093 result = queue_request_irq(nvmeq); 2094 if (result) { 2095 dev->online_queues--; 2096 return result; 2097 } 2098 2099 set_bit(NVMEQ_ENABLED, &nvmeq->flags); 2100 return result; 2101 } 2102 2103 static int nvme_create_io_queues(struct nvme_dev *dev) 2104 { 2105 unsigned i, max, rw_queues; 2106 int ret = 0; 2107 2108 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { 2109 if (nvme_alloc_queue(dev, i, dev->q_depth)) { 2110 ret = -ENOMEM; 2111 break; 2112 } 2113 } 2114 2115 max = min(dev->max_qid, dev->ctrl.queue_count - 1); 2116 if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { 2117 rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + 2118 dev->io_queues[HCTX_TYPE_READ]; 2119 } else { 2120 rw_queues = max; 2121 } 2122 2123 for (i = dev->online_queues; i <= max; i++) { 2124 bool polled = i > rw_queues; 2125 2126 ret = nvme_create_queue(&dev->queues[i], i, polled); 2127 if (ret) 2128 break; 2129 } 2130 2131 /* 2132 * Ignore failing Create SQ/CQ commands; we can continue with fewer 2133 * than the desired number of queues, and even a controller without 2134 * I/O queues can still be used to issue admin commands. This might 2135 * be useful to upgrade buggy firmware, for example. 2136 */ 2137 return ret >= 0 ? 0 : ret; 2138 } 2139 2140 static u64 nvme_cmb_size_unit(struct nvme_dev *dev) 2141 { 2142 u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; 2143 2144 return 1ULL << (12 + 4 * szu); 2145 } 2146 2147 static u32 nvme_cmb_size(struct nvme_dev *dev) 2148 { 2149 return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; 2150 } 2151 2152 static void nvme_map_cmb(struct nvme_dev *dev) 2153 { 2154 u64 size, offset; 2155 resource_size_t bar_size; 2156 struct pci_dev *pdev = to_pci_dev(dev->dev); 2157 int bar; 2158 2159 if (dev->cmb_size) 2160 return; 2161 2162 if (NVME_CAP_CMBS(dev->ctrl.cap)) 2163 writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC); 2164 2165 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); 2166 if (!dev->cmbsz) 2167 return; 2168 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); 2169 2170 size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); 2171 offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); 2172 bar = NVME_CMB_BIR(dev->cmbloc); 2173 bar_size = pci_resource_len(pdev, bar); 2174 2175 if (offset > bar_size) 2176 return; 2177 2178 /* 2179 * Controllers may support a CMB size larger than their BAR, for 2180 * example, due to being behind a bridge.
Reduce the CMB to the 2181 * reported size of the BAR 2182 */ 2183 size = min(size, bar_size - offset); 2184 2185 if (!IS_ALIGNED(size, memremap_compat_align()) || 2186 !IS_ALIGNED(pci_resource_start(pdev, bar), 2187 memremap_compat_align())) 2188 return; 2189 2190 /* 2191 * Tell the controller about the host side address mapping the CMB, 2192 * and enable CMB decoding for the NVMe 1.4+ scheme: 2193 */ 2194 if (NVME_CAP_CMBS(dev->ctrl.cap)) { 2195 hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE | 2196 (pci_bus_address(pdev, bar) + offset), 2197 dev->bar + NVME_REG_CMBMSC); 2198 } 2199 2200 if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { 2201 dev_warn(dev->ctrl.device, 2202 "failed to register the CMB\n"); 2203 hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC); 2204 return; 2205 } 2206 2207 dev->cmb_size = size; 2208 dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS); 2209 2210 if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == 2211 (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) 2212 pci_p2pmem_publish(pdev, true); 2213 2214 nvme_update_attrs(dev); 2215 } 2216 2217 static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) 2218 { 2219 u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; 2220 u64 dma_addr = dev->host_mem_descs_dma; 2221 struct nvme_command c = { }; 2222 int ret; 2223 2224 c.features.opcode = nvme_admin_set_features; 2225 c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); 2226 c.features.dword11 = cpu_to_le32(bits); 2227 c.features.dword12 = cpu_to_le32(host_mem_size); 2228 c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); 2229 c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); 2230 c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); 2231 2232 ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 2233 if (ret) { 2234 dev_warn(dev->ctrl.device, 2235 "failed to set host mem (err %d, flags %#x).\n", 2236 ret, bits); 2237 } else 2238 dev->hmb = bits & NVME_HOST_MEM_ENABLE; 2239 2240 return ret; 2241 } 2242 2243 static void nvme_free_host_mem_multi(struct nvme_dev *dev) 2244 { 2245 int i; 2246 2247 for (i = 0; i < dev->nr_host_mem_descs; i++) { 2248 struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; 2249 size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; 2250 2251 dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], 2252 le64_to_cpu(desc->addr), 2253 DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); 2254 } 2255 2256 kfree(dev->host_mem_desc_bufs); 2257 dev->host_mem_desc_bufs = NULL; 2258 } 2259 2260 static void nvme_free_host_mem(struct nvme_dev *dev) 2261 { 2262 if (dev->hmb_sgt) 2263 dma_free_noncontiguous(dev->dev, dev->host_mem_size, 2264 dev->hmb_sgt, DMA_BIDIRECTIONAL); 2265 else 2266 nvme_free_host_mem_multi(dev); 2267 2268 dma_free_coherent(dev->dev, dev->host_mem_descs_size, 2269 dev->host_mem_descs, dev->host_mem_descs_dma); 2270 dev->host_mem_descs = NULL; 2271 dev->host_mem_descs_size = 0; 2272 dev->nr_host_mem_descs = 0; 2273 } 2274 2275 static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size) 2276 { 2277 dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size, 2278 DMA_BIDIRECTIONAL, GFP_KERNEL, 0); 2279 if (!dev->hmb_sgt) 2280 return -ENOMEM; 2281 2282 dev->host_mem_descs = dma_alloc_coherent(dev->dev, 2283 sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma, 2284 GFP_KERNEL); 2285 if (!dev->host_mem_descs) { 2286 dma_free_noncontiguous(dev->dev, size, dev->hmb_sgt, 2287 DMA_BIDIRECTIONAL); 2288 dev->hmb_sgt = NULL; 2289 return -ENOMEM; 2290 } 2291 dev->host_mem_size = 
size; 2292 dev->host_mem_descs_size = sizeof(*dev->host_mem_descs); 2293 dev->nr_host_mem_descs = 1; 2294 2295 dev->host_mem_descs[0].addr = 2296 cpu_to_le64(dev->hmb_sgt->sgl->dma_address); 2297 dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE); 2298 return 0; 2299 } 2300 2301 static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred, 2302 u32 chunk_size) 2303 { 2304 struct nvme_host_mem_buf_desc *descs; 2305 u32 max_entries, len, descs_size; 2306 dma_addr_t descs_dma; 2307 int i = 0; 2308 void **bufs; 2309 u64 size, tmp; 2310 2311 tmp = (preferred + chunk_size - 1); 2312 do_div(tmp, chunk_size); 2313 max_entries = tmp; 2314 2315 if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) 2316 max_entries = dev->ctrl.hmmaxd; 2317 2318 descs_size = max_entries * sizeof(*descs); 2319 descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma, 2320 GFP_KERNEL); 2321 if (!descs) 2322 goto out; 2323 2324 bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); 2325 if (!bufs) 2326 goto out_free_descs; 2327 2328 for (size = 0; size < preferred && i < max_entries; size += len) { 2329 dma_addr_t dma_addr; 2330 2331 len = min_t(u64, chunk_size, preferred - size); 2332 bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, 2333 DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); 2334 if (!bufs[i]) 2335 break; 2336 2337 descs[i].addr = cpu_to_le64(dma_addr); 2338 descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); 2339 i++; 2340 } 2341 2342 if (!size) 2343 goto out_free_bufs; 2344 2345 dev->nr_host_mem_descs = i; 2346 dev->host_mem_size = size; 2347 dev->host_mem_descs = descs; 2348 dev->host_mem_descs_dma = descs_dma; 2349 dev->host_mem_descs_size = descs_size; 2350 dev->host_mem_desc_bufs = bufs; 2351 return 0; 2352 2353 out_free_bufs: 2354 kfree(bufs); 2355 out_free_descs: 2356 dma_free_coherent(dev->dev, descs_size, descs, descs_dma); 2357 out: 2358 dev->host_mem_descs = NULL; 2359 return -ENOMEM; 2360 } 2361 2362 static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) 2363 { 2364 unsigned long dma_merge_boundary = dma_get_merge_boundary(dev->dev); 2365 u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); 2366 u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); 2367 u64 chunk_size; 2368 2369 /* 2370 * If there is an IOMMU that can merge pages, try a virtually 2371 * non-contiguous allocation for a single segment first. 
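 * dma_get_merge_boundary() returns 0 when the DMA API cannot merge segments, so the single-segment path below is only attempted behind a capable IOMMU.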
2372 */ 2373 if (dma_merge_boundary && (PAGE_SIZE & dma_merge_boundary) == 0) { 2374 if (!nvme_alloc_host_mem_single(dev, preferred)) 2375 return 0; 2376 } 2377 2378 /* start big and work our way down */ 2379 for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { 2380 if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) { 2381 if (!min || dev->host_mem_size >= min) 2382 return 0; 2383 nvme_free_host_mem(dev); 2384 } 2385 } 2386 2387 return -ENOMEM; 2388 } 2389 2390 static int nvme_setup_host_mem(struct nvme_dev *dev) 2391 { 2392 u64 max = (u64)max_host_mem_size_mb * SZ_1M; 2393 u64 preferred = (u64)dev->ctrl.hmpre * 4096; 2394 u64 min = (u64)dev->ctrl.hmmin * 4096; 2395 u32 enable_bits = NVME_HOST_MEM_ENABLE; 2396 int ret; 2397 2398 if (!dev->ctrl.hmpre) 2399 return 0; 2400 2401 preferred = min(preferred, max); 2402 if (min > max) { 2403 dev_warn(dev->ctrl.device, 2404 "min host memory (%lld MiB) above limit (%d MiB).\n", 2405 min >> ilog2(SZ_1M), max_host_mem_size_mb); 2406 nvme_free_host_mem(dev); 2407 return 0; 2408 } 2409 2410 /* 2411 * If we already have a buffer allocated check if we can reuse it. 2412 */ 2413 if (dev->host_mem_descs) { 2414 if (dev->host_mem_size >= min) 2415 enable_bits |= NVME_HOST_MEM_RETURN; 2416 else 2417 nvme_free_host_mem(dev); 2418 } 2419 2420 if (!dev->host_mem_descs) { 2421 if (nvme_alloc_host_mem(dev, min, preferred)) { 2422 dev_warn(dev->ctrl.device, 2423 "failed to allocate host memory buffer.\n"); 2424 return 0; /* controller must work without HMB */ 2425 } 2426 2427 dev_info(dev->ctrl.device, 2428 "allocated %lld MiB host memory buffer (%u segment%s).\n", 2429 dev->host_mem_size >> ilog2(SZ_1M), 2430 dev->nr_host_mem_descs, 2431 str_plural(dev->nr_host_mem_descs)); 2432 } 2433 2434 ret = nvme_set_host_mem(dev, enable_bits); 2435 if (ret) 2436 nvme_free_host_mem(dev); 2437 return ret; 2438 } 2439 2440 static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, 2441 char *buf) 2442 { 2443 struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); 2444 2445 return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", 2446 ndev->cmbloc, ndev->cmbsz); 2447 } 2448 static DEVICE_ATTR_RO(cmb); 2449 2450 static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr, 2451 char *buf) 2452 { 2453 struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); 2454 2455 return sysfs_emit(buf, "%u\n", ndev->cmbloc); 2456 } 2457 static DEVICE_ATTR_RO(cmbloc); 2458 2459 static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, 2460 char *buf) 2461 { 2462 struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); 2463 2464 return sysfs_emit(buf, "%u\n", ndev->cmbsz); 2465 } 2466 static DEVICE_ATTR_RO(cmbsz); 2467 2468 static ssize_t hmb_show(struct device *dev, struct device_attribute *attr, 2469 char *buf) 2470 { 2471 struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); 2472 2473 return sysfs_emit(buf, "%d\n", ndev->hmb); 2474 } 2475 2476 static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, 2477 const char *buf, size_t count) 2478 { 2479 struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); 2480 bool new; 2481 int ret; 2482 2483 if (kstrtobool(buf, &new) < 0) 2484 return -EINVAL; 2485 2486 if (new == ndev->hmb) 2487 return count; 2488 2489 if (new) { 2490 ret = nvme_setup_host_mem(ndev); 2491 } else { 2492 ret = nvme_set_host_mem(ndev, 0); 2493 if (!ret) 2494 nvme_free_host_mem(ndev); 2495 } 2496 2497 if (ret < 0) 2498 return ret; 2499 2500 return count; 2501 } 2502 
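/* The 'hmb' attribute lets user space enable or disable the host memory buffer at runtime. */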
static DEVICE_ATTR_RW(hmb); 2503 2504 static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, 2505 struct attribute *a, int n) 2506 { 2507 struct nvme_ctrl *ctrl = 2508 dev_get_drvdata(container_of(kobj, struct device, kobj)); 2509 struct nvme_dev *dev = to_nvme_dev(ctrl); 2510 2511 if (a == &dev_attr_cmb.attr || 2512 a == &dev_attr_cmbloc.attr || 2513 a == &dev_attr_cmbsz.attr) { 2514 if (!dev->cmbsz) 2515 return 0; 2516 } 2517 if (a == &dev_attr_hmb.attr && !ctrl->hmpre) 2518 return 0; 2519 2520 return a->mode; 2521 } 2522 2523 static struct attribute *nvme_pci_attrs[] = { 2524 &dev_attr_cmb.attr, 2525 &dev_attr_cmbloc.attr, 2526 &dev_attr_cmbsz.attr, 2527 &dev_attr_hmb.attr, 2528 NULL, 2529 }; 2530 2531 static const struct attribute_group nvme_pci_dev_attrs_group = { 2532 .attrs = nvme_pci_attrs, 2533 .is_visible = nvme_pci_attrs_are_visible, 2534 }; 2535 2536 static const struct attribute_group *nvme_pci_dev_attr_groups[] = { 2537 &nvme_dev_attrs_group, 2538 &nvme_pci_dev_attrs_group, 2539 NULL, 2540 }; 2541 2542 static void nvme_update_attrs(struct nvme_dev *dev) 2543 { 2544 sysfs_update_group(&dev->ctrl.device->kobj, &nvme_pci_dev_attrs_group); 2545 } 2546 2547 /* 2548 * nrirqs is the number of interrupts available for write and read 2549 * queues. The core already reserved an interrupt for the admin queue. 2550 */ 2551 static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) 2552 { 2553 struct nvme_dev *dev = affd->priv; 2554 unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; 2555 2556 /* 2557 * If there is no interrupt available for queues, ensure that 2558 * the default queue is set to 1. The affinity set size is 2559 * also set to one, but the irq core ignores it for this case. 2560 * 2561 * If only one interrupt is available or 'write_queues' == 0, combine 2562 * write and read queues. 2563 * 2564 * If 'write_queues' > 0, ensure it leaves room for at least one read 2565 * queue. 2566 */ 2567 if (!nrirqs) { 2568 nrirqs = 1; 2569 nr_read_queues = 0; 2570 } else if (nrirqs == 1 || !nr_write_queues) { 2571 nr_read_queues = 0; 2572 } else if (nr_write_queues >= nrirqs) { 2573 nr_read_queues = 1; 2574 } else { 2575 nr_read_queues = nrirqs - nr_write_queues; 2576 } 2577 2578 dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; 2579 affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; 2580 dev->io_queues[HCTX_TYPE_READ] = nr_read_queues; 2581 affd->set_size[HCTX_TYPE_READ] = nr_read_queues; 2582 affd->nr_sets = nr_read_queues ? 2 : 1; 2583 } 2584 2585 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) 2586 { 2587 struct pci_dev *pdev = to_pci_dev(dev->dev); 2588 struct irq_affinity affd = { 2589 .pre_vectors = 1, 2590 .calc_sets = nvme_calc_irq_sets, 2591 .priv = dev, 2592 }; 2593 unsigned int irq_queues, poll_queues; 2594 unsigned int flags = PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY; 2595 2596 /* 2597 * Poll queues don't need interrupts, but we need at least one I/O queue 2598 * left over for non-polled I/O. 2599 */ 2600 poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1); 2601 dev->io_queues[HCTX_TYPE_POLL] = poll_queues; 2602 2603 /* 2604 * Initialize for the single interrupt case; this will be updated in 2605 * nvme_calc_irq_sets(). 2606 */ 2607 dev->io_queues[HCTX_TYPE_DEFAULT] = 1; 2608 dev->io_queues[HCTX_TYPE_READ] = 0; 2609 2610 /* 2611 * We need interrupts for the admin queue and each non-polled I/O queue, 2612 * but some Apple controllers require all queues to use the first 2613 * vector.
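 * That case is handled by the NVME_QUIRK_SINGLE_VECTOR check below.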
2614 */ 2615 irq_queues = 1; 2616 if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) 2617 irq_queues += (nr_io_queues - poll_queues); 2618 if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI) 2619 flags &= ~PCI_IRQ_MSI; 2620 return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, flags, 2621 &affd); 2622 } 2623 2624 static unsigned int nvme_max_io_queues(struct nvme_dev *dev) 2625 { 2626 /* 2627 * If tags are shared with admin queue (Apple bug), then 2628 * make sure we only use one IO queue. 2629 */ 2630 if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) 2631 return 1; 2632 return blk_mq_num_possible_queues(0) + dev->nr_write_queues + 2633 dev->nr_poll_queues; 2634 } 2635 2636 static int nvme_setup_io_queues(struct nvme_dev *dev) 2637 { 2638 struct nvme_queue *adminq = &dev->queues[0]; 2639 struct pci_dev *pdev = to_pci_dev(dev->dev); 2640 unsigned int nr_io_queues; 2641 unsigned long size; 2642 int result; 2643 2644 /* 2645 * Sample the module parameters once at reset time so that we have 2646 * stable values to work with. 2647 */ 2648 dev->nr_write_queues = write_queues; 2649 dev->nr_poll_queues = poll_queues; 2650 2651 nr_io_queues = dev->nr_allocated_queues - 1; 2652 result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); 2653 if (result < 0) 2654 return result; 2655 2656 if (nr_io_queues == 0) 2657 return 0; 2658 2659 /* 2660 * Free IRQ resources as soon as the NVMEQ_ENABLED bit transitions 2661 * from set to unset. If there is a window before it is truly freed, 2662 * pci_free_irq_vectors() jumping into this window will crash. 2663 * Also take the lock to avoid racing with pci_free_irq_vectors() in 2664 * the nvme_dev_disable() path. 2665 */ 2666 result = nvme_setup_io_queues_trylock(dev); 2667 if (result) 2668 return result; 2669 if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) 2670 pci_free_irq(pdev, 0, adminq); 2671 2672 if (dev->cmb_use_sqes) { 2673 result = nvme_cmb_qdepth(dev, nr_io_queues, 2674 sizeof(struct nvme_command)); 2675 if (result > 0) { 2676 dev->q_depth = result; 2677 dev->ctrl.sqsize = result - 1; 2678 } else { 2679 dev->cmb_use_sqes = false; 2680 } 2681 } 2682 2683 do { 2684 size = db_bar_size(dev, nr_io_queues); 2685 result = nvme_remap_bar(dev, size); 2686 if (!result) 2687 break; 2688 if (!--nr_io_queues) { 2689 result = -ENOMEM; 2690 goto out_unlock; 2691 } 2692 } while (1); 2693 adminq->q_db = dev->dbs; 2694 2695 retry: 2696 /* Deregister the admin queue's interrupt */ 2697 if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) 2698 pci_free_irq(pdev, 0, adminq); 2699 2700 /* 2701 * If we enabled MSI-X early because INTx is not available, disable it 2702 * again before setting up the full range we need. 2703 */ 2704 pci_free_irq_vectors(pdev); 2705 2706 result = nvme_setup_irqs(dev, nr_io_queues); 2707 if (result <= 0) { 2708 result = -EIO; 2709 goto out_unlock; 2710 } 2711 2712 dev->num_vecs = result; 2713 result = max(result - 1, 1); 2714 dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; 2715 2716 /* 2717 * Should investigate if there's a performance win from allocating 2718 * more queues than interrupt vectors; it might allow the submission 2719 * path to scale better, even if the receive path is limited by the 2720 * number of interrupts.
2721 */ 2722 result = queue_request_irq(adminq); 2723 if (result) 2724 goto out_unlock; 2725 set_bit(NVMEQ_ENABLED, &adminq->flags); 2726 mutex_unlock(&dev->shutdown_lock); 2727 2728 result = nvme_create_io_queues(dev); 2729 if (result || dev->online_queues < 2) 2730 return result; 2731 2732 if (dev->online_queues - 1 < dev->max_qid) { 2733 nr_io_queues = dev->online_queues - 1; 2734 nvme_delete_io_queues(dev); 2735 result = nvme_setup_io_queues_trylock(dev); 2736 if (result) 2737 return result; 2738 nvme_suspend_io_queues(dev); 2739 goto retry; 2740 } 2741 dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", 2742 dev->io_queues[HCTX_TYPE_DEFAULT], 2743 dev->io_queues[HCTX_TYPE_READ], 2744 dev->io_queues[HCTX_TYPE_POLL]); 2745 return 0; 2746 out_unlock: 2747 mutex_unlock(&dev->shutdown_lock); 2748 return result; 2749 } 2750 2751 static enum rq_end_io_ret nvme_del_queue_end(struct request *req, 2752 blk_status_t error) 2753 { 2754 struct nvme_queue *nvmeq = req->end_io_data; 2755 2756 blk_mq_free_request(req); 2757 complete(&nvmeq->delete_done); 2758 return RQ_END_IO_NONE; 2759 } 2760 2761 static enum rq_end_io_ret nvme_del_cq_end(struct request *req, 2762 blk_status_t error) 2763 { 2764 struct nvme_queue *nvmeq = req->end_io_data; 2765 2766 if (error) 2767 set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); 2768 2769 return nvme_del_queue_end(req, error); 2770 } 2771 2772 static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) 2773 { 2774 struct request_queue *q = nvmeq->dev->ctrl.admin_q; 2775 struct request *req; 2776 struct nvme_command cmd = { }; 2777 2778 cmd.delete_queue.opcode = opcode; 2779 cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); 2780 2781 req = blk_mq_alloc_request(q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT); 2782 if (IS_ERR(req)) 2783 return PTR_ERR(req); 2784 nvme_init_request(req, &cmd); 2785 2786 if (opcode == nvme_admin_delete_cq) 2787 req->end_io = nvme_del_cq_end; 2788 else 2789 req->end_io = nvme_del_queue_end; 2790 req->end_io_data = nvmeq; 2791 2792 init_completion(&nvmeq->delete_done); 2793 blk_execute_rq_nowait(req, false); 2794 return 0; 2795 } 2796 2797 static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode) 2798 { 2799 int nr_queues = dev->online_queues - 1, sent = 0; 2800 unsigned long timeout; 2801 2802 retry: 2803 timeout = NVME_ADMIN_TIMEOUT; 2804 while (nr_queues > 0) { 2805 if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) 2806 break; 2807 nr_queues--; 2808 sent++; 2809 } 2810 while (sent) { 2811 struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent]; 2812 2813 timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, 2814 timeout); 2815 if (timeout == 0) 2816 return false; 2817 2818 sent--; 2819 if (nr_queues) 2820 goto retry; 2821 } 2822 return true; 2823 } 2824 2825 static void nvme_delete_io_queues(struct nvme_dev *dev) 2826 { 2827 if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq)) 2828 __nvme_delete_io_queues(dev, nvme_admin_delete_cq); 2829 } 2830 2831 static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev) 2832 { 2833 if (dev->io_queues[HCTX_TYPE_POLL]) 2834 return 3; 2835 if (dev->io_queues[HCTX_TYPE_READ]) 2836 return 2; 2837 return 1; 2838 } 2839 2840 static bool nvme_pci_update_nr_queues(struct nvme_dev *dev) 2841 { 2842 if (!dev->ctrl.tagset) { 2843 nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops, 2844 nvme_pci_nr_maps(dev), sizeof(struct nvme_iod)); 2845 return true; 2846 } 2847 2848 /* Give up if we are racing with nvme_dev_disable() */ 2849 if 
(!mutex_trylock(&dev->shutdown_lock)) 2850 return false; 2851 2852 /* Check if nvme_dev_disable() has been executed already */ 2853 if (!dev->online_queues) { 2854 mutex_unlock(&dev->shutdown_lock); 2855 return false; 2856 } 2857 2858 blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); 2859 /* free previously allocated queues that are no longer usable */ 2860 nvme_free_queues(dev, dev->online_queues); 2861 mutex_unlock(&dev->shutdown_lock); 2862 return true; 2863 } 2864 2865 static int nvme_pci_enable(struct nvme_dev *dev) 2866 { 2867 int result = -ENOMEM; 2868 struct pci_dev *pdev = to_pci_dev(dev->dev); 2869 unsigned int flags = PCI_IRQ_ALL_TYPES; 2870 2871 if (pci_enable_device_mem(pdev)) 2872 return result; 2873 2874 pci_set_master(pdev); 2875 2876 if (readl(dev->bar + NVME_REG_CSTS) == -1) { 2877 result = -ENODEV; 2878 goto disable; 2879 } 2880 2881 /* 2882 * Some devices and/or platforms don't advertise or work with INTx 2883 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll 2884 * adjust this later. 2885 */ 2886 if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI) 2887 flags &= ~PCI_IRQ_MSI; 2888 result = pci_alloc_irq_vectors(pdev, 1, 1, flags); 2889 if (result < 0) 2890 goto disable; 2891 2892 dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); 2893 2894 dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, 2895 io_queue_depth); 2896 dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); 2897 dev->dbs = dev->bar + 4096; 2898 2899 /* 2900 * Some Apple controllers require a non-standard SQE size. 2901 * Interestingly they also seem to ignore the CC:IOSQES register 2902 * so we don't bother updating it here. 2903 */ 2904 if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) 2905 dev->io_sqes = 7; 2906 else 2907 dev->io_sqes = NVME_NVM_IOSQES; 2908 2909 if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) { 2910 dev->q_depth = 2; 2911 } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && 2912 (pdev->device == 0xa821 || pdev->device == 0xa822) && 2913 NVME_CAP_MQES(dev->ctrl.cap) == 0) { 2914 dev->q_depth = 64; 2915 dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " 2916 "set queue depth=%u\n", dev->q_depth); 2917 } 2918 2919 /* 2920 * Controllers with the shared tags quirk need the IO queue to be 2921 * big enough so that we get 32 tags for the admin queue 2922 */ 2923 if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && 2924 (dev->q_depth < (NVME_AQ_DEPTH + 2))) { 2925 dev->q_depth = NVME_AQ_DEPTH + 2; 2926 dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", 2927 dev->q_depth); 2928 } 2929 dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ 2930 2931 nvme_map_cmb(dev); 2932 2933 pci_save_state(pdev); 2934 2935 result = nvme_pci_configure_admin_queue(dev); 2936 if (result) 2937 goto free_irq; 2938 return result; 2939 2940 free_irq: 2941 pci_free_irq_vectors(pdev); 2942 disable: 2943 pci_disable_device(pdev); 2944 return result; 2945 } 2946 2947 static void nvme_dev_unmap(struct nvme_dev *dev) 2948 { 2949 if (dev->bar) 2950 iounmap(dev->bar); 2951 pci_release_mem_regions(to_pci_dev(dev->dev)); 2952 } 2953 2954 static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev) 2955 { 2956 struct pci_dev *pdev = to_pci_dev(dev->dev); 2957 u32 csts; 2958 2959 if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev)) 2960 return true; 2961 if (pdev->error_state != pci_channel_io_normal) 2962 return true; 2963 2964 csts = readl(dev->bar + NVME_REG_CSTS); 2965 return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY); 2966 } 2967 2968 static void 
nvme_dev_disable(struct nvme_dev *dev, bool shutdown) 2969 { 2970 enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl); 2971 struct pci_dev *pdev = to_pci_dev(dev->dev); 2972 bool dead; 2973 2974 mutex_lock(&dev->shutdown_lock); 2975 dead = nvme_pci_ctrl_is_dead(dev); 2976 if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) { 2977 if (pci_is_enabled(pdev)) 2978 nvme_start_freeze(&dev->ctrl); 2979 /* 2980 * Give the controller a chance to complete all entered requests 2981 * if doing a safe shutdown. 2982 */ 2983 if (!dead && shutdown) 2984 nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); 2985 } 2986 2987 nvme_quiesce_io_queues(&dev->ctrl); 2988 2989 if (!dead && dev->ctrl.queue_count > 0) { 2990 nvme_delete_io_queues(dev); 2991 nvme_disable_ctrl(&dev->ctrl, shutdown); 2992 nvme_poll_irqdisable(&dev->queues[0]); 2993 } 2994 nvme_suspend_io_queues(dev); 2995 nvme_suspend_queue(dev, 0); 2996 pci_free_irq_vectors(pdev); 2997 if (pci_is_enabled(pdev)) 2998 pci_disable_device(pdev); 2999 nvme_reap_pending_cqes(dev); 3000 3001 nvme_cancel_tagset(&dev->ctrl); 3002 nvme_cancel_admin_tagset(&dev->ctrl); 3003 3004 /* 3005 * The driver will not be starting up queues again if shutting down so 3006 * must flush all entered requests to their failed completion to avoid 3007 * deadlocking blk-mq hot-cpu notifier. 3008 */ 3009 if (shutdown) { 3010 nvme_unquiesce_io_queues(&dev->ctrl); 3011 if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) 3012 nvme_unquiesce_admin_queue(&dev->ctrl); 3013 } 3014 mutex_unlock(&dev->shutdown_lock); 3015 } 3016 3017 static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) 3018 { 3019 if (!nvme_wait_reset(&dev->ctrl)) 3020 return -EBUSY; 3021 nvme_dev_disable(dev, shutdown); 3022 return 0; 3023 } 3024 3025 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) 3026 { 3027 size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); 3028 3029 dev->iod_meta_mempool = mempool_create_node(1, 3030 mempool_kmalloc, mempool_kfree, 3031 (void *)meta_size, GFP_KERNEL, 3032 dev_to_node(dev->dev)); 3033 if (!dev->iod_meta_mempool) 3034 return -ENOMEM; 3035 return 0; 3036 } 3037 3038 static void nvme_free_tagset(struct nvme_dev *dev) 3039 { 3040 if (dev->tagset.tags) 3041 nvme_remove_io_tag_set(&dev->ctrl); 3042 dev->ctrl.tagset = NULL; 3043 } 3044 3045 /* pairs with nvme_pci_alloc_dev */ 3046 static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) 3047 { 3048 struct nvme_dev *dev = to_nvme_dev(ctrl); 3049 3050 nvme_free_tagset(dev); 3051 put_device(dev->dev); 3052 kfree(dev->queues); 3053 kfree(dev); 3054 } 3055 3056 static void nvme_reset_work(struct work_struct *work) 3057 { 3058 struct nvme_dev *dev = 3059 container_of(work, struct nvme_dev, ctrl.reset_work); 3060 bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); 3061 int result; 3062 3063 if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) { 3064 dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", 3065 dev->ctrl.state); 3066 result = -ENODEV; 3067 goto out; 3068 } 3069 3070 /* 3071 * If we're called to reset a live controller first shut it down before 3072 * moving on. 
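 * nvme_dev_disable() also quiesces the I/O queues, so nothing new is dispatched while the controller is re-initialized.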
3073 */ 3074 if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) 3075 nvme_dev_disable(dev, false); 3076 nvme_sync_queues(&dev->ctrl); 3077 3078 mutex_lock(&dev->shutdown_lock); 3079 result = nvme_pci_enable(dev); 3080 if (result) 3081 goto out_unlock; 3082 nvme_unquiesce_admin_queue(&dev->ctrl); 3083 mutex_unlock(&dev->shutdown_lock); 3084 3085 /* 3086 * Use the CONNECTING state, introduced for the nvme-fc/rdma transports, 3087 * to mark the initialization procedure here. 3088 */ 3089 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { 3090 dev_warn(dev->ctrl.device, 3091 "failed to mark controller CONNECTING\n"); 3092 result = -EBUSY; 3093 goto out; 3094 } 3095 3096 result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); 3097 if (result) 3098 goto out; 3099 3100 if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 3101 dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; 3102 else 3103 dev->ctrl.max_integrity_segments = 1; 3104 3105 nvme_dbbuf_dma_alloc(dev); 3106 3107 result = nvme_setup_host_mem(dev); 3108 if (result < 0) 3109 goto out; 3110 3111 result = nvme_setup_io_queues(dev); 3112 if (result) 3113 goto out; 3114 3115 /* 3116 * Freeze and update the number of I/O queues as those might have 3117 * changed. If there are no I/O queues left after this reset, keep the 3118 * controller around but remove all namespaces. 3119 */ 3120 if (dev->online_queues > 1) { 3121 nvme_dbbuf_set(dev); 3122 nvme_unquiesce_io_queues(&dev->ctrl); 3123 nvme_wait_freeze(&dev->ctrl); 3124 if (!nvme_pci_update_nr_queues(dev)) 3125 goto out; 3126 nvme_unfreeze(&dev->ctrl); 3127 } else { 3128 dev_warn(dev->ctrl.device, "IO queues lost\n"); 3129 nvme_mark_namespaces_dead(&dev->ctrl); 3130 nvme_unquiesce_io_queues(&dev->ctrl); 3131 nvme_remove_namespaces(&dev->ctrl); 3132 nvme_free_tagset(dev); 3133 } 3134 3135 /* 3136 * If only the admin queue is live, keep it for further investigation 3137 * or recovery. 3138 */ 3139 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { 3140 dev_warn(dev->ctrl.device, 3141 "failed to mark controller live state\n"); 3142 result = -ENODEV; 3143 goto out; 3144 } 3145 3146 nvme_start_ctrl(&dev->ctrl); 3147 return; 3148 3149 out_unlock: 3150 mutex_unlock(&dev->shutdown_lock); 3151 out: 3152 /* 3153 * Set state to deleting now to avoid blocking nvme_wait_reset(), which 3154 * may be holding this pci_dev's device lock.
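 * The error path below then disables the device, marks the namespaces dead, and finally moves the controller to NVME_CTRL_DEAD.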
3155 */ 3156 dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n", 3157 result); 3158 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); 3159 nvme_dev_disable(dev, true); 3160 nvme_sync_queues(&dev->ctrl); 3161 nvme_mark_namespaces_dead(&dev->ctrl); 3162 nvme_unquiesce_io_queues(&dev->ctrl); 3163 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); 3164 } 3165 3166 static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) 3167 { 3168 *val = readl(to_nvme_dev(ctrl)->bar + off); 3169 return 0; 3170 } 3171 3172 static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) 3173 { 3174 writel(val, to_nvme_dev(ctrl)->bar + off); 3175 return 0; 3176 } 3177 3178 static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) 3179 { 3180 *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off); 3181 return 0; 3182 } 3183 3184 static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size) 3185 { 3186 struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); 3187 3188 return snprintf(buf, size, "%s\n", dev_name(&pdev->dev)); 3189 } 3190 3191 static void nvme_pci_print_device_info(struct nvme_ctrl *ctrl) 3192 { 3193 struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); 3194 struct nvme_subsystem *subsys = ctrl->subsys; 3195 3196 dev_err(ctrl->device, 3197 "VID:DID %04x:%04x model:%.*s firmware:%.*s\n", 3198 pdev->vendor, pdev->device, 3199 nvme_strlen(subsys->model, sizeof(subsys->model)), 3200 subsys->model, nvme_strlen(subsys->firmware_rev, 3201 sizeof(subsys->firmware_rev)), 3202 subsys->firmware_rev); 3203 } 3204 3205 static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl) 3206 { 3207 struct nvme_dev *dev = to_nvme_dev(ctrl); 3208 3209 return dma_pci_p2pdma_supported(dev->dev); 3210 } 3211 3212 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { 3213 .name = "pcie", 3214 .module = THIS_MODULE, 3215 .flags = NVME_F_METADATA_SUPPORTED, 3216 .dev_attr_groups = nvme_pci_dev_attr_groups, 3217 .reg_read32 = nvme_pci_reg_read32, 3218 .reg_write32 = nvme_pci_reg_write32, 3219 .reg_read64 = nvme_pci_reg_read64, 3220 .free_ctrl = nvme_pci_free_ctrl, 3221 .submit_async_event = nvme_pci_submit_async_event, 3222 .subsystem_reset = nvme_pci_subsystem_reset, 3223 .get_address = nvme_pci_get_address, 3224 .print_device_info = nvme_pci_print_device_info, 3225 .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma, 3226 }; 3227 3228 static int nvme_dev_map(struct nvme_dev *dev) 3229 { 3230 struct pci_dev *pdev = to_pci_dev(dev->dev); 3231 3232 if (pci_request_mem_regions(pdev, "nvme")) 3233 return -ENODEV; 3234 3235 if (nvme_remap_bar(dev, NVME_REG_DBS + 4096)) 3236 goto release; 3237 3238 return 0; 3239 release: 3240 pci_release_mem_regions(pdev); 3241 return -ENODEV; 3242 } 3243 3244 static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) 3245 { 3246 if (pdev->vendor == 0x144d && pdev->device == 0xa802) { 3247 /* 3248 * Several Samsung devices seem to drop off the PCIe bus 3249 * randomly when APST is on and uses the deepest sleep state. 3250 * This has been observed on a Samsung "SM951 NVMe SAMSUNG 3251 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD 3252 * 950 PRO 256GB", but it seems to be restricted to two Dell 3253 * laptops. 
3254 */ 3255 if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") && 3256 (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") || 3257 dmi_match(DMI_PRODUCT_NAME, "Precision 5510"))) 3258 return NVME_QUIRK_NO_DEEPEST_PS; 3259 } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) { 3260 /* 3261 * Samsung SSD 960 EVO drops off the PCIe bus after system 3262 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as 3263 * within a few minutes after bootup on a Coffee Lake board, 3264 * the ASUS PRIME Z370-A. 3265 */ 3266 if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") && 3267 (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") || 3268 dmi_match(DMI_BOARD_NAME, "PRIME Z370-A"))) 3269 return NVME_QUIRK_NO_APST; 3270 } else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 || 3271 pdev->device == 0xa808 || pdev->device == 0xa809)) || 3272 (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) { 3273 /* 3274 * Force host managed NVMe power settings for lowest idle power 3275 * with quick resume latency on Samsung and Toshiba SSDs, based 3276 * on the suspend behavior observed on a Coffee Lake board in 3277 * the LENOVO C640. 3278 */ 3279 if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) && 3280 dmi_match(DMI_BOARD_NAME, "LNVNB161216")) 3281 return NVME_QUIRK_SIMPLE_SUSPEND; 3282 } else if (pdev->vendor == 0x2646 && (pdev->device == 0x2263 || 3283 pdev->device == 0x500f)) { 3284 /* 3285 * Exclude some Kingston NV1 and A2000 devices from 3286 * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a 3287 * lot of energy with s2idle sleep on some TUXEDO platforms. 3288 */ 3289 if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") || 3290 dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") || 3291 dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") || 3292 dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1")) 3293 return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; 3294 } else if (pdev->vendor == 0x144d && pdev->device == 0xa80d) { 3295 /* 3296 * Exclude Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND 3297 * because of high power consumption (> 2 Watt) in s2idle 3298 * sleep. Only some boards with an Intel CPU are affected. 3299 */ 3300 if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") || 3301 dmi_match(DMI_BOARD_NAME, "GMxPXxx") || 3302 dmi_match(DMI_BOARD_NAME, "GXxMRXx") || 3303 dmi_match(DMI_BOARD_NAME, "PH4PG31") || 3304 dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") || 3305 dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) 3306 return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; 3307 } 3308 3309 /* 3310 * NVMe SSD drops off the PCIe bus after system idle 3311 * for 10 hours on a Lenovo N60z board.
3312 */ 3313 if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6")) 3314 return NVME_QUIRK_NO_APST; 3315 3316 return 0; 3317 } 3318 3319 static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, 3320 const struct pci_device_id *id) 3321 { 3322 unsigned long quirks = id->driver_data; 3323 int node = dev_to_node(&pdev->dev); 3324 struct nvme_dev *dev; 3325 int ret = -ENOMEM; 3326 3327 dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids), 3328 GFP_KERNEL, node); 3329 if (!dev) 3330 return ERR_PTR(-ENOMEM); 3331 INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); 3332 mutex_init(&dev->shutdown_lock); 3333 3334 dev->nr_write_queues = write_queues; 3335 dev->nr_poll_queues = poll_queues; 3336 dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1; 3337 dev->queues = kcalloc_node(dev->nr_allocated_queues, 3338 sizeof(struct nvme_queue), GFP_KERNEL, node); 3339 if (!dev->queues) 3340 goto out_free_dev; 3341 3342 dev->dev = get_device(&pdev->dev); 3343 3344 quirks |= check_vendor_combination_bug(pdev); 3345 if (!noacpi && 3346 !(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) && 3347 acpi_storage_d3(&pdev->dev)) { 3348 /* 3349 * Some systems use a bios work around to ask for D3 on 3350 * platforms that support kernel managed suspend. 3351 */ 3352 dev_info(&pdev->dev, 3353 "platform quirk: setting simple suspend\n"); 3354 quirks |= NVME_QUIRK_SIMPLE_SUSPEND; 3355 } 3356 ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, 3357 quirks); 3358 if (ret) 3359 goto out_put_device; 3360 3361 if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) 3362 dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48)); 3363 else 3364 dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); 3365 dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); 3366 dma_set_max_seg_size(&pdev->dev, 0xffffffff); 3367 3368 /* 3369 * Limit the max command size to prevent iod->sg allocations going 3370 * over a single page. 3371 */ 3372 dev->ctrl.max_hw_sectors = min_t(u32, 3373 NVME_MAX_BYTES >> SECTOR_SHIFT, 3374 dma_opt_mapping_size(&pdev->dev) >> 9); 3375 dev->ctrl.max_segments = NVME_MAX_SEGS; 3376 dev->ctrl.max_integrity_segments = 1; 3377 return dev; 3378 3379 out_put_device: 3380 put_device(dev->dev); 3381 kfree(dev->queues); 3382 out_free_dev: 3383 kfree(dev); 3384 return ERR_PTR(ret); 3385 } 3386 3387 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 3388 { 3389 struct nvme_dev *dev; 3390 int result = -ENOMEM; 3391 3392 dev = nvme_pci_alloc_dev(pdev, id); 3393 if (IS_ERR(dev)) 3394 return PTR_ERR(dev); 3395 3396 result = nvme_add_ctrl(&dev->ctrl); 3397 if (result) 3398 goto out_put_ctrl; 3399 3400 result = nvme_dev_map(dev); 3401 if (result) 3402 goto out_uninit_ctrl; 3403 3404 result = nvme_pci_alloc_iod_mempool(dev); 3405 if (result) 3406 goto out_dev_unmap; 3407 3408 dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); 3409 3410 result = nvme_pci_enable(dev); 3411 if (result) 3412 goto out_release_iod_mempool; 3413 3414 result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset, 3415 &nvme_mq_admin_ops, sizeof(struct nvme_iod)); 3416 if (result) 3417 goto out_disable; 3418 3419 /* 3420 * Mark the controller as connecting before sending admin commands to 3421 * allow the timeout handler to do the right thing. 
3422 */ 3423 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { 3424 dev_warn(dev->ctrl.device, 3425 "failed to mark controller CONNECTING\n"); 3426 result = -EBUSY; 3427 goto out_disable; 3428 } 3429 3430 result = nvme_init_ctrl_finish(&dev->ctrl, false); 3431 if (result) 3432 goto out_disable; 3433 3434 if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 3435 dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; 3436 else 3437 dev->ctrl.max_integrity_segments = 1; 3438 3439 nvme_dbbuf_dma_alloc(dev); 3440 3441 result = nvme_setup_host_mem(dev); 3442 if (result < 0) 3443 goto out_disable; 3444 3445 result = nvme_setup_io_queues(dev); 3446 if (result) 3447 goto out_disable; 3448 3449 if (dev->online_queues > 1) { 3450 nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops, 3451 nvme_pci_nr_maps(dev), sizeof(struct nvme_iod)); 3452 nvme_dbbuf_set(dev); 3453 } 3454 3455 if (!dev->ctrl.tagset) 3456 dev_warn(dev->ctrl.device, "IO queues not created\n"); 3457 3458 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { 3459 dev_warn(dev->ctrl.device, 3460 "failed to mark controller live state\n"); 3461 result = -ENODEV; 3462 goto out_disable; 3463 } 3464 3465 pci_set_drvdata(pdev, dev); 3466 3467 nvme_start_ctrl(&dev->ctrl); 3468 nvme_put_ctrl(&dev->ctrl); 3469 flush_work(&dev->ctrl.scan_work); 3470 return 0; 3471 3472 out_disable: 3473 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); 3474 nvme_dev_disable(dev, true); 3475 nvme_free_host_mem(dev); 3476 nvme_dev_remove_admin(dev); 3477 nvme_dbbuf_dma_free(dev); 3478 nvme_free_queues(dev, 0); 3479 out_release_iod_mempool: 3480 mempool_destroy(dev->iod_meta_mempool); 3481 out_dev_unmap: 3482 nvme_dev_unmap(dev); 3483 out_uninit_ctrl: 3484 nvme_uninit_ctrl(&dev->ctrl); 3485 out_put_ctrl: 3486 nvme_put_ctrl(&dev->ctrl); 3487 return result; 3488 } 3489 3490 static void nvme_reset_prepare(struct pci_dev *pdev) 3491 { 3492 struct nvme_dev *dev = pci_get_drvdata(pdev); 3493 3494 /* 3495 * We don't need to check the return value from waiting for the reset 3496 * state as pci_dev device lock is held, making it impossible to race 3497 * with ->remove(). 3498 */ 3499 nvme_disable_prepare_reset(dev, false); 3500 nvme_sync_queues(&dev->ctrl); 3501 } 3502 3503 static void nvme_reset_done(struct pci_dev *pdev) 3504 { 3505 struct nvme_dev *dev = pci_get_drvdata(pdev); 3506 3507 if (!nvme_try_sched_reset(&dev->ctrl)) 3508 flush_work(&dev->ctrl.reset_work); 3509 } 3510 3511 static void nvme_shutdown(struct pci_dev *pdev) 3512 { 3513 struct nvme_dev *dev = pci_get_drvdata(pdev); 3514 3515 nvme_disable_prepare_reset(dev, true); 3516 } 3517 3518 /* 3519 * The driver's remove may be called on a device in a partially initialized 3520 * state. This function must not have any dependencies on the device state in 3521 * order to proceed. 
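 * A surprise-removed device is detected via pci_device_is_present() below and marked dead before the teardown proceeds.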
3522 */ 3523 static void nvme_remove(struct pci_dev *pdev) 3524 { 3525 struct nvme_dev *dev = pci_get_drvdata(pdev); 3526 3527 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); 3528 pci_set_drvdata(pdev, NULL); 3529 3530 if (!pci_device_is_present(pdev)) { 3531 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); 3532 nvme_dev_disable(dev, true); 3533 } 3534 3535 flush_work(&dev->ctrl.reset_work); 3536 nvme_stop_ctrl(&dev->ctrl); 3537 nvme_remove_namespaces(&dev->ctrl); 3538 nvme_dev_disable(dev, true); 3539 nvme_free_host_mem(dev); 3540 nvme_dev_remove_admin(dev); 3541 nvme_dbbuf_dma_free(dev); 3542 nvme_free_queues(dev, 0); 3543 mempool_destroy(dev->iod_meta_mempool); 3544 nvme_release_descriptor_pools(dev); 3545 nvme_dev_unmap(dev); 3546 nvme_uninit_ctrl(&dev->ctrl); 3547 } 3548 3549 #ifdef CONFIG_PM_SLEEP 3550 static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps) 3551 { 3552 return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps); 3553 } 3554 3555 static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps) 3556 { 3557 return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL); 3558 } 3559 3560 static int nvme_resume(struct device *dev) 3561 { 3562 struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); 3563 struct nvme_ctrl *ctrl = &ndev->ctrl; 3564 3565 if (ndev->last_ps == U32_MAX || 3566 nvme_set_power_state(ctrl, ndev->last_ps) != 0) 3567 goto reset; 3568 if (ctrl->hmpre && nvme_setup_host_mem(ndev)) 3569 goto reset; 3570 3571 return 0; 3572 reset: 3573 return nvme_try_sched_reset(ctrl); 3574 } 3575 3576 static int nvme_suspend(struct device *dev) 3577 { 3578 struct pci_dev *pdev = to_pci_dev(dev); 3579 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3580 struct nvme_ctrl *ctrl = &ndev->ctrl; 3581 int ret = -EBUSY; 3582 3583 ndev->last_ps = U32_MAX; 3584 3585 /* 3586 * The platform does not remove power for a kernel managed suspend so 3587 * use host managed nvme power settings for lowest idle power if 3588 * possible. This should have quicker resume latency than a full device 3589 * shutdown. But if the firmware is involved after the suspend or the 3590 * device does not support any non-default power states, shut down the 3591 * device fully. 3592 * 3593 * If ASPM is not enabled for the device, shut down the device and allow 3594 * the PCI bus layer to put it into D3 in order to take the PCIe link 3595 * down, so as to allow the platform to achieve its minimum low-power 3596 * state (which may not be possible if the link is up). 3597 */ 3598 if (pm_suspend_via_firmware() || !ctrl->npss || 3599 !pcie_aspm_enabled(pdev) || 3600 (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) 3601 return nvme_disable_prepare_reset(ndev, true); 3602 3603 nvme_start_freeze(ctrl); 3604 nvme_wait_freeze(ctrl); 3605 nvme_sync_queues(ctrl); 3606 3607 if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) 3608 goto unfreeze; 3609 3610 /* 3611 * Host memory access may not be successful in a system suspend state, 3612 * but the specification allows the controller to access memory in a 3613 * non-operational power state. 3614 */ 3615 if (ndev->hmb) { 3616 ret = nvme_set_host_mem(ndev, 0); 3617 if (ret < 0) 3618 goto unfreeze; 3619 } 3620 3621 ret = nvme_get_power_state(ctrl, &ndev->last_ps); 3622 if (ret < 0) 3623 goto unfreeze; 3624 3625 /* 3626 * A saved state prevents pci pm from generically controlling the 3627 * device's power. If we're using protocol specific settings, we don't 3628 * want pci interfering. 
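 * The saved state is discarded again below via pci_load_saved_state(pdev, NULL) if the power state transition fails.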
3629 */ 3630 pci_save_state(pdev); 3631 3632 ret = nvme_set_power_state(ctrl, ctrl->npss); 3633 if (ret < 0) 3634 goto unfreeze; 3635 3636 if (ret) { 3637 /* discard the saved state */ 3638 pci_load_saved_state(pdev, NULL); 3639 3640 /* 3641 * Clearing npss forces a controller reset on resume. The 3642 * correct value will be rediscovered then. 3643 */ 3644 ret = nvme_disable_prepare_reset(ndev, true); 3645 ctrl->npss = 0; 3646 } 3647 unfreeze: 3648 nvme_unfreeze(ctrl); 3649 return ret; 3650 } 3651 3652 static int nvme_simple_suspend(struct device *dev) 3653 { 3654 struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); 3655 3656 return nvme_disable_prepare_reset(ndev, true); 3657 } 3658 3659 static int nvme_simple_resume(struct device *dev) 3660 { 3661 struct pci_dev *pdev = to_pci_dev(dev); 3662 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3663 3664 return nvme_try_sched_reset(&ndev->ctrl); 3665 } 3666 3667 static const struct dev_pm_ops nvme_dev_pm_ops = { 3668 .suspend = nvme_suspend, 3669 .resume = nvme_resume, 3670 .freeze = nvme_simple_suspend, 3671 .thaw = nvme_simple_resume, 3672 .poweroff = nvme_simple_suspend, 3673 .restore = nvme_simple_resume, 3674 }; 3675 #endif /* CONFIG_PM_SLEEP */ 3676 3677 static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, 3678 pci_channel_state_t state) 3679 { 3680 struct nvme_dev *dev = pci_get_drvdata(pdev); 3681 3682 /* 3683 * A frozen channel requires a reset. When detected, this method shuts 3684 * down the controller to quiesce it. The controller will be restarted 3685 * after the slot reset through the driver's slot_reset callback. 3686 */ 3687 switch (state) { 3688 case pci_channel_io_normal: 3689 return PCI_ERS_RESULT_CAN_RECOVER; 3690 case pci_channel_io_frozen: 3691 dev_warn(dev->ctrl.device, 3692 "frozen state error detected, reset controller\n"); 3693 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) { 3694 nvme_dev_disable(dev, true); 3695 return PCI_ERS_RESULT_DISCONNECT; 3696 } 3697 nvme_dev_disable(dev, false); 3698 return PCI_ERS_RESULT_NEED_RESET; 3699 case pci_channel_io_perm_failure: 3700 dev_warn(dev->ctrl.device, 3701 "failure state error detected, request disconnect\n"); 3702 return PCI_ERS_RESULT_DISCONNECT; 3703 } 3704 return PCI_ERS_RESULT_NEED_RESET; 3705 } 3706 3707 static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) 3708 { 3709 struct nvme_dev *dev = pci_get_drvdata(pdev); 3710 3711 dev_info(dev->ctrl.device, "restart after slot reset\n"); 3712 pci_restore_state(pdev); 3713 if (nvme_try_sched_reset(&dev->ctrl)) 3714 nvme_unquiesce_io_queues(&dev->ctrl); 3715 return PCI_ERS_RESULT_RECOVERED; 3716 } 3717 3718 static void nvme_error_resume(struct pci_dev *pdev) 3719 { 3720 struct nvme_dev *dev = pci_get_drvdata(pdev); 3721 3722 flush_work(&dev->ctrl.reset_work); 3723 } 3724 3725 static const struct pci_error_handlers nvme_err_handler = { 3726 .error_detected = nvme_error_detected, 3727 .slot_reset = nvme_slot_reset, 3728 .resume = nvme_error_resume, 3729 .reset_prepare = nvme_reset_prepare, 3730 .reset_done = nvme_reset_done, 3731 }; 3732 3733 static const struct pci_device_id nvme_id_table[] = { 3734 { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ 3735 .driver_data = NVME_QUIRK_STRIPE_SIZE | 3736 NVME_QUIRK_DEALLOCATE_ZEROES, }, 3737 { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ 3738 .driver_data = NVME_QUIRK_STRIPE_SIZE | 3739 NVME_QUIRK_DEALLOCATE_ZEROES, }, 3740 { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ 3741 .driver_data = NVME_QUIRK_STRIPE_SIZE | 3742
NVME_QUIRK_IGNORE_DEV_SUBNQN | 3743 NVME_QUIRK_BOGUS_NID, }, 3744 { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ 3745 .driver_data = NVME_QUIRK_STRIPE_SIZE, }, 3746 { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ 3747 .driver_data = NVME_QUIRK_NO_DEEPEST_PS | 3748 NVME_QUIRK_MEDIUM_PRIO_SQ | 3749 NVME_QUIRK_NO_TEMP_THRESH_CHANGE | 3750 NVME_QUIRK_DISABLE_WRITE_ZEROES, }, 3751 { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ 3752 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, 3753 { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ 3754 .driver_data = NVME_QUIRK_IDENTIFY_CNS | 3755 NVME_QUIRK_DISABLE_WRITE_ZEROES | 3756 NVME_QUIRK_BOGUS_NID, }, 3757 { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ 3758 .driver_data = NVME_QUIRK_BOGUS_NID, }, 3759 { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ 3760 .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, }, 3761 { PCI_DEVICE(0x126f, 0x1001), /* Silicon Motion generic */ 3762 .driver_data = NVME_QUIRK_NO_DEEPEST_PS | 3763 NVME_QUIRK_IGNORE_DEV_SUBNQN, }, 3764 { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ 3765 .driver_data = NVME_QUIRK_NO_DEEPEST_PS | 3766 NVME_QUIRK_BOGUS_NID, }, 3767 { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ 3768 .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | 3769 NVME_QUIRK_BOGUS_NID, }, 3770 { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ 3771 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | 3772 NVME_QUIRK_NO_NS_DESC_LIST, }, 3773 { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ 3774 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 3775 { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */ 3776 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 3777 { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ 3778 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 3779 { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ 3780 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 3781 { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ 3782 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | 3783 NVME_QUIRK_DISABLE_WRITE_ZEROES| 3784 NVME_QUIRK_IGNORE_DEV_SUBNQN, }, 3785 { PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */ 3786 .driver_data = NVME_QUIRK_BROKEN_MSI }, 3787 { PCI_DEVICE(0x15b7, 0x5009), /* Sandisk SN550 */ 3788 .driver_data = NVME_QUIRK_BROKEN_MSI | 3789 NVME_QUIRK_NO_DEEPEST_PS }, 3790 { PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */ 3791 .driver_data = NVME_QUIRK_BOGUS_NID, }, 3792 { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ 3793 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | 3794 NVME_QUIRK_BOGUS_NID, }, 3795 { PCI_DEVICE(0x1987, 0x5019), /* phison E19 */ 3796 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, 3797 { PCI_DEVICE(0x1987, 0x5021), /* Phison E21 */ 3798 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, 3799 { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ 3800 .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | 3801 NVME_QUIRK_IGNORE_DEV_SUBNQN, }, 3802 { PCI_DEVICE(0x1cc1, 0x33f8), /* ADATA IM2P33F8ABR1 1 TB */ 3803 .driver_data = NVME_QUIRK_BOGUS_NID, }, 3804 { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ 3805 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | 3806 NVME_QUIRK_BOGUS_NID, }, 3807 { PCI_DEVICE(0x10ec, 0x5763), /* ADATA SX6000PNP */ 3808 .driver_data = NVME_QUIRK_BOGUS_NID, }, 3809 { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ 3810 .driver_data = NVME_QUIRK_NO_DEEPEST_PS | 3811 NVME_QUIRK_IGNORE_DEV_SUBNQN, }, 3812 { PCI_DEVICE(0x1344, 0x5407), /* Micron Technology Inc NVMe SSD 
                .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN },
        { PCI_DEVICE(0x1344, 0x6001),   /* Micron Nitro NVMe */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1c5c, 0x174a),   /* SK Hynix P31 SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1c5c, 0x1D59),   /* SK Hynix BC901 */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x15b7, 0x2001),   /* Sandisk Skyhawk */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x144d, 0xa80b),   /* Samsung PM9B1 256G and 512G */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES |
                                NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x144d, 0xa809),   /* Samsung MZALQ256HBJD 256G */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x144d, 0xa802),   /* Samsung SM953 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1cc4, 0x6303),   /* UMIS RPJTJ512MGE1QDY 512G */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1cc4, 0x6302),   /* UMIS RPJTJ256MGE1QDY 256G */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x2646, 0x2262),   /* KINGSTON SKC2000 NVMe SSD */
                .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
        { PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD */
                .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
        { PCI_DEVICE(0x2646, 0x5013),   /* Kingston KC3000, Kingston FURY Renegade */
                .driver_data = NVME_QUIRK_NO_SECONDARY_TEMP_THRESH, },
        { PCI_DEVICE(0x2646, 0x5018),   /* KINGSTON OM8SFP4xxxxP OS21012 NVMe SSD */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x2646, 0x5016),   /* KINGSTON OM3PGP4xxxxP OS21011 NVMe SSD */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x2646, 0x501A),   /* KINGSTON OM8PGP4xxxxP OS21005 NVMe SSD */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x2646, 0x501B),   /* KINGSTON OM8PGP4xxxxQ OS21005 NVMe SSD */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x2646, 0x501E),   /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1f40, 0x1202),   /* Netac Technologies Co. NV3000 NVMe SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1f40, 0x5236),   /* Netac Technologies Co. NV7000 NVMe SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1e4B, 0x1001),   /* MAXIO MAP1001 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1e4B, 0x1002),   /* MAXIO MAP1002 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1e4B, 0x1202),   /* MAXIO MAP1202 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1e4B, 0x1602),   /* MAXIO MAP1602 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1cc1, 0x5350),   /* ADATA XPG GAMMIX S50 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1dbe, 0x5216),   /* Acer/INNOGRIT FA100/5216 NVMe SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1dbe, 0x5236),   /* ADATA XPG GAMMIX S70 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1e49, 0x0021),   /* ZHITAI TiPro5000 NVMe SSD */
                .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
        { PCI_DEVICE(0x1e49, 0x0041),   /* ZHITAI TiPro7000 NVMe SSD */
                .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
        { PCI_DEVICE(0x025e, 0xf1ac),   /* SOLIDIGM P44 pro SSDPFKKW020X7 */
                .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
        { PCI_DEVICE(0xc0a9, 0x540a),   /* Crucial P2 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1d97, 0x2263),   /* Lexar NM610 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1d97, 0x1d97),   /* Lexar NM620 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1d97, 0x2269),   /* Lexar NM760 */
                .driver_data = NVME_QUIRK_BOGUS_NID |
                                NVME_QUIRK_IGNORE_DEV_SUBNQN, },
        { PCI_DEVICE(0x10ec, 0x5763),   /* TEAMGROUP T-FORCE CARDEA ZERO Z330 SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1e4b, 0x1602),   /* HS-SSD-FUTURE 2048G */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x10ec, 0x5765),   /* TEAMGROUP MP33 2TB SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
                .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
        { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
                .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
        { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
                .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
        { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
                .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
        { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
                .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
        { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
                .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
                /*
                 * Fix for the Apple controller found in the MacBook8,1 and
                 * some MacBook7,1 to avoid controller resets and data loss.
                 */
                .driver_data = NVME_QUIRK_SINGLE_VECTOR |
                                NVME_QUIRK_QDEPTH_ONE },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
                .driver_data = NVME_QUIRK_SINGLE_VECTOR |
                                NVME_QUIRK_128_BYTES_SQES |
                                NVME_QUIRK_SHARED_TAGS |
                                NVME_QUIRK_SKIP_CID_GEN |
                                NVME_QUIRK_IDENTIFY_CNS },
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        { 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
        .name           = "nvme",
        .id_table       = nvme_id_table,
        .probe          = nvme_probe,
        .remove         = nvme_remove,
        .shutdown       = nvme_shutdown,
        .driver         = {
                .probe_type     = PROBE_PREFER_ASYNCHRONOUS,
#ifdef CONFIG_PM_SLEEP
                .pm             = &nvme_dev_pm_ops,
#endif
        },
        .sriov_configure = pci_sriov_configure_simple,
        .err_handler    = &nvme_err_handler,
};

static int __init nvme_init(void)
{
        /*
         * The queue-management commands must fit exactly in a 64-byte
         * submission queue entry, and queue setup relies on at least two
         * IRQ affinity sets (default and read queues).
         */
        BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
        BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);

        return pci_register_driver(&nvme_driver);
}

static void __exit nvme_exit(void)
{
        pci_unregister_driver(&nvme_driver);
        flush_workqueue(nvme_wq);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
MODULE_DESCRIPTION("NVMe host PCIe transport driver");
module_init(nvme_init);
module_exit(nvme_exit);