1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 #ifndef __FreeBSD__ 66 #include <endian.h> 67 #endif 68 69 #include <assert.h> 70 #include <pthread.h> 71 #include <pthread_np.h> 72 #include <semaphore.h> 73 #include <stdbool.h> 74 #include <stddef.h> 75 #include <stdint.h> 76 #include <stdio.h> 77 #include <stdlib.h> 78 #include <string.h> 79 80 #include <machine/atomic.h> 81 #include <machine/vmm.h> 82 #include <vmmapi.h> 83 84 #include <dev/nvme/nvme.h> 85 86 #include "bhyverun.h" 87 #include "block_if.h" 88 #include "config.h" 89 #include "debug.h" 90 #include "pci_emul.h" 91 92 93 static int nvme_debug = 0; 94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 95 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 96 97 /* defaults; can be overridden */ 98 #define NVME_MSIX_BAR 4 99 100 #define NVME_IOSLOTS 8 101 102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 103 #define NVME_MMIO_SPACE_MIN (1 << 14) 104 105 #define NVME_QUEUES 16 106 #define NVME_MAX_QENTRIES 2048 107 /* Memory Page size Minimum reported in CAP register */ 108 #define NVME_MPSMIN 0 109 /* MPSMIN converted to bytes */ 110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 111 112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 113 #define NVME_MDTS 9 114 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 117 118 /* This is a synthetic status code to indicate there is no status */ 119 #define NVME_NO_STATUS 0xffff 120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 121 122 /* Reported temperature in Kelvin (i.e. room temperature) */ 123 #define NVME_TEMPERATURE 296 124 125 /* helpers */ 126 127 /* Convert a zero-based value into a one-based value */ 128 #define ONE_BASED(zero) ((zero) + 1) 129 /* Convert a one-based value into a zero-based value */ 130 #define ZERO_BASED(one) ((one) - 1) 131 132 /* Encode number of SQ's and CQ's for Set/Get Features */ 133 #define NVME_FEATURE_NUM_QUEUES(sc) \ 134 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 135 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 136 137 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 138 139 enum nvme_controller_register_offsets { 140 NVME_CR_CAP_LOW = 0x00, 141 NVME_CR_CAP_HI = 0x04, 142 NVME_CR_VS = 0x08, 143 NVME_CR_INTMS = 0x0c, 144 NVME_CR_INTMC = 0x10, 145 NVME_CR_CC = 0x14, 146 NVME_CR_CSTS = 0x1c, 147 NVME_CR_NSSR = 0x20, 148 NVME_CR_AQA = 0x24, 149 NVME_CR_ASQ_LOW = 0x28, 150 NVME_CR_ASQ_HI = 0x2c, 151 NVME_CR_ACQ_LOW = 0x30, 152 NVME_CR_ACQ_HI = 0x34, 153 }; 154 155 enum nvme_cmd_cdw11 { 156 NVME_CMD_CDW11_PC = 0x0001, 157 NVME_CMD_CDW11_IEN = 0x0002, 158 NVME_CMD_CDW11_IV = 0xFFFF0000, 159 }; 160 161 enum nvme_copy_dir { 162 NVME_COPY_TO_PRP, 163 NVME_COPY_FROM_PRP, 164 }; 165 166 #define NVME_CQ_INTEN 0x01 167 #define NVME_CQ_INTCOAL 0x02 168 169 struct nvme_completion_queue { 170 struct nvme_completion *qbase; 171 pthread_mutex_t mtx; 172 uint32_t size; 173 uint16_t tail; /* nvme progress */ 174 uint16_t head; /* guest progress */ 175 uint16_t intr_vec; 176 uint32_t intr_en; 177 }; 178 179 struct nvme_submission_queue { 180 struct nvme_command *qbase; 181 pthread_mutex_t mtx; 182 uint32_t size; 183 uint16_t head; /* nvme progress */ 184 uint16_t tail; /* guest progress */ 185 uint16_t cqid; /* completion queue id */ 186 int qpriority; 187 }; 188 189 enum nvme_storage_type { 190 NVME_STOR_BLOCKIF = 0, 191 NVME_STOR_RAM = 1, 192 }; 193 194 struct pci_nvme_blockstore { 195 enum nvme_storage_type type; 196 void *ctx; 197 uint64_t size; 198 uint32_t sectsz; 199 uint32_t sectsz_bits; 200 uint64_t eui64; 201 uint32_t deallocate:1; 202 }; 203 204 /* 205 * Calculate the number of additional page descriptors for guest IO requests 206 * based on the advertised Max Data Transfer (MDTS) and given the number of 207 * default iovec's in a struct blockif_req. 208 */ 209 #define MDTS_PAD_SIZE \ 210 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 211 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 212 0 ) 213 214 struct pci_nvme_ioreq { 215 struct pci_nvme_softc *sc; 216 STAILQ_ENTRY(pci_nvme_ioreq) link; 217 struct nvme_submission_queue *nvme_sq; 218 uint16_t sqid; 219 220 /* command information */ 221 uint16_t opc; 222 uint16_t cid; 223 uint32_t nsid; 224 225 uint64_t prev_gpaddr; 226 size_t prev_size; 227 size_t bytes; 228 229 struct blockif_req io_req; 230 231 struct iovec iovpadding[MDTS_PAD_SIZE]; 232 }; 233 234 enum nvme_dsm_type { 235 /* Dataset Management bit in ONCS reflects backing storage capability */ 236 NVME_DATASET_MANAGEMENT_AUTO, 237 /* Unconditionally set Dataset Management bit in ONCS */ 238 NVME_DATASET_MANAGEMENT_ENABLE, 239 /* Unconditionally clear Dataset Management bit in ONCS */ 240 NVME_DATASET_MANAGEMENT_DISABLE, 241 }; 242 243 struct pci_nvme_softc; 244 struct nvme_feature_obj; 245 246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 247 struct nvme_feature_obj *, 248 struct nvme_command *, 249 struct nvme_completion *); 250 251 struct nvme_feature_obj { 252 uint32_t cdw11; 253 nvme_feature_cb set; 254 nvme_feature_cb get; 255 bool namespace_specific; 256 }; 257 258 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 259 260 typedef enum { 261 PCI_NVME_AE_TYPE_ERROR = 0, 262 PCI_NVME_AE_TYPE_SMART, 263 PCI_NVME_AE_TYPE_NOTICE, 264 PCI_NVME_AE_TYPE_IO_CMD = 6, 265 PCI_NVME_AE_TYPE_VENDOR = 7, 266 PCI_NVME_AE_TYPE_MAX /* Must be last */ 267 } pci_nvme_async_type; 268 269 /* Asynchronous Event Requests */ 270 struct pci_nvme_aer { 271 STAILQ_ENTRY(pci_nvme_aer) link; 272 uint16_t cid; /* Command ID of the submitted AER */ 273 }; 274 275 /** Asynchronous Event Information - Notice */ 276 typedef enum { 277 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 278 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 279 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 280 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 281 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 282 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 283 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 284 PCI_NVME_AEI_NOTICE_MAX, 285 } pci_nvme_async_event_info_notice; 286 287 #define PCI_NVME_AEI_NOTICE_SHIFT 8 288 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 289 290 /* Asynchronous Event Notifications */ 291 struct pci_nvme_aen { 292 pci_nvme_async_type atype; 293 uint32_t event_data; 294 bool posted; 295 }; 296 297 /* 298 * By default, enable all Asynchrnous Event Notifications: 299 * SMART / Health Critical Warnings 300 * Namespace Attribute Notices 301 */ 302 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 303 304 typedef enum { 305 NVME_CNTRLTYPE_IO = 1, 306 NVME_CNTRLTYPE_DISCOVERY = 2, 307 NVME_CNTRLTYPE_ADMIN = 3, 308 } pci_nvme_cntrl_type; 309 310 struct pci_nvme_softc { 311 struct pci_devinst *nsc_pi; 312 313 pthread_mutex_t mtx; 314 315 struct nvme_registers regs; 316 317 struct nvme_namespace_data nsdata; 318 struct nvme_controller_data ctrldata; 319 struct nvme_error_information_entry err_log; 320 struct nvme_health_information_page health_log; 321 struct nvme_firmware_page fw_log; 322 struct nvme_ns_list ns_log; 323 324 struct pci_nvme_blockstore nvstore; 325 326 uint16_t max_qentries; /* max entries per queue */ 327 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 328 uint32_t num_cqueues; 329 uint32_t num_squeues; 330 bool num_q_is_set; /* Has host set Number of Queues */ 331 332 struct pci_nvme_ioreq *ioreqs; 333 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 334 uint32_t pending_ios; 335 uint32_t ioslots; 336 sem_t iosemlock; 337 338 /* 339 * Memory mapped Submission and Completion queues 340 * Each array includes both Admin and IO queues 341 */ 342 struct nvme_completion_queue *compl_queues; 343 struct nvme_submission_queue *submit_queues; 344 345 struct nvme_feature_obj feat[NVME_FID_MAX]; 346 347 enum nvme_dsm_type dataset_management; 348 349 /* Accounting for SMART data */ 350 __uint128_t read_data_units; 351 __uint128_t write_data_units; 352 __uint128_t read_commands; 353 __uint128_t write_commands; 354 uint32_t read_dunits_remainder; 355 uint32_t write_dunits_remainder; 356 357 STAILQ_HEAD(, pci_nvme_aer) aer_list; 358 pthread_mutex_t aer_mtx; 359 uint32_t aer_count; 360 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 361 pthread_t aen_tid; 362 pthread_mutex_t aen_mtx; 363 pthread_cond_t aen_cond; 364 }; 365 366 367 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 368 struct nvme_completion_queue *cq, 369 uint32_t cdw0, 370 uint16_t cid, 371 uint16_t sqid, 372 uint16_t status); 373 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 374 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 375 static void pci_nvme_io_done(struct blockif_req *, int); 376 377 /* Controller Configuration utils */ 378 #define NVME_CC_GET_EN(cc) \ 379 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 380 #define NVME_CC_GET_CSS(cc) \ 381 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 382 #define NVME_CC_GET_SHN(cc) \ 383 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 384 #define NVME_CC_GET_IOSQES(cc) \ 385 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 386 #define NVME_CC_GET_IOCQES(cc) \ 387 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 388 389 #define NVME_CC_WRITE_MASK \ 390 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 391 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 392 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 393 394 #define NVME_CC_NEN_WRITE_MASK \ 395 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 396 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 397 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 398 399 /* Controller Status utils */ 400 #define NVME_CSTS_GET_RDY(sts) \ 401 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 402 403 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 404 #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) 405 406 /* Completion Queue status word utils */ 407 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 408 #define NVME_STATUS_MASK \ 409 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 410 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 411 412 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 413 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 414 415 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 416 struct nvme_feature_obj *, 417 struct nvme_command *, 418 struct nvme_completion *); 419 static void nvme_feature_temperature(struct pci_nvme_softc *, 420 struct nvme_feature_obj *, 421 struct nvme_command *, 422 struct nvme_completion *); 423 static void nvme_feature_num_queues(struct pci_nvme_softc *, 424 struct nvme_feature_obj *, 425 struct nvme_command *, 426 struct nvme_completion *); 427 static void nvme_feature_iv_config(struct pci_nvme_softc *, 428 struct nvme_feature_obj *, 429 struct nvme_command *, 430 struct nvme_completion *); 431 static void nvme_feature_async_event(struct pci_nvme_softc *, 432 struct nvme_feature_obj *, 433 struct nvme_command *, 434 struct nvme_completion *); 435 436 static void *aen_thr(void *arg); 437 438 static __inline void 439 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 440 { 441 size_t len; 442 443 len = strnlen(src, dst_size); 444 memset(dst, pad, dst_size); 445 memcpy(dst, src, len); 446 } 447 448 static __inline void 449 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 450 { 451 452 *status &= ~NVME_STATUS_MASK; 453 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 454 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 455 } 456 457 static __inline void 458 pci_nvme_status_genc(uint16_t *status, uint16_t code) 459 { 460 461 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 462 } 463 464 /* 465 * Initialize the requested number or IO Submission and Completion Queues. 466 * Admin queues are allocated implicitly. 467 */ 468 static void 469 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 470 { 471 uint32_t i; 472 473 /* 474 * Allocate and initialize the Submission Queues 475 */ 476 if (nsq > NVME_QUEUES) { 477 WPRINTF("%s: clamping number of SQ from %u to %u", 478 __func__, nsq, NVME_QUEUES); 479 nsq = NVME_QUEUES; 480 } 481 482 sc->num_squeues = nsq; 483 484 sc->submit_queues = calloc(sc->num_squeues + 1, 485 sizeof(struct nvme_submission_queue)); 486 if (sc->submit_queues == NULL) { 487 WPRINTF("%s: SQ allocation failed", __func__); 488 sc->num_squeues = 0; 489 } else { 490 struct nvme_submission_queue *sq = sc->submit_queues; 491 492 for (i = 0; i < sc->num_squeues + 1; i++) 493 pthread_mutex_init(&sq[i].mtx, NULL); 494 } 495 496 /* 497 * Allocate and initialize the Completion Queues 498 */ 499 if (ncq > NVME_QUEUES) { 500 WPRINTF("%s: clamping number of CQ from %u to %u", 501 __func__, ncq, NVME_QUEUES); 502 ncq = NVME_QUEUES; 503 } 504 505 sc->num_cqueues = ncq; 506 507 sc->compl_queues = calloc(sc->num_cqueues + 1, 508 sizeof(struct nvme_completion_queue)); 509 if (sc->compl_queues == NULL) { 510 WPRINTF("%s: CQ allocation failed", __func__); 511 sc->num_cqueues = 0; 512 } else { 513 struct nvme_completion_queue *cq = sc->compl_queues; 514 515 for (i = 0; i < sc->num_cqueues + 1; i++) 516 pthread_mutex_init(&cq[i].mtx, NULL); 517 } 518 } 519 520 static void 521 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 522 { 523 struct nvme_controller_data *cd = &sc->ctrldata; 524 525 cd->vid = 0xFB5D; 526 cd->ssvid = 0x0000; 527 528 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 529 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 530 531 /* Num of submission commands that we can handle at a time (2^rab) */ 532 cd->rab = 4; 533 534 /* FreeBSD OUI */ 535 cd->ieee[0] = 0x58; 536 cd->ieee[1] = 0x9c; 537 cd->ieee[2] = 0xfc; 538 539 cd->mic = 0; 540 541 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 542 543 cd->ver = NVME_REV(1,4); 544 545 cd->cntrltype = NVME_CNTRLTYPE_IO; 546 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 547 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); 548 cd->acl = 2; 549 cd->aerl = 4; 550 551 /* Advertise 1, Read-only firmware slot */ 552 cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | 553 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 554 cd->lpa = 0; /* TODO: support some simple things like SMART */ 555 cd->elpe = 0; /* max error log page entries */ 556 /* 557 * Report a single power state (zero-based value) 558 * power_state[] values are left as zero to indicate "Not reported" 559 */ 560 cd->npss = 0; 561 562 /* Warning Composite Temperature Threshold */ 563 cd->wctemp = 0x0157; 564 cd->cctemp = 0x0157; 565 566 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ 567 cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << 568 NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); 569 570 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 571 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 572 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 573 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 574 cd->nn = 1; /* number of namespaces */ 575 576 cd->oncs = 0; 577 switch (sc->dataset_management) { 578 case NVME_DATASET_MANAGEMENT_AUTO: 579 if (sc->nvstore.deallocate) 580 cd->oncs |= NVME_ONCS_DSM; 581 break; 582 case NVME_DATASET_MANAGEMENT_ENABLE: 583 cd->oncs |= NVME_ONCS_DSM; 584 break; 585 default: 586 break; 587 } 588 589 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 590 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 591 592 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 593 } 594 595 /* 596 * Calculate the CRC-16 of the given buffer 597 * See copyright attribution at top of file 598 */ 599 static uint16_t 600 crc16(uint16_t crc, const void *buffer, unsigned int len) 601 { 602 const unsigned char *cp = buffer; 603 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 604 static uint16_t const crc16_table[256] = { 605 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 606 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 607 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 608 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 609 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 610 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 611 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 612 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 613 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 614 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 615 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 616 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 617 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 618 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 619 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 620 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 621 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 622 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 623 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 624 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 625 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 626 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 627 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 628 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 629 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 630 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 631 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 632 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 633 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 634 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 635 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 636 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 637 }; 638 639 while (len--) 640 crc = (((crc >> 8) & 0xffU) ^ 641 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 642 return crc; 643 } 644 645 static void 646 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 647 struct nvme_namespace_data *nd) 648 { 649 650 /* Get capacity and block size information from backing store */ 651 nd->nsze = nvstore->size / nvstore->sectsz; 652 nd->ncap = nd->nsze; 653 nd->nuse = nd->nsze; 654 } 655 656 static void 657 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 658 struct nvme_namespace_data *nd, uint32_t nsid, 659 struct pci_nvme_blockstore *nvstore) 660 { 661 662 pci_nvme_init_nsdata_size(nvstore, nd); 663 664 if (nvstore->type == NVME_STOR_BLOCKIF) 665 nvstore->deallocate = blockif_candelete(nvstore->ctx); 666 667 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 668 nd->flbas = 0; 669 670 /* Create an EUI-64 if user did not provide one */ 671 if (nvstore->eui64 == 0) { 672 char *data = NULL; 673 uint64_t eui64 = nvstore->eui64; 674 675 asprintf(&data, "%s%u%u%u", get_config_value("name"), 676 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 677 sc->nsc_pi->pi_func); 678 679 if (data != NULL) { 680 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 681 free(data); 682 } 683 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 684 } 685 be64enc(nd->eui64, nvstore->eui64); 686 687 /* LBA data-sz = 2^lbads */ 688 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 689 } 690 691 static void 692 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 693 { 694 695 memset(&sc->err_log, 0, sizeof(sc->err_log)); 696 memset(&sc->health_log, 0, sizeof(sc->health_log)); 697 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 698 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 699 700 /* Set read/write remainder to round up according to spec */ 701 sc->read_dunits_remainder = 999; 702 sc->write_dunits_remainder = 999; 703 704 /* Set nominal Health values checked by implementations */ 705 sc->health_log.temperature = NVME_TEMPERATURE; 706 sc->health_log.available_spare = 100; 707 sc->health_log.available_spare_threshold = 10; 708 709 /* Set Active Firmware Info to slot 1 */ 710 sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); 711 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, 712 sizeof(sc->fw_log.revision[0])); 713 } 714 715 static void 716 pci_nvme_init_features(struct pci_nvme_softc *sc) 717 { 718 enum nvme_feature fid; 719 720 for (fid = 0; fid < NVME_FID_MAX; fid++) { 721 switch (fid) { 722 case NVME_FEAT_ARBITRATION: 723 case NVME_FEAT_POWER_MANAGEMENT: 724 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 725 case NVME_FEAT_WRITE_ATOMICITY: 726 /* Mandatory but no special handling required */ 727 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 728 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 729 // this returns a data buffer 730 break; 731 case NVME_FEAT_TEMPERATURE_THRESHOLD: 732 sc->feat[fid].set = nvme_feature_temperature; 733 break; 734 case NVME_FEAT_ERROR_RECOVERY: 735 sc->feat[fid].namespace_specific = true; 736 break; 737 case NVME_FEAT_NUMBER_OF_QUEUES: 738 sc->feat[fid].set = nvme_feature_num_queues; 739 break; 740 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 741 sc->feat[fid].set = nvme_feature_iv_config; 742 break; 743 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 744 sc->feat[fid].set = nvme_feature_async_event; 745 /* Enable all AENs by default */ 746 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 747 break; 748 default: 749 sc->feat[fid].set = nvme_feature_invalid_cb; 750 sc->feat[fid].get = nvme_feature_invalid_cb; 751 } 752 } 753 } 754 755 static void 756 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 757 { 758 759 STAILQ_INIT(&sc->aer_list); 760 sc->aer_count = 0; 761 } 762 763 static void 764 pci_nvme_aer_init(struct pci_nvme_softc *sc) 765 { 766 767 pthread_mutex_init(&sc->aer_mtx, NULL); 768 pci_nvme_aer_reset(sc); 769 } 770 771 static void 772 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 773 { 774 struct pci_nvme_aer *aer = NULL; 775 776 pthread_mutex_lock(&sc->aer_mtx); 777 while (!STAILQ_EMPTY(&sc->aer_list)) { 778 aer = STAILQ_FIRST(&sc->aer_list); 779 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 780 free(aer); 781 } 782 pthread_mutex_unlock(&sc->aer_mtx); 783 784 pci_nvme_aer_reset(sc); 785 } 786 787 static bool 788 pci_nvme_aer_available(struct pci_nvme_softc *sc) 789 { 790 791 return (sc->aer_count != 0); 792 } 793 794 static bool 795 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 796 { 797 struct nvme_controller_data *cd = &sc->ctrldata; 798 799 /* AERL is a zero based value while aer_count is one's based */ 800 return (sc->aer_count == (cd->aerl + 1)); 801 } 802 803 /* 804 * Add an Async Event Request 805 * 806 * Stores an AER to be returned later if the Controller needs to notify the 807 * host of an event. 808 * Note that while the NVMe spec doesn't require Controllers to return AER's 809 * in order, this implementation does preserve the order. 810 */ 811 static int 812 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 813 { 814 struct pci_nvme_aer *aer = NULL; 815 816 aer = calloc(1, sizeof(struct pci_nvme_aer)); 817 if (aer == NULL) 818 return (-1); 819 820 /* Save the Command ID for use in the completion message */ 821 aer->cid = cid; 822 823 pthread_mutex_lock(&sc->aer_mtx); 824 sc->aer_count++; 825 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 826 pthread_mutex_unlock(&sc->aer_mtx); 827 828 return (0); 829 } 830 831 /* 832 * Get an Async Event Request structure 833 * 834 * Returns a pointer to an AER previously submitted by the host or NULL if 835 * no AER's exist. Caller is responsible for freeing the returned struct. 836 */ 837 static struct pci_nvme_aer * 838 pci_nvme_aer_get(struct pci_nvme_softc *sc) 839 { 840 struct pci_nvme_aer *aer = NULL; 841 842 pthread_mutex_lock(&sc->aer_mtx); 843 aer = STAILQ_FIRST(&sc->aer_list); 844 if (aer != NULL) { 845 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 846 sc->aer_count--; 847 } 848 pthread_mutex_unlock(&sc->aer_mtx); 849 850 return (aer); 851 } 852 853 static void 854 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 855 { 856 uint32_t atype; 857 858 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 859 860 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 861 sc->aen[atype].atype = atype; 862 } 863 } 864 865 static void 866 pci_nvme_aen_init(struct pci_nvme_softc *sc) 867 { 868 char nstr[80]; 869 870 pci_nvme_aen_reset(sc); 871 872 pthread_mutex_init(&sc->aen_mtx, NULL); 873 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 874 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 875 sc->nsc_pi->pi_func); 876 pthread_set_name_np(sc->aen_tid, nstr); 877 } 878 879 static void 880 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 881 { 882 883 pci_nvme_aen_reset(sc); 884 } 885 886 /* Notify the AEN thread of pending work */ 887 static void 888 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 889 { 890 891 pthread_cond_signal(&sc->aen_cond); 892 } 893 894 /* 895 * Post an Asynchronous Event Notification 896 */ 897 static int32_t 898 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 899 uint32_t event_data) 900 { 901 struct pci_nvme_aen *aen; 902 903 if (atype >= PCI_NVME_AE_TYPE_MAX) { 904 return(EINVAL); 905 } 906 907 pthread_mutex_lock(&sc->aen_mtx); 908 aen = &sc->aen[atype]; 909 910 /* Has the controller already posted an event of this type? */ 911 if (aen->posted) { 912 pthread_mutex_unlock(&sc->aen_mtx); 913 return(EALREADY); 914 } 915 916 aen->event_data = event_data; 917 aen->posted = true; 918 pthread_mutex_unlock(&sc->aen_mtx); 919 920 pci_nvme_aen_notify(sc); 921 922 return(0); 923 } 924 925 static void 926 pci_nvme_aen_process(struct pci_nvme_softc *sc) 927 { 928 struct pci_nvme_aer *aer; 929 struct pci_nvme_aen *aen; 930 pci_nvme_async_type atype; 931 uint32_t mask; 932 uint16_t status; 933 uint8_t lid; 934 935 #ifndef __FreeBSD__ 936 lid = 0; 937 #endif 938 939 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 940 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 941 aen = &sc->aen[atype]; 942 /* Previous iterations may have depleted the available AER's */ 943 if (!pci_nvme_aer_available(sc)) { 944 DPRINTF("%s: no AER", __func__); 945 break; 946 } 947 948 if (!aen->posted) { 949 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 950 continue; 951 } 952 953 status = NVME_SC_SUCCESS; 954 955 /* Is the event masked? */ 956 mask = 957 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 958 959 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 960 switch (atype) { 961 case PCI_NVME_AE_TYPE_ERROR: 962 lid = NVME_LOG_ERROR; 963 break; 964 case PCI_NVME_AE_TYPE_SMART: 965 mask &= 0xff; 966 if ((mask & aen->event_data) == 0) 967 continue; 968 lid = NVME_LOG_HEALTH_INFORMATION; 969 break; 970 case PCI_NVME_AE_TYPE_NOTICE: 971 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 972 EPRINTLN("%s unknown AEN notice type %u", 973 __func__, aen->event_data); 974 status = NVME_SC_INTERNAL_DEVICE_ERROR; 975 break; 976 } 977 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 978 continue; 979 switch (aen->event_data) { 980 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 981 lid = NVME_LOG_CHANGED_NAMESPACE; 982 break; 983 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 984 lid = NVME_LOG_FIRMWARE_SLOT; 985 break; 986 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 987 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 988 break; 989 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 990 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 991 break; 992 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 993 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 994 break; 995 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 996 lid = NVME_LOG_LBA_STATUS_INFORMATION; 997 break; 998 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 999 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 1000 break; 1001 default: 1002 lid = 0; 1003 } 1004 break; 1005 default: 1006 /* bad type?!? */ 1007 EPRINTLN("%s unknown AEN type %u", __func__, atype); 1008 status = NVME_SC_INTERNAL_DEVICE_ERROR; 1009 break; 1010 } 1011 1012 aer = pci_nvme_aer_get(sc); 1013 assert(aer != NULL); 1014 1015 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 1016 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1017 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 1018 aer->cid, 1019 0, /* SQID */ 1020 status); 1021 1022 aen->event_data = 0; 1023 aen->posted = false; 1024 1025 pci_generate_msix(sc->nsc_pi, 0); 1026 } 1027 } 1028 1029 static void * 1030 aen_thr(void *arg) 1031 { 1032 struct pci_nvme_softc *sc; 1033 1034 sc = arg; 1035 1036 pthread_mutex_lock(&sc->aen_mtx); 1037 for (;;) { 1038 pci_nvme_aen_process(sc); 1039 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1040 } 1041 #ifdef __FreeBSD__ /* Smatch spots unreachable code */ 1042 pthread_mutex_unlock(&sc->aen_mtx); 1043 1044 pthread_exit(NULL); 1045 #endif 1046 return (NULL); 1047 } 1048 1049 static void 1050 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1051 { 1052 uint32_t i; 1053 1054 DPRINTF("%s", __func__); 1055 1056 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1057 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1058 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1059 1060 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1061 1062 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1063 1064 sc->regs.cc = 0; 1065 1066 assert(sc->submit_queues != NULL); 1067 1068 for (i = 0; i < sc->num_squeues + 1; i++) { 1069 sc->submit_queues[i].qbase = NULL; 1070 sc->submit_queues[i].size = 0; 1071 sc->submit_queues[i].cqid = 0; 1072 sc->submit_queues[i].tail = 0; 1073 sc->submit_queues[i].head = 0; 1074 } 1075 1076 assert(sc->compl_queues != NULL); 1077 1078 for (i = 0; i < sc->num_cqueues + 1; i++) { 1079 sc->compl_queues[i].qbase = NULL; 1080 sc->compl_queues[i].size = 0; 1081 sc->compl_queues[i].tail = 0; 1082 sc->compl_queues[i].head = 0; 1083 } 1084 1085 sc->num_q_is_set = false; 1086 1087 pci_nvme_aer_destroy(sc); 1088 pci_nvme_aen_destroy(sc); 1089 1090 /* 1091 * Clear CSTS.RDY last to prevent the host from enabling Controller 1092 * before cleanup completes 1093 */ 1094 sc->regs.csts = 0; 1095 } 1096 1097 static void 1098 pci_nvme_reset(struct pci_nvme_softc *sc) 1099 { 1100 pthread_mutex_lock(&sc->mtx); 1101 pci_nvme_reset_locked(sc); 1102 pthread_mutex_unlock(&sc->mtx); 1103 } 1104 1105 static int 1106 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1107 { 1108 uint16_t acqs, asqs; 1109 1110 DPRINTF("%s", __func__); 1111 1112 /* 1113 * NVMe 2.0 states that "enabling a controller while this field is 1114 * cleared to 0h produces undefined results" for both ACQS and 1115 * ASQS. If zero, set CFS and do not become ready. 1116 */ 1117 asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); 1118 if (asqs < 2) { 1119 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, 1120 asqs - 1, sc->regs.aqa); 1121 sc->regs.csts |= NVME_CSTS_CFS; 1122 return (-1); 1123 } 1124 sc->submit_queues[0].size = asqs; 1125 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1126 sizeof(struct nvme_command) * asqs); 1127 if (sc->submit_queues[0].qbase == NULL) { 1128 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, 1129 sc->regs.asq); 1130 sc->regs.csts |= NVME_CSTS_CFS; 1131 return (-1); 1132 } 1133 1134 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1135 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1136 1137 acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1138 NVME_AQA_REG_ACQS_MASK); 1139 if (acqs < 2) { 1140 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, 1141 acqs - 1, sc->regs.aqa); 1142 sc->regs.csts |= NVME_CSTS_CFS; 1143 return (-1); 1144 } 1145 sc->compl_queues[0].size = acqs; 1146 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1147 sizeof(struct nvme_completion) * acqs); 1148 if (sc->compl_queues[0].qbase == NULL) { 1149 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, 1150 sc->regs.acq); 1151 sc->regs.csts |= NVME_CSTS_CFS; 1152 return (-1); 1153 } 1154 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1155 1156 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1157 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1158 1159 return (0); 1160 } 1161 1162 static int 1163 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1164 size_t len, enum nvme_copy_dir dir) 1165 { 1166 uint8_t *p; 1167 size_t bytes; 1168 1169 if (len > (8 * 1024)) { 1170 return (-1); 1171 } 1172 1173 /* Copy from the start of prp1 to the end of the physical page */ 1174 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1175 bytes = MIN(bytes, len); 1176 1177 p = vm_map_gpa(ctx, prp1, bytes); 1178 if (p == NULL) { 1179 return (-1); 1180 } 1181 1182 if (dir == NVME_COPY_TO_PRP) 1183 memcpy(p, b, bytes); 1184 else 1185 memcpy(b, p, bytes); 1186 1187 b += bytes; 1188 1189 len -= bytes; 1190 if (len == 0) { 1191 return (0); 1192 } 1193 1194 len = MIN(len, PAGE_SIZE); 1195 1196 p = vm_map_gpa(ctx, prp2, len); 1197 if (p == NULL) { 1198 return (-1); 1199 } 1200 1201 if (dir == NVME_COPY_TO_PRP) 1202 memcpy(p, b, len); 1203 else 1204 memcpy(b, p, len); 1205 1206 return (0); 1207 } 1208 1209 /* 1210 * Write a Completion Queue Entry update 1211 * 1212 * Write the completion and update the doorbell value 1213 */ 1214 static void 1215 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1216 struct nvme_completion_queue *cq, 1217 uint32_t cdw0, 1218 uint16_t cid, 1219 uint16_t sqid, 1220 uint16_t status) 1221 { 1222 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1223 struct nvme_completion *cqe; 1224 1225 assert(cq->qbase != NULL); 1226 1227 pthread_mutex_lock(&cq->mtx); 1228 1229 cqe = &cq->qbase[cq->tail]; 1230 1231 /* Flip the phase bit */ 1232 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1233 1234 cqe->cdw0 = cdw0; 1235 cqe->sqhd = sq->head; 1236 cqe->sqid = sqid; 1237 cqe->cid = cid; 1238 cqe->status = status; 1239 1240 cq->tail++; 1241 if (cq->tail >= cq->size) { 1242 cq->tail = 0; 1243 } 1244 1245 pthread_mutex_unlock(&cq->mtx); 1246 } 1247 1248 static int 1249 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1250 struct nvme_completion* compl) 1251 { 1252 uint16_t qid = command->cdw10 & 0xffff; 1253 1254 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1255 if (qid == 0 || qid > sc->num_squeues || 1256 (sc->submit_queues[qid].qbase == NULL)) { 1257 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1258 __func__, qid, sc->num_squeues); 1259 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1260 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1261 return (1); 1262 } 1263 1264 sc->submit_queues[qid].qbase = NULL; 1265 sc->submit_queues[qid].cqid = 0; 1266 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1267 return (1); 1268 } 1269 1270 static int 1271 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1272 struct nvme_completion* compl) 1273 { 1274 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1275 uint16_t qid = command->cdw10 & 0xffff; 1276 struct nvme_submission_queue *nsq; 1277 1278 if ((qid == 0) || (qid > sc->num_squeues) || 1279 (sc->submit_queues[qid].qbase != NULL)) { 1280 WPRINTF("%s queue index %u > num_squeues %u", 1281 __func__, qid, sc->num_squeues); 1282 pci_nvme_status_tc(&compl->status, 1283 NVME_SCT_COMMAND_SPECIFIC, 1284 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1285 return (1); 1286 } 1287 1288 nsq = &sc->submit_queues[qid]; 1289 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1290 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1291 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1292 /* 1293 * Queues must specify at least two entries 1294 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1295 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1296 */ 1297 pci_nvme_status_tc(&compl->status, 1298 NVME_SCT_COMMAND_SPECIFIC, 1299 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1300 return (1); 1301 } 1302 nsq->head = nsq->tail = 0; 1303 1304 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1305 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1306 pci_nvme_status_tc(&compl->status, 1307 NVME_SCT_COMMAND_SPECIFIC, 1308 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1309 return (1); 1310 } 1311 1312 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1313 pci_nvme_status_tc(&compl->status, 1314 NVME_SCT_COMMAND_SPECIFIC, 1315 NVME_SC_COMPLETION_QUEUE_INVALID); 1316 return (1); 1317 } 1318 1319 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1320 1321 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1322 sizeof(struct nvme_command) * (size_t)nsq->size); 1323 1324 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1325 qid, nsq->size, nsq->qbase, nsq->cqid); 1326 1327 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1328 1329 DPRINTF("%s completed creating IOSQ qid %u", 1330 __func__, qid); 1331 } else { 1332 /* 1333 * Guest sent non-cont submission queue request. 1334 * This setting is unsupported by this emulation. 1335 */ 1336 WPRINTF("%s unsupported non-contig (list-based) " 1337 "create i/o submission queue", __func__); 1338 1339 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1340 } 1341 return (1); 1342 } 1343 1344 static int 1345 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1346 struct nvme_completion* compl) 1347 { 1348 uint16_t qid = command->cdw10 & 0xffff; 1349 uint16_t sqid; 1350 1351 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1352 if (qid == 0 || qid > sc->num_cqueues || 1353 (sc->compl_queues[qid].qbase == NULL)) { 1354 WPRINTF("%s queue index %u / num_cqueues %u", 1355 __func__, qid, sc->num_cqueues); 1356 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1357 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1358 return (1); 1359 } 1360 1361 /* Deleting an Active CQ is an error */ 1362 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1363 if (sc->submit_queues[sqid].cqid == qid) { 1364 pci_nvme_status_tc(&compl->status, 1365 NVME_SCT_COMMAND_SPECIFIC, 1366 NVME_SC_INVALID_QUEUE_DELETION); 1367 return (1); 1368 } 1369 1370 sc->compl_queues[qid].qbase = NULL; 1371 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1372 return (1); 1373 } 1374 1375 static int 1376 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1377 struct nvme_completion* compl) 1378 { 1379 struct nvme_completion_queue *ncq; 1380 uint16_t qid = command->cdw10 & 0xffff; 1381 1382 /* Only support Physically Contiguous queues */ 1383 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1384 WPRINTF("%s unsupported non-contig (list-based) " 1385 "create i/o completion queue", 1386 __func__); 1387 1388 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1389 return (1); 1390 } 1391 1392 if ((qid == 0) || (qid > sc->num_cqueues) || 1393 (sc->compl_queues[qid].qbase != NULL)) { 1394 WPRINTF("%s queue index %u > num_cqueues %u", 1395 __func__, qid, sc->num_cqueues); 1396 pci_nvme_status_tc(&compl->status, 1397 NVME_SCT_COMMAND_SPECIFIC, 1398 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1399 return (1); 1400 } 1401 1402 ncq = &sc->compl_queues[qid]; 1403 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1404 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1405 if (ncq->intr_vec > (sc->max_queues + 1)) { 1406 pci_nvme_status_tc(&compl->status, 1407 NVME_SCT_COMMAND_SPECIFIC, 1408 NVME_SC_INVALID_INTERRUPT_VECTOR); 1409 return (1); 1410 } 1411 1412 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1413 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1414 /* 1415 * Queues must specify at least two entries 1416 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1417 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1418 */ 1419 pci_nvme_status_tc(&compl->status, 1420 NVME_SCT_COMMAND_SPECIFIC, 1421 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1422 return (1); 1423 } 1424 ncq->head = ncq->tail = 0; 1425 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1426 command->prp1, 1427 sizeof(struct nvme_command) * (size_t)ncq->size); 1428 1429 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1430 1431 1432 return (1); 1433 } 1434 1435 static int 1436 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1437 struct nvme_completion* compl) 1438 { 1439 uint64_t logoff; 1440 uint32_t logsize; 1441 uint8_t logpage; 1442 1443 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1444 1445 /* 1446 * Command specifies the number of dwords to return in fields NUMDU 1447 * and NUMDL. This is a zero-based value. 1448 */ 1449 logpage = command->cdw10 & 0xFF; 1450 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1451 logsize *= sizeof(uint32_t); 1452 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1453 1454 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1455 1456 switch (logpage) { 1457 case NVME_LOG_ERROR: 1458 if (logoff >= sizeof(sc->err_log)) { 1459 pci_nvme_status_genc(&compl->status, 1460 NVME_SC_INVALID_FIELD); 1461 break; 1462 } 1463 1464 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1465 command->prp2, (uint8_t *)&sc->err_log + logoff, 1466 MIN(logsize - logoff, sizeof(sc->err_log)), 1467 NVME_COPY_TO_PRP); 1468 break; 1469 case NVME_LOG_HEALTH_INFORMATION: 1470 if (logoff >= sizeof(sc->health_log)) { 1471 pci_nvme_status_genc(&compl->status, 1472 NVME_SC_INVALID_FIELD); 1473 break; 1474 } 1475 1476 pthread_mutex_lock(&sc->mtx); 1477 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1478 sizeof(sc->health_log.data_units_read)); 1479 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1480 sizeof(sc->health_log.data_units_written)); 1481 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1482 sizeof(sc->health_log.host_read_commands)); 1483 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1484 sizeof(sc->health_log.host_write_commands)); 1485 pthread_mutex_unlock(&sc->mtx); 1486 1487 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1488 command->prp2, (uint8_t *)&sc->health_log + logoff, 1489 MIN(logsize - logoff, sizeof(sc->health_log)), 1490 NVME_COPY_TO_PRP); 1491 break; 1492 case NVME_LOG_FIRMWARE_SLOT: 1493 if (logoff >= sizeof(sc->fw_log)) { 1494 pci_nvme_status_genc(&compl->status, 1495 NVME_SC_INVALID_FIELD); 1496 break; 1497 } 1498 1499 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1500 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1501 MIN(logsize - logoff, sizeof(sc->fw_log)), 1502 NVME_COPY_TO_PRP); 1503 break; 1504 case NVME_LOG_CHANGED_NAMESPACE: 1505 if (logoff >= sizeof(sc->ns_log)) { 1506 pci_nvme_status_genc(&compl->status, 1507 NVME_SC_INVALID_FIELD); 1508 break; 1509 } 1510 1511 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1512 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1513 MIN(logsize - logoff, sizeof(sc->ns_log)), 1514 NVME_COPY_TO_PRP); 1515 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1516 break; 1517 default: 1518 DPRINTF("%s get log page %x command not supported", 1519 __func__, logpage); 1520 1521 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1522 NVME_SC_INVALID_LOG_PAGE); 1523 } 1524 1525 return (1); 1526 } 1527 1528 static int 1529 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1530 struct nvme_completion* compl) 1531 { 1532 void *dest; 1533 uint16_t status; 1534 1535 #ifndef __FreeBSD__ 1536 status = 0; 1537 #endif 1538 1539 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1540 command->cdw10 & 0xFF, command->nsid); 1541 1542 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1543 1544 switch (command->cdw10 & 0xFF) { 1545 case 0x00: /* return Identify Namespace data structure */ 1546 /* Global NS only valid with NS Management */ 1547 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1548 pci_nvme_status_genc(&status, 1549 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1550 break; 1551 } 1552 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1553 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1554 NVME_COPY_TO_PRP); 1555 break; 1556 case 0x01: /* return Identify Controller data structure */ 1557 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1558 command->prp2, (uint8_t *)&sc->ctrldata, 1559 sizeof(sc->ctrldata), 1560 NVME_COPY_TO_PRP); 1561 break; 1562 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1563 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1564 sizeof(uint32_t) * 1024); 1565 /* All unused entries shall be zero */ 1566 memset(dest, 0, sizeof(uint32_t) * 1024); 1567 ((uint32_t *)dest)[0] = 1; 1568 break; 1569 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1570 if (command->nsid != 1) { 1571 pci_nvme_status_genc(&status, 1572 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1573 break; 1574 } 1575 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1576 sizeof(uint32_t) * 1024); 1577 /* All bytes after the descriptor shall be zero */ 1578 memset(dest, 0, sizeof(uint32_t) * 1024); 1579 1580 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1581 ((uint8_t *)dest)[0] = 1; 1582 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1583 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); 1584 break; 1585 case 0x13: 1586 /* 1587 * Controller list is optional but used by UNH tests. Return 1588 * a valid but empty list. 1589 */ 1590 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1591 sizeof(uint16_t) * 2048); 1592 memset(dest, 0, sizeof(uint16_t) * 2048); 1593 break; 1594 default: 1595 DPRINTF("%s unsupported identify command requested 0x%x", 1596 __func__, command->cdw10 & 0xFF); 1597 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1598 break; 1599 } 1600 1601 compl->status = status; 1602 return (1); 1603 } 1604 1605 static const char * 1606 nvme_fid_to_name(uint8_t fid) 1607 { 1608 const char *name; 1609 1610 switch (fid) { 1611 case NVME_FEAT_ARBITRATION: 1612 name = "Arbitration"; 1613 break; 1614 case NVME_FEAT_POWER_MANAGEMENT: 1615 name = "Power Management"; 1616 break; 1617 case NVME_FEAT_LBA_RANGE_TYPE: 1618 name = "LBA Range Type"; 1619 break; 1620 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1621 name = "Temperature Threshold"; 1622 break; 1623 case NVME_FEAT_ERROR_RECOVERY: 1624 name = "Error Recovery"; 1625 break; 1626 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1627 name = "Volatile Write Cache"; 1628 break; 1629 case NVME_FEAT_NUMBER_OF_QUEUES: 1630 name = "Number of Queues"; 1631 break; 1632 case NVME_FEAT_INTERRUPT_COALESCING: 1633 name = "Interrupt Coalescing"; 1634 break; 1635 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1636 name = "Interrupt Vector Configuration"; 1637 break; 1638 case NVME_FEAT_WRITE_ATOMICITY: 1639 name = "Write Atomicity Normal"; 1640 break; 1641 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1642 name = "Asynchronous Event Configuration"; 1643 break; 1644 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1645 name = "Autonomous Power State Transition"; 1646 break; 1647 case NVME_FEAT_HOST_MEMORY_BUFFER: 1648 name = "Host Memory Buffer"; 1649 break; 1650 case NVME_FEAT_TIMESTAMP: 1651 name = "Timestamp"; 1652 break; 1653 case NVME_FEAT_KEEP_ALIVE_TIMER: 1654 name = "Keep Alive Timer"; 1655 break; 1656 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1657 name = "Host Controlled Thermal Management"; 1658 break; 1659 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1660 name = "Non-Operation Power State Config"; 1661 break; 1662 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1663 name = "Read Recovery Level Config"; 1664 break; 1665 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1666 name = "Predictable Latency Mode Config"; 1667 break; 1668 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1669 name = "Predictable Latency Mode Window"; 1670 break; 1671 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1672 name = "LBA Status Information Report Interval"; 1673 break; 1674 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1675 name = "Host Behavior Support"; 1676 break; 1677 case NVME_FEAT_SANITIZE_CONFIG: 1678 name = "Sanitize Config"; 1679 break; 1680 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1681 name = "Endurance Group Event Configuration"; 1682 break; 1683 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1684 name = "Software Progress Marker"; 1685 break; 1686 case NVME_FEAT_HOST_IDENTIFIER: 1687 name = "Host Identifier"; 1688 break; 1689 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1690 name = "Reservation Notification Mask"; 1691 break; 1692 case NVME_FEAT_RESERVATION_PERSISTENCE: 1693 name = "Reservation Persistence"; 1694 break; 1695 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1696 name = "Namespace Write Protection Config"; 1697 break; 1698 default: 1699 name = "Unknown"; 1700 break; 1701 } 1702 1703 return (name); 1704 } 1705 1706 static void 1707 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1708 struct nvme_feature_obj *feat __unused, 1709 struct nvme_command *command __unused, 1710 struct nvme_completion *compl) 1711 { 1712 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1713 } 1714 1715 static void 1716 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1717 struct nvme_feature_obj *feat __unused, 1718 struct nvme_command *command, 1719 struct nvme_completion *compl) 1720 { 1721 uint32_t i; 1722 uint32_t cdw11 = command->cdw11; 1723 uint16_t iv; 1724 bool cd; 1725 1726 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1727 1728 iv = cdw11 & 0xffff; 1729 cd = cdw11 & (1 << 16); 1730 1731 if (iv > (sc->max_queues + 1)) { 1732 return; 1733 } 1734 1735 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1736 if ((iv == 0) && !cd) 1737 return; 1738 1739 /* Requested Interrupt Vector must be used by a CQ */ 1740 for (i = 0; i < sc->num_cqueues + 1; i++) { 1741 if (sc->compl_queues[i].intr_vec == iv) { 1742 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1743 } 1744 } 1745 } 1746 1747 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1748 static void 1749 nvme_feature_async_event(struct pci_nvme_softc *sc __unused, 1750 struct nvme_feature_obj *feat __unused, 1751 struct nvme_command *command, 1752 struct nvme_completion *compl) 1753 { 1754 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1755 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1756 } 1757 1758 #define NVME_TEMP_THRESH_OVER 0 1759 #define NVME_TEMP_THRESH_UNDER 1 1760 static void 1761 nvme_feature_temperature(struct pci_nvme_softc *sc, 1762 struct nvme_feature_obj *feat __unused, 1763 struct nvme_command *command, 1764 struct nvme_completion *compl) 1765 { 1766 uint16_t tmpth; /* Temperature Threshold */ 1767 uint8_t tmpsel; /* Threshold Temperature Select */ 1768 uint8_t thsel; /* Threshold Type Select */ 1769 bool set_crit = false; 1770 1771 tmpth = command->cdw11 & 0xffff; 1772 tmpsel = (command->cdw11 >> 16) & 0xf; 1773 thsel = (command->cdw11 >> 20) & 0x3; 1774 1775 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1776 1777 /* Check for unsupported values */ 1778 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1779 (thsel > NVME_TEMP_THRESH_UNDER)) { 1780 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1781 return; 1782 } 1783 1784 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1785 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1786 set_crit = true; 1787 1788 pthread_mutex_lock(&sc->mtx); 1789 if (set_crit) 1790 sc->health_log.critical_warning |= 1791 NVME_CRIT_WARN_ST_TEMPERATURE; 1792 else 1793 sc->health_log.critical_warning &= 1794 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1795 pthread_mutex_unlock(&sc->mtx); 1796 1797 if (set_crit) 1798 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1799 sc->health_log.critical_warning); 1800 1801 1802 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); 1803 } 1804 1805 static void 1806 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1807 struct nvme_feature_obj *feat __unused, 1808 struct nvme_command *command, 1809 struct nvme_completion *compl) 1810 { 1811 uint16_t nqr; /* Number of Queues Requested */ 1812 1813 if (sc->num_q_is_set) { 1814 WPRINTF("%s: Number of Queues already set", __func__); 1815 pci_nvme_status_genc(&compl->status, 1816 NVME_SC_COMMAND_SEQUENCE_ERROR); 1817 return; 1818 } 1819 1820 nqr = command->cdw11 & 0xFFFF; 1821 if (nqr == 0xffff) { 1822 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1823 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1824 return; 1825 } 1826 1827 sc->num_squeues = ONE_BASED(nqr); 1828 if (sc->num_squeues > sc->max_queues) { 1829 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1830 sc->max_queues); 1831 sc->num_squeues = sc->max_queues; 1832 } 1833 1834 nqr = (command->cdw11 >> 16) & 0xFFFF; 1835 if (nqr == 0xffff) { 1836 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1837 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1838 return; 1839 } 1840 1841 sc->num_cqueues = ONE_BASED(nqr); 1842 if (sc->num_cqueues > sc->max_queues) { 1843 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1844 sc->max_queues); 1845 sc->num_cqueues = sc->max_queues; 1846 } 1847 1848 /* Patch the command value which will be saved on callback's return */ 1849 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1850 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1851 1852 sc->num_q_is_set = true; 1853 } 1854 1855 static int 1856 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1857 struct nvme_completion *compl) 1858 { 1859 struct nvme_feature_obj *feat; 1860 uint32_t nsid = command->nsid; 1861 uint8_t fid = command->cdw10 & 0xFF; 1862 1863 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1864 1865 if (fid >= NVME_FID_MAX) { 1866 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1867 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1868 return (1); 1869 } 1870 feat = &sc->feat[fid]; 1871 1872 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1873 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1874 return (1); 1875 } 1876 1877 if (!feat->namespace_specific && 1878 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1879 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1880 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1881 return (1); 1882 } 1883 1884 compl->cdw0 = 0; 1885 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1886 1887 if (feat->set) 1888 feat->set(sc, feat, command, compl); 1889 1890 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1891 if (compl->status == NVME_SC_SUCCESS) { 1892 feat->cdw11 = command->cdw11; 1893 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1894 (command->cdw11 != 0)) 1895 pci_nvme_aen_notify(sc); 1896 } 1897 1898 return (0); 1899 } 1900 1901 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1902 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1903 1904 static int 1905 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1906 struct nvme_completion* compl) 1907 { 1908 struct nvme_feature_obj *feat; 1909 uint8_t fid = command->cdw10 & 0xFF; 1910 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1911 1912 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1913 1914 if (fid >= NVME_FID_MAX) { 1915 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1916 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1917 return (1); 1918 } 1919 1920 compl->cdw0 = 0; 1921 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1922 1923 feat = &sc->feat[fid]; 1924 if (feat->get) { 1925 feat->get(sc, feat, command, compl); 1926 } 1927 1928 if (compl->status == NVME_SC_SUCCESS) { 1929 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1930 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1931 else 1932 compl->cdw0 = feat->cdw11; 1933 } 1934 1935 return (0); 1936 } 1937 1938 static int 1939 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1940 struct nvme_completion* compl) 1941 { 1942 uint8_t ses, lbaf, pi; 1943 1944 /* Only supports Secure Erase Setting - User Data Erase */ 1945 ses = (command->cdw10 >> 9) & 0x7; 1946 if (ses > 0x1) { 1947 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1948 return (1); 1949 } 1950 1951 /* Only supports a single LBA Format */ 1952 lbaf = command->cdw10 & 0xf; 1953 if (lbaf != 0) { 1954 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1955 NVME_SC_INVALID_FORMAT); 1956 return (1); 1957 } 1958 1959 /* Doesn't support Protection Infomation */ 1960 pi = (command->cdw10 >> 5) & 0x7; 1961 if (pi != 0) { 1962 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1963 return (1); 1964 } 1965 1966 if (sc->nvstore.type == NVME_STOR_RAM) { 1967 if (sc->nvstore.ctx) 1968 free(sc->nvstore.ctx); 1969 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1970 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1971 } else { 1972 struct pci_nvme_ioreq *req; 1973 int err; 1974 1975 req = pci_nvme_get_ioreq(sc); 1976 if (req == NULL) { 1977 pci_nvme_status_genc(&compl->status, 1978 NVME_SC_INTERNAL_DEVICE_ERROR); 1979 WPRINTF("%s: unable to allocate IO req", __func__); 1980 return (1); 1981 } 1982 req->nvme_sq = &sc->submit_queues[0]; 1983 req->sqid = 0; 1984 req->opc = command->opc; 1985 req->cid = command->cid; 1986 req->nsid = command->nsid; 1987 1988 req->io_req.br_offset = 0; 1989 req->io_req.br_resid = sc->nvstore.size; 1990 req->io_req.br_callback = pci_nvme_io_done; 1991 1992 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1993 if (err) { 1994 pci_nvme_status_genc(&compl->status, 1995 NVME_SC_INTERNAL_DEVICE_ERROR); 1996 pci_nvme_release_ioreq(sc, req); 1997 } else 1998 compl->status = NVME_NO_STATUS; 1999 } 2000 2001 return (1); 2002 } 2003 2004 static int 2005 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 2006 struct nvme_completion *compl) 2007 { 2008 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 2009 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 2010 2011 /* TODO: search for the command ID and abort it */ 2012 2013 compl->cdw0 = 1; 2014 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 2015 return (1); 2016 } 2017 2018 static int 2019 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2020 struct nvme_command* command, struct nvme_completion* compl) 2021 { 2022 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2023 sc->aer_count, sc->ctrldata.aerl, command->cid); 2024 2025 /* Don't exceed the Async Event Request Limit (AERL). */ 2026 if (pci_nvme_aer_limit_reached(sc)) { 2027 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2028 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2029 return (1); 2030 } 2031 2032 if (pci_nvme_aer_add(sc, command->cid)) { 2033 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2034 NVME_SC_INTERNAL_DEVICE_ERROR); 2035 return (1); 2036 } 2037 2038 /* 2039 * Raise events when they happen based on the Set Features cmd. 2040 * These events happen async, so only set completion successful if 2041 * there is an event reflective of the request to get event. 2042 */ 2043 compl->status = NVME_NO_STATUS; 2044 pci_nvme_aen_notify(sc); 2045 2046 return (0); 2047 } 2048 2049 static void 2050 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2051 { 2052 struct nvme_completion compl; 2053 struct nvme_command *cmd; 2054 struct nvme_submission_queue *sq; 2055 struct nvme_completion_queue *cq; 2056 uint16_t sqhead; 2057 2058 DPRINTF("%s index %u", __func__, (uint32_t)value); 2059 2060 sq = &sc->submit_queues[0]; 2061 cq = &sc->compl_queues[0]; 2062 2063 pthread_mutex_lock(&sq->mtx); 2064 2065 sqhead = sq->head; 2066 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2067 2068 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2069 cmd = &(sq->qbase)[sqhead]; 2070 compl.cdw0 = 0; 2071 compl.status = 0; 2072 2073 switch (cmd->opc) { 2074 case NVME_OPC_DELETE_IO_SQ: 2075 DPRINTF("%s command DELETE_IO_SQ", __func__); 2076 nvme_opc_delete_io_sq(sc, cmd, &compl); 2077 break; 2078 case NVME_OPC_CREATE_IO_SQ: 2079 DPRINTF("%s command CREATE_IO_SQ", __func__); 2080 nvme_opc_create_io_sq(sc, cmd, &compl); 2081 break; 2082 case NVME_OPC_DELETE_IO_CQ: 2083 DPRINTF("%s command DELETE_IO_CQ", __func__); 2084 nvme_opc_delete_io_cq(sc, cmd, &compl); 2085 break; 2086 case NVME_OPC_CREATE_IO_CQ: 2087 DPRINTF("%s command CREATE_IO_CQ", __func__); 2088 nvme_opc_create_io_cq(sc, cmd, &compl); 2089 break; 2090 case NVME_OPC_GET_LOG_PAGE: 2091 DPRINTF("%s command GET_LOG_PAGE", __func__); 2092 nvme_opc_get_log_page(sc, cmd, &compl); 2093 break; 2094 case NVME_OPC_IDENTIFY: 2095 DPRINTF("%s command IDENTIFY", __func__); 2096 nvme_opc_identify(sc, cmd, &compl); 2097 break; 2098 case NVME_OPC_ABORT: 2099 DPRINTF("%s command ABORT", __func__); 2100 nvme_opc_abort(sc, cmd, &compl); 2101 break; 2102 case NVME_OPC_SET_FEATURES: 2103 DPRINTF("%s command SET_FEATURES", __func__); 2104 nvme_opc_set_features(sc, cmd, &compl); 2105 break; 2106 case NVME_OPC_GET_FEATURES: 2107 DPRINTF("%s command GET_FEATURES", __func__); 2108 nvme_opc_get_features(sc, cmd, &compl); 2109 break; 2110 case NVME_OPC_FIRMWARE_ACTIVATE: 2111 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2112 pci_nvme_status_tc(&compl.status, 2113 NVME_SCT_COMMAND_SPECIFIC, 2114 NVME_SC_INVALID_FIRMWARE_SLOT); 2115 break; 2116 case NVME_OPC_ASYNC_EVENT_REQUEST: 2117 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2118 nvme_opc_async_event_req(sc, cmd, &compl); 2119 break; 2120 case NVME_OPC_FORMAT_NVM: 2121 DPRINTF("%s command FORMAT_NVM", __func__); 2122 if ((sc->ctrldata.oacs & 2123 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2124 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2125 break; 2126 } 2127 nvme_opc_format_nvm(sc, cmd, &compl); 2128 break; 2129 case NVME_OPC_SECURITY_SEND: 2130 case NVME_OPC_SECURITY_RECEIVE: 2131 case NVME_OPC_SANITIZE: 2132 case NVME_OPC_GET_LBA_STATUS: 2133 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2134 cmd->opc); 2135 /* Valid but unsupported opcodes */ 2136 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2137 break; 2138 default: 2139 DPRINTF("%s command OPC=%#X (not implemented)", 2140 __func__, 2141 cmd->opc); 2142 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2143 } 2144 sqhead = (sqhead + 1) % sq->size; 2145 2146 if (NVME_COMPLETION_VALID(compl)) { 2147 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2148 compl.cdw0, 2149 cmd->cid, 2150 0, /* SQID */ 2151 compl.status); 2152 } 2153 } 2154 2155 DPRINTF("setting sqhead %u", sqhead); 2156 sq->head = sqhead; 2157 2158 if (cq->head != cq->tail) 2159 pci_generate_msix(sc->nsc_pi, 0); 2160 2161 pthread_mutex_unlock(&sq->mtx); 2162 } 2163 2164 /* 2165 * Update the Write and Read statistics reported in SMART data 2166 * 2167 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2168 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2169 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 2170 */ 2171 static void 2172 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2173 size_t bytes, uint16_t status) 2174 { 2175 2176 pthread_mutex_lock(&sc->mtx); 2177 switch (opc) { 2178 case NVME_OPC_WRITE: 2179 sc->write_commands++; 2180 if (status != NVME_SC_SUCCESS) 2181 break; 2182 sc->write_dunits_remainder += (bytes / 512); 2183 while (sc->write_dunits_remainder >= 1000) { 2184 sc->write_data_units++; 2185 sc->write_dunits_remainder -= 1000; 2186 } 2187 break; 2188 case NVME_OPC_READ: 2189 sc->read_commands++; 2190 if (status != NVME_SC_SUCCESS) 2191 break; 2192 sc->read_dunits_remainder += (bytes / 512); 2193 while (sc->read_dunits_remainder >= 1000) { 2194 sc->read_data_units++; 2195 sc->read_dunits_remainder -= 1000; 2196 } 2197 break; 2198 default: 2199 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2200 break; 2201 } 2202 pthread_mutex_unlock(&sc->mtx); 2203 } 2204 2205 /* 2206 * Check if the combination of Starting LBA (slba) and number of blocks 2207 * exceeds the range of the underlying storage. 2208 * 2209 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2210 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2211 * overflow. 2212 */ 2213 static bool 2214 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2215 uint32_t nblocks) 2216 { 2217 size_t offset, bytes; 2218 2219 /* Overflow check of multiplying Starting LBA by the sector size */ 2220 if (slba >> (64 - nvstore->sectsz_bits)) 2221 return (true); 2222 2223 offset = slba << nvstore->sectsz_bits; 2224 bytes = nblocks << nvstore->sectsz_bits; 2225 2226 /* Overflow check of Number of Logical Blocks */ 2227 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2228 return (true); 2229 2230 return (false); 2231 } 2232 2233 static int 2234 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2235 uint64_t gpaddr, size_t size, int do_write, uint64_t offset) 2236 { 2237 int iovidx; 2238 bool range_is_contiguous; 2239 2240 if (req == NULL) 2241 return (-1); 2242 2243 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2244 return (-1); 2245 } 2246 2247 /* 2248 * Minimize the number of IOVs by concatenating contiguous address 2249 * ranges. If the IOV count is zero, there is no previous range to 2250 * concatenate. 2251 */ 2252 if (req->io_req.br_iovcnt == 0) 2253 range_is_contiguous = false; 2254 else 2255 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2256 2257 if (range_is_contiguous) { 2258 iovidx = req->io_req.br_iovcnt - 1; 2259 2260 req->io_req.br_iov[iovidx].iov_base = 2261 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2262 req->prev_gpaddr, size); 2263 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2264 return (-1); 2265 2266 req->prev_size += size; 2267 req->io_req.br_resid += size; 2268 2269 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2270 } else { 2271 iovidx = req->io_req.br_iovcnt; 2272 if (iovidx == 0) { 2273 req->io_req.br_offset = offset; 2274 req->io_req.br_resid = 0; 2275 req->io_req.br_param = req; 2276 } 2277 2278 req->io_req.br_iov[iovidx].iov_base = 2279 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2280 gpaddr, size); 2281 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2282 return (-1); 2283 2284 req->io_req.br_iov[iovidx].iov_len = size; 2285 2286 req->prev_gpaddr = gpaddr; 2287 req->prev_size = size; 2288 req->io_req.br_resid += size; 2289 2290 req->io_req.br_iovcnt++; 2291 } 2292 2293 return (0); 2294 } 2295 2296 static void 2297 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2298 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2299 { 2300 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2301 2302 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2303 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2304 NVME_STATUS_GET_SC(status)); 2305 2306 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2307 2308 if (cq->head != cq->tail) { 2309 if (cq->intr_en & NVME_CQ_INTEN) { 2310 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2311 } else { 2312 DPRINTF("%s: CQ%u interrupt disabled", 2313 __func__, sq->cqid); 2314 } 2315 } 2316 } 2317 2318 static void 2319 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2320 { 2321 req->sc = NULL; 2322 req->nvme_sq = NULL; 2323 req->sqid = 0; 2324 2325 pthread_mutex_lock(&sc->mtx); 2326 2327 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2328 sc->pending_ios--; 2329 2330 /* when no more IO pending, can set to ready if device reset/enabled */ 2331 if (sc->pending_ios == 0 && 2332 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2333 sc->regs.csts |= NVME_CSTS_RDY; 2334 2335 pthread_mutex_unlock(&sc->mtx); 2336 2337 sem_post(&sc->iosemlock); 2338 } 2339 2340 static struct pci_nvme_ioreq * 2341 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2342 { 2343 struct pci_nvme_ioreq *req = NULL; 2344 2345 sem_wait(&sc->iosemlock); 2346 pthread_mutex_lock(&sc->mtx); 2347 2348 req = STAILQ_FIRST(&sc->ioreqs_free); 2349 assert(req != NULL); 2350 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2351 2352 req->sc = sc; 2353 2354 sc->pending_ios++; 2355 2356 pthread_mutex_unlock(&sc->mtx); 2357 2358 req->io_req.br_iovcnt = 0; 2359 req->io_req.br_offset = 0; 2360 req->io_req.br_resid = 0; 2361 req->io_req.br_param = req; 2362 req->prev_gpaddr = 0; 2363 req->prev_size = 0; 2364 2365 return req; 2366 } 2367 2368 static void 2369 pci_nvme_io_done(struct blockif_req *br, int err) 2370 { 2371 struct pci_nvme_ioreq *req = br->br_param; 2372 struct nvme_submission_queue *sq = req->nvme_sq; 2373 uint16_t code, status; 2374 2375 #ifndef __FreeBSD__ 2376 status = 0; 2377 #endif 2378 2379 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2380 2381 /* TODO return correct error */ 2382 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2383 pci_nvme_status_genc(&status, code); 2384 2385 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2386 pci_nvme_stats_write_read_update(req->sc, req->opc, 2387 req->bytes, status); 2388 pci_nvme_release_ioreq(req->sc, req); 2389 } 2390 2391 /* 2392 * Implements the Flush command. The specification states: 2393 * If a volatile write cache is not present, Flush commands complete 2394 * successfully and have no effect 2395 * in the description of the Volatile Write Cache (VWC) field of the Identify 2396 * Controller data. Therefore, set status to Success if the command is 2397 * not supported (i.e. RAM or as indicated by the blockif). 2398 */ 2399 static bool 2400 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2401 struct nvme_command *cmd __unused, 2402 struct pci_nvme_blockstore *nvstore, 2403 struct pci_nvme_ioreq *req, 2404 uint16_t *status) 2405 { 2406 bool pending = false; 2407 2408 if (nvstore->type == NVME_STOR_RAM) { 2409 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2410 } else { 2411 int err; 2412 2413 req->io_req.br_callback = pci_nvme_io_done; 2414 2415 err = blockif_flush(nvstore->ctx, &req->io_req); 2416 switch (err) { 2417 case 0: 2418 pending = true; 2419 break; 2420 case EOPNOTSUPP: 2421 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2422 break; 2423 default: 2424 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2425 } 2426 } 2427 2428 return (pending); 2429 } 2430 2431 static uint16_t 2432 nvme_write_read_ram(struct pci_nvme_softc *sc, 2433 struct pci_nvme_blockstore *nvstore, 2434 uint64_t prp1, uint64_t prp2, 2435 size_t offset, uint64_t bytes, 2436 bool is_write) 2437 { 2438 uint8_t *buf = nvstore->ctx; 2439 enum nvme_copy_dir dir; 2440 uint16_t status; 2441 2442 #ifndef __FreeBSD__ 2443 status = 0; 2444 #endif 2445 2446 if (is_write) 2447 dir = NVME_COPY_TO_PRP; 2448 else 2449 dir = NVME_COPY_FROM_PRP; 2450 2451 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2452 buf + offset, bytes, dir)) 2453 pci_nvme_status_genc(&status, 2454 NVME_SC_DATA_TRANSFER_ERROR); 2455 else 2456 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2457 2458 return (status); 2459 } 2460 2461 static uint16_t 2462 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2463 struct pci_nvme_blockstore *nvstore, 2464 struct pci_nvme_ioreq *req, 2465 uint64_t prp1, uint64_t prp2, 2466 size_t offset, uint64_t bytes, 2467 bool is_write) 2468 { 2469 uint64_t size; 2470 int err; 2471 uint16_t status = NVME_NO_STATUS; 2472 2473 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2474 if (pci_nvme_append_iov_req(sc, req, prp1, 2475 size, is_write, offset)) { 2476 err = -1; 2477 goto out; 2478 } 2479 2480 offset += size; 2481 bytes -= size; 2482 2483 if (bytes == 0) { 2484 ; 2485 } else if (bytes <= PAGE_SIZE) { 2486 size = bytes; 2487 if (pci_nvme_append_iov_req(sc, req, prp2, 2488 size, is_write, offset)) { 2489 err = -1; 2490 goto out; 2491 } 2492 } else { 2493 void *vmctx = sc->nsc_pi->pi_vmctx; 2494 uint64_t *prp_list = &prp2; 2495 uint64_t *last = prp_list; 2496 2497 /* PRP2 is pointer to a physical region page list */ 2498 while (bytes) { 2499 /* Last entry in list points to the next list */ 2500 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2501 uint64_t prp = *prp_list; 2502 2503 prp_list = paddr_guest2host(vmctx, prp, 2504 PAGE_SIZE - (prp % PAGE_SIZE)); 2505 if (prp_list == NULL) { 2506 err = -1; 2507 goto out; 2508 } 2509 last = prp_list + (NVME_PRP2_ITEMS - 1); 2510 } 2511 2512 size = MIN(bytes, PAGE_SIZE); 2513 2514 if (pci_nvme_append_iov_req(sc, req, *prp_list, 2515 size, is_write, offset)) { 2516 err = -1; 2517 goto out; 2518 } 2519 2520 offset += size; 2521 bytes -= size; 2522 2523 prp_list++; 2524 } 2525 } 2526 req->io_req.br_callback = pci_nvme_io_done; 2527 if (is_write) 2528 err = blockif_write(nvstore->ctx, &req->io_req); 2529 else 2530 err = blockif_read(nvstore->ctx, &req->io_req); 2531 out: 2532 if (err) 2533 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2534 2535 return (status); 2536 } 2537 2538 static bool 2539 nvme_opc_write_read(struct pci_nvme_softc *sc, 2540 struct nvme_command *cmd, 2541 struct pci_nvme_blockstore *nvstore, 2542 struct pci_nvme_ioreq *req, 2543 uint16_t *status) 2544 { 2545 uint64_t lba, nblocks, bytes; 2546 size_t offset; 2547 bool is_write = cmd->opc == NVME_OPC_WRITE; 2548 bool pending = false; 2549 2550 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2551 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2552 bytes = nblocks << nvstore->sectsz_bits; 2553 if (bytes > NVME_MAX_DATA_SIZE) { 2554 WPRINTF("%s command would exceed MDTS", __func__); 2555 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2556 goto out; 2557 } 2558 2559 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2560 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2561 __func__, lba, nblocks); 2562 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2563 goto out; 2564 } 2565 2566 offset = lba << nvstore->sectsz_bits; 2567 2568 req->bytes = bytes; 2569 req->io_req.br_offset = lba; 2570 2571 /* PRP bits 1:0 must be zero */ 2572 cmd->prp1 &= ~0x3UL; 2573 cmd->prp2 &= ~0x3UL; 2574 2575 if (nvstore->type == NVME_STOR_RAM) { 2576 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2577 cmd->prp2, offset, bytes, is_write); 2578 } else { 2579 *status = nvme_write_read_blockif(sc, nvstore, req, 2580 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2581 2582 if (*status == NVME_NO_STATUS) 2583 pending = true; 2584 } 2585 out: 2586 if (!pending) 2587 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2588 2589 return (pending); 2590 } 2591 2592 static void 2593 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2594 { 2595 struct pci_nvme_ioreq *req = br->br_param; 2596 struct pci_nvme_softc *sc = req->sc; 2597 bool done = true; 2598 uint16_t status; 2599 2600 #ifndef __FreeBSD__ 2601 status = 0; 2602 #endif 2603 2604 if (err) { 2605 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2606 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2607 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2608 } else { 2609 struct iovec *iov = req->io_req.br_iov; 2610 2611 req->prev_gpaddr++; 2612 iov += req->prev_gpaddr; 2613 2614 /* The iov_* values already include the sector size */ 2615 req->io_req.br_offset = (off_t)iov->iov_base; 2616 req->io_req.br_resid = iov->iov_len; 2617 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2618 pci_nvme_status_genc(&status, 2619 NVME_SC_INTERNAL_DEVICE_ERROR); 2620 } else 2621 done = false; 2622 } 2623 2624 if (done) { 2625 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2626 status); 2627 pci_nvme_release_ioreq(sc, req); 2628 } 2629 } 2630 2631 static bool 2632 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2633 struct nvme_command *cmd, 2634 struct pci_nvme_blockstore *nvstore, 2635 struct pci_nvme_ioreq *req, 2636 uint16_t *status) 2637 { 2638 struct nvme_dsm_range *range = NULL; 2639 uint32_t nr, r, non_zero, dr; 2640 int err; 2641 bool pending = false; 2642 2643 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2644 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2645 goto out; 2646 } 2647 2648 nr = cmd->cdw10 & 0xff; 2649 2650 /* copy locally because a range entry could straddle PRPs */ 2651 range = calloc(1, NVME_MAX_DSM_TRIM); 2652 if (range == NULL) { 2653 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2654 goto out; 2655 } 2656 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2657 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2658 2659 /* Check for invalid ranges and the number of non-zero lengths */ 2660 non_zero = 0; 2661 for (r = 0; r <= nr; r++) { 2662 if (pci_nvme_out_of_range(nvstore, 2663 range[r].starting_lba, range[r].length)) { 2664 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2665 goto out; 2666 } 2667 if (range[r].length != 0) 2668 non_zero++; 2669 } 2670 2671 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2672 size_t offset, bytes; 2673 int sectsz_bits = sc->nvstore.sectsz_bits; 2674 2675 /* 2676 * DSM calls are advisory only, and compliant controllers 2677 * may choose to take no actions (i.e. return Success). 2678 */ 2679 if (!nvstore->deallocate) { 2680 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2681 goto out; 2682 } 2683 2684 /* If all ranges have a zero length, return Success */ 2685 if (non_zero == 0) { 2686 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2687 goto out; 2688 } 2689 2690 if (req == NULL) { 2691 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2692 goto out; 2693 } 2694 2695 offset = range[0].starting_lba << sectsz_bits; 2696 bytes = range[0].length << sectsz_bits; 2697 2698 /* 2699 * If the request is for more than a single range, store 2700 * the ranges in the br_iov. Optimize for the common case 2701 * of a single range. 2702 * 2703 * Note that NVMe Number of Ranges is a zero based value 2704 */ 2705 req->io_req.br_iovcnt = 0; 2706 req->io_req.br_offset = offset; 2707 req->io_req.br_resid = bytes; 2708 2709 if (nr == 0) { 2710 req->io_req.br_callback = pci_nvme_io_done; 2711 } else { 2712 struct iovec *iov = req->io_req.br_iov; 2713 2714 for (r = 0, dr = 0; r <= nr; r++) { 2715 offset = range[r].starting_lba << sectsz_bits; 2716 bytes = range[r].length << sectsz_bits; 2717 if (bytes == 0) 2718 continue; 2719 2720 if ((nvstore->size - offset) < bytes) { 2721 pci_nvme_status_genc(status, 2722 NVME_SC_LBA_OUT_OF_RANGE); 2723 goto out; 2724 } 2725 iov[dr].iov_base = (void *)offset; 2726 iov[dr].iov_len = bytes; 2727 dr++; 2728 } 2729 req->io_req.br_callback = pci_nvme_dealloc_sm; 2730 2731 /* 2732 * Use prev_gpaddr to track the current entry and 2733 * prev_size to track the number of entries 2734 */ 2735 req->prev_gpaddr = 0; 2736 req->prev_size = dr; 2737 } 2738 2739 err = blockif_delete(nvstore->ctx, &req->io_req); 2740 if (err) 2741 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2742 else 2743 pending = true; 2744 } 2745 out: 2746 free(range); 2747 return (pending); 2748 } 2749 2750 static void 2751 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2752 { 2753 struct nvme_submission_queue *sq; 2754 uint16_t status; 2755 uint16_t sqhead; 2756 2757 #ifndef __FreeBSD__ 2758 status = 0; 2759 #endif 2760 2761 /* handle all submissions up to sq->tail index */ 2762 sq = &sc->submit_queues[idx]; 2763 2764 pthread_mutex_lock(&sq->mtx); 2765 2766 sqhead = sq->head; 2767 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2768 idx, sqhead, sq->tail, sq->qbase); 2769 2770 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2771 struct nvme_command *cmd; 2772 struct pci_nvme_ioreq *req; 2773 uint32_t nsid; 2774 bool pending; 2775 2776 pending = false; 2777 req = NULL; 2778 status = 0; 2779 2780 cmd = &sq->qbase[sqhead]; 2781 sqhead = (sqhead + 1) % sq->size; 2782 2783 nsid = le32toh(cmd->nsid); 2784 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2785 pci_nvme_status_genc(&status, 2786 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2787 status |= 2788 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2789 goto complete; 2790 } 2791 2792 req = pci_nvme_get_ioreq(sc); 2793 if (req == NULL) { 2794 pci_nvme_status_genc(&status, 2795 NVME_SC_INTERNAL_DEVICE_ERROR); 2796 WPRINTF("%s: unable to allocate IO req", __func__); 2797 goto complete; 2798 } 2799 req->nvme_sq = sq; 2800 req->sqid = idx; 2801 req->opc = cmd->opc; 2802 req->cid = cmd->cid; 2803 req->nsid = cmd->nsid; 2804 2805 switch (cmd->opc) { 2806 case NVME_OPC_FLUSH: 2807 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2808 req, &status); 2809 break; 2810 case NVME_OPC_WRITE: 2811 case NVME_OPC_READ: 2812 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2813 req, &status); 2814 break; 2815 case NVME_OPC_WRITE_ZEROES: 2816 /* TODO: write zeroes 2817 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2818 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2819 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2820 break; 2821 case NVME_OPC_DATASET_MANAGEMENT: 2822 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2823 req, &status); 2824 break; 2825 default: 2826 WPRINTF("%s unhandled io command 0x%x", 2827 __func__, cmd->opc); 2828 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2829 } 2830 complete: 2831 if (!pending) { 2832 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2833 if (req != NULL) 2834 pci_nvme_release_ioreq(sc, req); 2835 } 2836 } 2837 2838 sq->head = sqhead; 2839 2840 pthread_mutex_unlock(&sq->mtx); 2841 } 2842 2843 static void 2844 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, 2845 uint64_t idx, int is_sq, uint64_t value) 2846 { 2847 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2848 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2849 2850 if (is_sq) { 2851 if (idx > sc->num_squeues) { 2852 WPRINTF("%s queue index %lu overflow from " 2853 "guest (max %u)", 2854 __func__, idx, sc->num_squeues); 2855 return; 2856 } 2857 2858 atomic_store_short(&sc->submit_queues[idx].tail, 2859 (uint16_t)value); 2860 2861 if (idx == 0) { 2862 pci_nvme_handle_admin_cmd(sc, value); 2863 } else { 2864 /* submission queue; handle new entries in SQ */ 2865 if (idx > sc->num_squeues) { 2866 WPRINTF("%s SQ index %lu overflow from " 2867 "guest (max %u)", 2868 __func__, idx, sc->num_squeues); 2869 return; 2870 } 2871 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2872 } 2873 } else { 2874 if (idx > sc->num_cqueues) { 2875 WPRINTF("%s queue index %lu overflow from " 2876 "guest (max %u)", 2877 __func__, idx, sc->num_cqueues); 2878 return; 2879 } 2880 2881 atomic_store_short(&sc->compl_queues[idx].head, 2882 (uint16_t)value); 2883 } 2884 } 2885 2886 static void 2887 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2888 { 2889 const char *s = iswrite ? "WRITE" : "READ"; 2890 2891 switch (offset) { 2892 case NVME_CR_CAP_LOW: 2893 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2894 break; 2895 case NVME_CR_CAP_HI: 2896 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2897 break; 2898 case NVME_CR_VS: 2899 DPRINTF("%s %s NVME_CR_VS", func, s); 2900 break; 2901 case NVME_CR_INTMS: 2902 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2903 break; 2904 case NVME_CR_INTMC: 2905 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2906 break; 2907 case NVME_CR_CC: 2908 DPRINTF("%s %s NVME_CR_CC", func, s); 2909 break; 2910 case NVME_CR_CSTS: 2911 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2912 break; 2913 case NVME_CR_NSSR: 2914 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2915 break; 2916 case NVME_CR_AQA: 2917 DPRINTF("%s %s NVME_CR_AQA", func, s); 2918 break; 2919 case NVME_CR_ASQ_LOW: 2920 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2921 break; 2922 case NVME_CR_ASQ_HI: 2923 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2924 break; 2925 case NVME_CR_ACQ_LOW: 2926 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2927 break; 2928 case NVME_CR_ACQ_HI: 2929 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2930 break; 2931 default: 2932 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2933 } 2934 2935 } 2936 2937 static void 2938 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2939 uint64_t offset, int size, uint64_t value) 2940 { 2941 uint32_t ccreg; 2942 2943 if (offset >= NVME_DOORBELL_OFFSET) { 2944 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2945 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2946 int is_sq = (belloffset % 8) < 4; 2947 2948 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2949 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2950 offset); 2951 return; 2952 } 2953 2954 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2955 WPRINTF("guest attempted an overflow write offset " 2956 "0x%lx, val 0x%lx in %s", 2957 offset, value, __func__); 2958 return; 2959 } 2960 2961 if (is_sq) { 2962 if (sc->submit_queues[idx].qbase == NULL) 2963 return; 2964 } else if (sc->compl_queues[idx].qbase == NULL) 2965 return; 2966 2967 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2968 return; 2969 } 2970 2971 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2972 offset, size, value); 2973 2974 if (size != 4) { 2975 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2976 "val 0x%lx) to bar0 in %s", 2977 size, offset, value, __func__); 2978 /* TODO: shutdown device */ 2979 return; 2980 } 2981 2982 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2983 2984 pthread_mutex_lock(&sc->mtx); 2985 2986 switch (offset) { 2987 case NVME_CR_CAP_LOW: 2988 case NVME_CR_CAP_HI: 2989 /* readonly */ 2990 break; 2991 case NVME_CR_VS: 2992 /* readonly */ 2993 break; 2994 case NVME_CR_INTMS: 2995 /* MSI-X, so ignore */ 2996 break; 2997 case NVME_CR_INTMC: 2998 /* MSI-X, so ignore */ 2999 break; 3000 case NVME_CR_CC: 3001 ccreg = (uint32_t)value; 3002 3003 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 3004 "iocqes %u", 3005 __func__, 3006 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 3007 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 3008 NVME_CC_GET_IOCQES(ccreg)); 3009 3010 if (NVME_CC_GET_SHN(ccreg)) { 3011 /* perform shutdown - flush out data to backend */ 3012 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 3013 NVME_CSTS_REG_SHST_SHIFT); 3014 sc->regs.csts |= NVME_SHST_COMPLETE << 3015 NVME_CSTS_REG_SHST_SHIFT; 3016 } 3017 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 3018 if (NVME_CC_GET_EN(ccreg) == 0) 3019 /* transition 1-> causes controller reset */ 3020 pci_nvme_reset_locked(sc); 3021 else 3022 pci_nvme_init_controller(ctx, sc); 3023 } 3024 3025 /* Insert the iocqes, iosqes and en bits from the write */ 3026 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3027 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3028 if (NVME_CC_GET_EN(ccreg) == 0) { 3029 /* Insert the ams, mps and css bit fields */ 3030 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3031 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3032 sc->regs.csts &= ~NVME_CSTS_RDY; 3033 } else if ((sc->pending_ios == 0) && 3034 !(sc->regs.csts & NVME_CSTS_CFS)) { 3035 sc->regs.csts |= NVME_CSTS_RDY; 3036 } 3037 break; 3038 case NVME_CR_CSTS: 3039 break; 3040 case NVME_CR_NSSR: 3041 /* ignore writes; don't support subsystem reset */ 3042 break; 3043 case NVME_CR_AQA: 3044 sc->regs.aqa = (uint32_t)value; 3045 break; 3046 case NVME_CR_ASQ_LOW: 3047 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3048 (0xFFFFF000 & value); 3049 break; 3050 case NVME_CR_ASQ_HI: 3051 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3052 (value << 32); 3053 break; 3054 case NVME_CR_ACQ_LOW: 3055 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3056 (0xFFFFF000 & value); 3057 break; 3058 case NVME_CR_ACQ_HI: 3059 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3060 (value << 32); 3061 break; 3062 default: 3063 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3064 __func__, offset, value, size); 3065 } 3066 pthread_mutex_unlock(&sc->mtx); 3067 } 3068 3069 static void 3070 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, 3071 int baridx, uint64_t offset, int size, uint64_t value) 3072 { 3073 struct pci_nvme_softc* sc = pi->pi_arg; 3074 3075 if (baridx == pci_msix_table_bar(pi) || 3076 baridx == pci_msix_pba_bar(pi)) { 3077 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3078 " value 0x%lx", baridx, offset, size, value); 3079 3080 pci_emul_msix_twrite(pi, offset, size, value); 3081 return; 3082 } 3083 3084 switch (baridx) { 3085 case 0: 3086 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 3087 break; 3088 3089 default: 3090 DPRINTF("%s unknown baridx %d, val 0x%lx", 3091 __func__, baridx, value); 3092 } 3093 } 3094 3095 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3096 uint64_t offset, int size) 3097 { 3098 uint64_t value; 3099 3100 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3101 3102 if (offset < NVME_DOORBELL_OFFSET) { 3103 void *p = &(sc->regs); 3104 pthread_mutex_lock(&sc->mtx); 3105 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3106 pthread_mutex_unlock(&sc->mtx); 3107 } else { 3108 value = 0; 3109 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3110 } 3111 3112 switch (size) { 3113 case 1: 3114 value &= 0xFF; 3115 break; 3116 case 2: 3117 value &= 0xFFFF; 3118 break; 3119 case 4: 3120 value &= 0xFFFFFFFF; 3121 break; 3122 } 3123 3124 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3125 offset, size, (uint32_t)value); 3126 3127 return (value); 3128 } 3129 3130 3131 3132 static uint64_t 3133 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, 3134 struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3135 { 3136 struct pci_nvme_softc* sc = pi->pi_arg; 3137 3138 if (baridx == pci_msix_table_bar(pi) || 3139 baridx == pci_msix_pba_bar(pi)) { 3140 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3141 baridx, offset, size); 3142 3143 return pci_emul_msix_tread(pi, offset, size); 3144 } 3145 3146 switch (baridx) { 3147 case 0: 3148 return pci_nvme_read_bar_0(sc, offset, size); 3149 3150 default: 3151 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3152 } 3153 3154 return (0); 3155 } 3156 3157 static int 3158 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3159 { 3160 char bident[sizeof("XX:X:X")]; 3161 const char *value; 3162 uint32_t sectsz; 3163 3164 sc->max_queues = NVME_QUEUES; 3165 sc->max_qentries = NVME_MAX_QENTRIES; 3166 sc->ioslots = NVME_IOSLOTS; 3167 sc->num_squeues = sc->max_queues; 3168 sc->num_cqueues = sc->max_queues; 3169 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3170 sectsz = 0; 3171 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3172 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3173 3174 value = get_config_value_node(nvl, "maxq"); 3175 if (value != NULL) 3176 sc->max_queues = atoi(value); 3177 value = get_config_value_node(nvl, "qsz"); 3178 if (value != NULL) { 3179 sc->max_qentries = atoi(value); 3180 if (sc->max_qentries <= 0) { 3181 EPRINTLN("nvme: Invalid qsz option %d", 3182 sc->max_qentries); 3183 return (-1); 3184 } 3185 } 3186 value = get_config_value_node(nvl, "ioslots"); 3187 if (value != NULL) { 3188 sc->ioslots = atoi(value); 3189 if (sc->ioslots <= 0) { 3190 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3191 return (-1); 3192 } 3193 } 3194 value = get_config_value_node(nvl, "sectsz"); 3195 if (value != NULL) 3196 sectsz = atoi(value); 3197 value = get_config_value_node(nvl, "ser"); 3198 if (value != NULL) { 3199 /* 3200 * This field indicates the Product Serial Number in 3201 * 7-bit ASCII, unused bytes should be space characters. 3202 * Ref: NVMe v1.3c. 3203 */ 3204 cpywithpad((char *)sc->ctrldata.sn, 3205 sizeof(sc->ctrldata.sn), value, ' '); 3206 } 3207 value = get_config_value_node(nvl, "eui64"); 3208 if (value != NULL) 3209 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3210 value = get_config_value_node(nvl, "dsm"); 3211 if (value != NULL) { 3212 if (strcmp(value, "auto") == 0) 3213 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3214 else if (strcmp(value, "enable") == 0) 3215 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3216 else if (strcmp(value, "disable") == 0) 3217 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3218 } 3219 3220 value = get_config_value_node(nvl, "ram"); 3221 if (value != NULL) { 3222 uint64_t sz = strtoull(value, NULL, 10); 3223 3224 sc->nvstore.type = NVME_STOR_RAM; 3225 sc->nvstore.size = sz * 1024 * 1024; 3226 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3227 sc->nvstore.sectsz = 4096; 3228 sc->nvstore.sectsz_bits = 12; 3229 if (sc->nvstore.ctx == NULL) { 3230 EPRINTLN("nvme: Unable to allocate RAM"); 3231 return (-1); 3232 } 3233 } else { 3234 snprintf(bident, sizeof(bident), "%d:%d", 3235 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3236 sc->nvstore.ctx = blockif_open(nvl, bident); 3237 if (sc->nvstore.ctx == NULL) { 3238 EPRINTLN("nvme: Could not open backing file: %s", 3239 strerror(errno)); 3240 return (-1); 3241 } 3242 sc->nvstore.type = NVME_STOR_BLOCKIF; 3243 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3244 } 3245 3246 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3247 sc->nvstore.sectsz = sectsz; 3248 else if (sc->nvstore.type != NVME_STOR_RAM) 3249 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3250 for (sc->nvstore.sectsz_bits = 9; 3251 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3252 sc->nvstore.sectsz_bits++); 3253 3254 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3255 sc->max_queues = NVME_QUEUES; 3256 3257 return (0); 3258 } 3259 3260 static void 3261 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3262 size_t new_size) 3263 { 3264 struct pci_nvme_softc *sc; 3265 struct pci_nvme_blockstore *nvstore; 3266 struct nvme_namespace_data *nd; 3267 3268 sc = arg; 3269 nvstore = &sc->nvstore; 3270 nd = &sc->nsdata; 3271 3272 nvstore->size = new_size; 3273 pci_nvme_init_nsdata_size(nvstore, nd); 3274 3275 /* Add changed NSID to list */ 3276 sc->ns_log.ns[0] = 1; 3277 sc->ns_log.ns[1] = 0; 3278 3279 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3280 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3281 } 3282 3283 static int 3284 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) 3285 { 3286 struct pci_nvme_softc *sc; 3287 uint32_t pci_membar_sz; 3288 int error; 3289 3290 error = 0; 3291 3292 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3293 pi->pi_arg = sc; 3294 sc->nsc_pi = pi; 3295 3296 error = pci_nvme_parse_config(sc, nvl); 3297 if (error < 0) 3298 goto done; 3299 else 3300 error = 0; 3301 3302 STAILQ_INIT(&sc->ioreqs_free); 3303 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3304 for (int i = 0; i < sc->ioslots; i++) { 3305 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3306 } 3307 3308 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3309 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3310 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3311 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3312 pci_set_cfgdata8(pi, PCIR_PROGIF, 3313 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3314 3315 /* 3316 * Allocate size of NVMe registers + doorbell space for all queues. 3317 * 3318 * The specification requires a minimum memory I/O window size of 16K. 3319 * The Windows driver will refuse to start a device with a smaller 3320 * window. 3321 */ 3322 pci_membar_sz = sizeof(struct nvme_registers) + 3323 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3324 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3325 3326 DPRINTF("nvme membar size: %u", pci_membar_sz); 3327 3328 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3329 if (error) { 3330 WPRINTF("%s pci alloc mem bar failed", __func__); 3331 goto done; 3332 } 3333 3334 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3335 if (error) { 3336 WPRINTF("%s pci add msixcap failed", __func__); 3337 goto done; 3338 } 3339 3340 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3341 if (error) { 3342 WPRINTF("%s pci add Express capability failed", __func__); 3343 goto done; 3344 } 3345 3346 pthread_mutex_init(&sc->mtx, NULL); 3347 sem_init(&sc->iosemlock, 0, sc->ioslots); 3348 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3349 3350 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3351 /* 3352 * Controller data depends on Namespace data so initialize Namespace 3353 * data first. 3354 */ 3355 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3356 pci_nvme_init_ctrldata(sc); 3357 pci_nvme_init_logpages(sc); 3358 pci_nvme_init_features(sc); 3359 3360 pci_nvme_aer_init(sc); 3361 pci_nvme_aen_init(sc); 3362 3363 pci_nvme_reset(sc); 3364 3365 pci_lintr_request(pi); 3366 3367 done: 3368 return (error); 3369 } 3370 3371 static int 3372 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3373 { 3374 char *cp, *ram; 3375 3376 if (opts == NULL) 3377 return (0); 3378 3379 if (strncmp(opts, "ram=", 4) == 0) { 3380 cp = strchr(opts, ','); 3381 if (cp == NULL) { 3382 set_config_value_node(nvl, "ram", opts + 4); 3383 return (0); 3384 } 3385 ram = strndup(opts + 4, cp - opts - 4); 3386 set_config_value_node(nvl, "ram", ram); 3387 free(ram); 3388 return (pci_parse_legacy_config(nvl, cp + 1)); 3389 } else 3390 return (blockif_legacy_config(nvl, opts)); 3391 } 3392 3393 static const struct pci_devemu pci_de_nvme = { 3394 .pe_emu = "nvme", 3395 .pe_init = pci_nvme_init, 3396 .pe_legacy_config = pci_nvme_legacy_config, 3397 .pe_barwrite = pci_nvme_write, 3398 .pe_barread = pci_nvme_read 3399 }; 3400 PCI_EMUL_SET(pci_de_nvme); 3401