/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE	296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
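
/*
 * For example, with the default NVME_MDTS of 9, NVME_MAX_IOVEC works out to
 * (1 << 9) + 1 = 513 entries; assuming blockif's BLOCKIF_IOV_MAX is 128,
 * each io request then needs 385 additional iovec entries of padding.
 */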

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define PCI_NVME_AEI_NOTICE_SHIFT	8
#define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *     SMART / Health Critical Warnings
 *     Namespace Attribute Notices
 */
#define PCI_NVME_AEN_DEFAULT_MASK	0x11f

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};


static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
#define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
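	/* Critical Composite Temperature Threshold */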
	cd->cctemp = 0x0157;

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}

static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		// this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}

static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1U));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t	atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
		uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}

static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				lid = 0;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			lid = 0;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype,	/* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

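	/* Discard any outstanding Asynchronous Event Requests and Notifications */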
	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static int
pci_nvme_init_controller(struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK);
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
	size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
		struct nvme_completion_queue *cq,
		uint32_t cdw0,
		uint16_t cid,
		uint16_t sqid,
		uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command __unused,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

#define NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define NVME_TEMP_THRESH_OVER	0
#define NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
	struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return (1);
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

#define NVME_FEATURES_SEL_SUPPORTED	0x3
#define NVME_FEATURES_NS_SPECIFIC	(1 << 1)

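/*
 * Get Features: the Select (SEL) field in CDW10 bits 10:8 chooses what is
 * returned. A SEL value of 0x3 asks for the feature's supported capabilities,
 * where bit 1 of the returned dword marks a namespace specific feature.
 */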
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1916 struct nvme_completion* compl) 1917 { 1918 struct nvme_feature_obj *feat; 1919 uint8_t fid = command->cdw10 & 0xFF; 1920 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1921 1922 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1923 1924 if (fid >= NVME_FID_MAX) { 1925 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1926 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1927 return (1); 1928 } 1929 1930 compl->cdw0 = 0; 1931 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1932 1933 feat = &sc->feat[fid]; 1934 if (feat->get) { 1935 feat->get(sc, feat, command, compl); 1936 } 1937 1938 if (compl->status == NVME_SC_SUCCESS) { 1939 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1940 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1941 else 1942 compl->cdw0 = feat->cdw11; 1943 } 1944 1945 return (0); 1946 } 1947 1948 static int 1949 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1950 struct nvme_completion* compl) 1951 { 1952 uint8_t ses, lbaf, pi; 1953 1954 /* Only supports Secure Erase Setting - User Data Erase */ 1955 ses = (command->cdw10 >> 9) & 0x7; 1956 if (ses > 0x1) { 1957 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1958 return (1); 1959 } 1960 1961 /* Only supports a single LBA Format */ 1962 lbaf = command->cdw10 & 0xf; 1963 if (lbaf != 0) { 1964 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1965 NVME_SC_INVALID_FORMAT); 1966 return (1); 1967 } 1968 1969 /* Doesn't support Protection Information */ 1970 pi = (command->cdw10 >> 5) & 0x7; 1971 if (pi != 0) { 1972 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1973 return (1); 1974 } 1975 1976 if (sc->nvstore.type == NVME_STOR_RAM) { 1977 if (sc->nvstore.ctx) 1978 free(sc->nvstore.ctx); 1979 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1980 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1981 } else { 1982 struct pci_nvme_ioreq *req; 1983 int err; 1984 1985 req = pci_nvme_get_ioreq(sc); 1986 if (req == NULL) { 1987 pci_nvme_status_genc(&compl->status, 1988 NVME_SC_INTERNAL_DEVICE_ERROR); 1989 WPRINTF("%s: unable to allocate IO req", __func__); 1990 return (1); 1991 } 1992 req->nvme_sq = &sc->submit_queues[0]; 1993 req->sqid = 0; 1994 req->opc = command->opc; 1995 req->cid = command->cid; 1996 req->nsid = command->nsid; 1997 1998 req->io_req.br_offset = 0; 1999 req->io_req.br_resid = sc->nvstore.size; 2000 req->io_req.br_callback = pci_nvme_io_done; 2001 2002 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 2003 if (err) { 2004 pci_nvme_status_genc(&compl->status, 2005 NVME_SC_INTERNAL_DEVICE_ERROR); 2006 pci_nvme_release_ioreq(sc, req); 2007 } else 2008 compl->status = NVME_NO_STATUS; 2009 } 2010 2011 return (1); 2012 } 2013 2014 static int 2015 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 2016 struct nvme_completion *compl) 2017 { 2018 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 2019 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 2020 2021 /* TODO: search for the command ID and abort it */ 2022 2023 compl->cdw0 = 1; 2024 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 2025 return (1); 2026 } 2027 2028 static int 2029 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2030 struct nvme_command* command, struct nvme_completion* compl) 2031 { 2032 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2033 sc->aer_count,
sc->ctrldata.aerl, command->cid); 2034 2035 /* Don't exceed the Async Event Request Limit (AERL). */ 2036 if (pci_nvme_aer_limit_reached(sc)) { 2037 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2038 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2039 return (1); 2040 } 2041 2042 if (pci_nvme_aer_add(sc, command->cid)) { 2043 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2044 NVME_SC_INTERNAL_DEVICE_ERROR); 2045 return (1); 2046 } 2047 2048 /* 2049 * Raise events when they happen based on the Set Features cmd. 2050 * These events happen async, so only set completion successful if 2051 * there is an event reflective of the request to get event. 2052 */ 2053 compl->status = NVME_NO_STATUS; 2054 pci_nvme_aen_notify(sc); 2055 2056 return (0); 2057 } 2058 2059 static void 2060 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2061 { 2062 struct nvme_completion compl; 2063 struct nvme_command *cmd; 2064 struct nvme_submission_queue *sq; 2065 struct nvme_completion_queue *cq; 2066 uint16_t sqhead; 2067 2068 DPRINTF("%s index %u", __func__, (uint32_t)value); 2069 2070 sq = &sc->submit_queues[0]; 2071 cq = &sc->compl_queues[0]; 2072 2073 pthread_mutex_lock(&sq->mtx); 2074 2075 sqhead = sq->head; 2076 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2077 2078 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2079 cmd = &(sq->qbase)[sqhead]; 2080 compl.cdw0 = 0; 2081 compl.status = 0; 2082 2083 switch (cmd->opc) { 2084 case NVME_OPC_DELETE_IO_SQ: 2085 DPRINTF("%s command DELETE_IO_SQ", __func__); 2086 nvme_opc_delete_io_sq(sc, cmd, &compl); 2087 break; 2088 case NVME_OPC_CREATE_IO_SQ: 2089 DPRINTF("%s command CREATE_IO_SQ", __func__); 2090 nvme_opc_create_io_sq(sc, cmd, &compl); 2091 break; 2092 case NVME_OPC_DELETE_IO_CQ: 2093 DPRINTF("%s command DELETE_IO_CQ", __func__); 2094 nvme_opc_delete_io_cq(sc, cmd, &compl); 2095 break; 2096 case NVME_OPC_CREATE_IO_CQ: 2097 DPRINTF("%s command CREATE_IO_CQ", __func__); 2098 nvme_opc_create_io_cq(sc, cmd, &compl); 2099 break; 2100 case NVME_OPC_GET_LOG_PAGE: 2101 DPRINTF("%s command GET_LOG_PAGE", __func__); 2102 nvme_opc_get_log_page(sc, cmd, &compl); 2103 break; 2104 case NVME_OPC_IDENTIFY: 2105 DPRINTF("%s command IDENTIFY", __func__); 2106 nvme_opc_identify(sc, cmd, &compl); 2107 break; 2108 case NVME_OPC_ABORT: 2109 DPRINTF("%s command ABORT", __func__); 2110 nvme_opc_abort(sc, cmd, &compl); 2111 break; 2112 case NVME_OPC_SET_FEATURES: 2113 DPRINTF("%s command SET_FEATURES", __func__); 2114 nvme_opc_set_features(sc, cmd, &compl); 2115 break; 2116 case NVME_OPC_GET_FEATURES: 2117 DPRINTF("%s command GET_FEATURES", __func__); 2118 nvme_opc_get_features(sc, cmd, &compl); 2119 break; 2120 case NVME_OPC_FIRMWARE_ACTIVATE: 2121 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2122 pci_nvme_status_tc(&compl.status, 2123 NVME_SCT_COMMAND_SPECIFIC, 2124 NVME_SC_INVALID_FIRMWARE_SLOT); 2125 break; 2126 case NVME_OPC_ASYNC_EVENT_REQUEST: 2127 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2128 nvme_opc_async_event_req(sc, cmd, &compl); 2129 break; 2130 case NVME_OPC_FORMAT_NVM: 2131 DPRINTF("%s command FORMAT_NVM", __func__); 2132 if ((sc->ctrldata.oacs & 2133 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2134 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2135 break; 2136 } 2137 nvme_opc_format_nvm(sc, cmd, &compl); 2138 break; 2139 case NVME_OPC_SECURITY_SEND: 2140 case NVME_OPC_SECURITY_RECEIVE: 2141 case NVME_OPC_SANITIZE: 2142 case NVME_OPC_GET_LBA_STATUS: 2143 DPRINTF("%s command OPC=%#x 
(unsupported)", __func__, 2144 cmd->opc); 2145 /* Valid but unsupported opcodes */ 2146 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2147 break; 2148 default: 2149 DPRINTF("%s command OPC=%#X (not implemented)", 2150 __func__, 2151 cmd->opc); 2152 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2153 } 2154 sqhead = (sqhead + 1) % sq->size; 2155 2156 if (NVME_COMPLETION_VALID(compl)) { 2157 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2158 compl.cdw0, 2159 cmd->cid, 2160 0, /* SQID */ 2161 compl.status); 2162 } 2163 } 2164 2165 DPRINTF("setting sqhead %u", sqhead); 2166 sq->head = sqhead; 2167 2168 if (cq->head != cq->tail) 2169 pci_generate_msix(sc->nsc_pi, 0); 2170 2171 pthread_mutex_unlock(&sq->mtx); 2172 } 2173 2174 /* 2175 * Update the Write and Read statistics reported in SMART data 2176 * 2177 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up. 2178 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2179 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2180 */ 2181 static void 2182 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2183 size_t bytes, uint16_t status) 2184 { 2185 2186 pthread_mutex_lock(&sc->mtx); 2187 switch (opc) { 2188 case NVME_OPC_WRITE: 2189 sc->write_commands++; 2190 if (status != NVME_SC_SUCCESS) 2191 break; 2192 sc->write_dunits_remainder += (bytes / 512); 2193 while (sc->write_dunits_remainder >= 1000) { 2194 sc->write_data_units++; 2195 sc->write_dunits_remainder -= 1000; 2196 } 2197 break; 2198 case NVME_OPC_READ: 2199 sc->read_commands++; 2200 if (status != NVME_SC_SUCCESS) 2201 break; 2202 sc->read_dunits_remainder += (bytes / 512); 2203 while (sc->read_dunits_remainder >= 1000) { 2204 sc->read_data_units++; 2205 sc->read_dunits_remainder -= 1000; 2206 } 2207 break; 2208 default: 2209 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2210 break; 2211 } 2212 pthread_mutex_unlock(&sc->mtx); 2213 } 2214 2215 /* 2216 * Check if the combination of Starting LBA (slba) and number of blocks 2217 * exceeds the range of the underlying storage. 2218 * 2219 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2220 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2221 * overflow. 2222 */ 2223 static bool 2224 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2225 uint32_t nblocks) 2226 { 2227 size_t offset, bytes; 2228 2229 /* Overflow check of multiplying Starting LBA by the sector size */ 2230 if (slba >> (64 - nvstore->sectsz_bits)) 2231 return (true); 2232 2233 offset = slba << nvstore->sectsz_bits; 2234 bytes = nblocks << nvstore->sectsz_bits; 2235 2236 /* Overflow check of Number of Logical Blocks */ 2237 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2238 return (true); 2239 2240 return (false); 2241 } 2242 2243 static int 2244 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2245 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2246 { 2247 int iovidx; 2248 bool range_is_contiguous; 2249 2250 if (req == NULL) 2251 return (-1); 2252 2253 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2254 return (-1); 2255 } 2256 2257 /* 2258 * Minimize the number of IOVs by concatenating contiguous address 2259 * ranges. If the IOV count is zero, there is no previous range to 2260 * concatenate.
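 * E.g., two PRP entries mapping adjacent guest physical pages are merged
 * into a single iovec covering both pages rather than two separate entries.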
2261 */ 2262 if (req->io_req.br_iovcnt == 0) 2263 range_is_contiguous = false; 2264 else 2265 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2266 2267 if (range_is_contiguous) { 2268 iovidx = req->io_req.br_iovcnt - 1; 2269 2270 req->io_req.br_iov[iovidx].iov_base = 2271 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2272 req->prev_gpaddr, size); 2273 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2274 return (-1); 2275 2276 req->prev_size += size; 2277 req->io_req.br_resid += size; 2278 2279 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2280 } else { 2281 iovidx = req->io_req.br_iovcnt; 2282 if (iovidx == 0) { 2283 req->io_req.br_offset = offset; 2284 req->io_req.br_resid = 0; 2285 req->io_req.br_param = req; 2286 } 2287 2288 req->io_req.br_iov[iovidx].iov_base = 2289 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2290 gpaddr, size); 2291 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2292 return (-1); 2293 2294 req->io_req.br_iov[iovidx].iov_len = size; 2295 2296 req->prev_gpaddr = gpaddr; 2297 req->prev_size = size; 2298 req->io_req.br_resid += size; 2299 2300 req->io_req.br_iovcnt++; 2301 } 2302 2303 return (0); 2304 } 2305 2306 static void 2307 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2308 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2309 { 2310 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2311 2312 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2313 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2314 NVME_STATUS_GET_SC(status)); 2315 2316 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2317 2318 if (cq->head != cq->tail) { 2319 if (cq->intr_en & NVME_CQ_INTEN) { 2320 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2321 } else { 2322 DPRINTF("%s: CQ%u interrupt disabled", 2323 __func__, sq->cqid); 2324 } 2325 } 2326 } 2327 2328 static void 2329 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2330 { 2331 req->sc = NULL; 2332 req->nvme_sq = NULL; 2333 req->sqid = 0; 2334 2335 pthread_mutex_lock(&sc->mtx); 2336 2337 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2338 sc->pending_ios--; 2339 2340 /* when no more IO pending, can set to ready if device reset/enabled */ 2341 if (sc->pending_ios == 0 && 2342 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2343 sc->regs.csts |= NVME_CSTS_RDY; 2344 2345 pthread_mutex_unlock(&sc->mtx); 2346 2347 sem_post(&sc->iosemlock); 2348 } 2349 2350 static struct pci_nvme_ioreq * 2351 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2352 { 2353 struct pci_nvme_ioreq *req = NULL; 2354 2355 sem_wait(&sc->iosemlock); 2356 pthread_mutex_lock(&sc->mtx); 2357 2358 req = STAILQ_FIRST(&sc->ioreqs_free); 2359 assert(req != NULL); 2360 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2361 2362 req->sc = sc; 2363 2364 sc->pending_ios++; 2365 2366 pthread_mutex_unlock(&sc->mtx); 2367 2368 req->io_req.br_iovcnt = 0; 2369 req->io_req.br_offset = 0; 2370 req->io_req.br_resid = 0; 2371 req->io_req.br_param = req; 2372 req->prev_gpaddr = 0; 2373 req->prev_size = 0; 2374 2375 return req; 2376 } 2377 2378 static void 2379 pci_nvme_io_done(struct blockif_req *br, int err) 2380 { 2381 struct pci_nvme_ioreq *req = br->br_param; 2382 struct nvme_submission_queue *sq = req->nvme_sq; 2383 uint16_t code, status; 2384 2385 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2386 2387 /* TODO return correct error */ 2388 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2389 status = 0; 2390 pci_nvme_status_genc(&status, code); 2391 2392 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2393 pci_nvme_stats_write_read_update(req->sc, req->opc, 2394 req->bytes, status); 2395 pci_nvme_release_ioreq(req->sc, req); 2396 } 2397 2398 /* 2399 * Implements the Flush command. The specification states: 2400 * If a volatile write cache is not present, Flush commands complete 2401 * successfully and have no effect 2402 * in the description of the Volatile Write Cache (VWC) field of the Identify 2403 * Controller data. Therefore, set status to Success if the command is 2404 * not supported (i.e. RAM or as indicated by the blockif). 2405 */ 2406 static bool 2407 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2408 struct nvme_command *cmd __unused, 2409 struct pci_nvme_blockstore *nvstore, 2410 struct pci_nvme_ioreq *req, 2411 uint16_t *status) 2412 { 2413 bool pending = false; 2414 2415 if (nvstore->type == NVME_STOR_RAM) { 2416 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2417 } else { 2418 int err; 2419 2420 req->io_req.br_callback = pci_nvme_io_done; 2421 2422 err = blockif_flush(nvstore->ctx, &req->io_req); 2423 switch (err) { 2424 case 0: 2425 pending = true; 2426 break; 2427 case EOPNOTSUPP: 2428 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2429 break; 2430 default: 2431 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2432 } 2433 } 2434 2435 return (pending); 2436 } 2437 2438 static uint16_t 2439 nvme_write_read_ram(struct pci_nvme_softc *sc, 2440 struct pci_nvme_blockstore *nvstore, 2441 uint64_t prp1, uint64_t prp2, 2442 size_t offset, uint64_t bytes, 2443 bool is_write) 2444 { 2445 uint8_t *buf = nvstore->ctx; 2446 enum nvme_copy_dir dir; 2447 uint16_t status; 2448 2449 if (is_write) 2450 dir = NVME_COPY_TO_PRP; 2451 else 2452 dir = NVME_COPY_FROM_PRP; 2453 2454 status = 0; 2455 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2456 buf + offset, bytes, dir)) 2457 pci_nvme_status_genc(&status, 2458 NVME_SC_DATA_TRANSFER_ERROR); 2459 else 2460 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2461 2462 return (status); 2463 } 2464 2465 static uint16_t 2466 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2467 struct pci_nvme_blockstore *nvstore, 2468 struct pci_nvme_ioreq *req, 2469 uint64_t prp1, uint64_t prp2, 2470 size_t offset, uint64_t bytes, 2471 bool is_write) 2472 { 2473 uint64_t size; 2474 int err; 2475 uint16_t status = NVME_NO_STATUS; 2476 2477 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2478 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2479 err = -1; 2480 goto out; 2481 } 2482 2483 offset += size; 2484 bytes -= size; 2485 2486 if (bytes == 0) { 2487 ; 2488 } else if (bytes <= PAGE_SIZE) { 2489 size = bytes; 2490 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2491 err = -1; 2492 goto out; 2493 } 2494 } else { 2495 void *vmctx = sc->nsc_pi->pi_vmctx; 2496 uint64_t *prp_list = &prp2; 2497 uint64_t *last = prp_list; 2498 2499 /* PRP2 is pointer to a physical region page list */ 2500 while (bytes) { 2501 /* Last entry in list points to the next list */ 2502 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2503 uint64_t prp = *prp_list; 2504 2505 prp_list = paddr_guest2host(vmctx, prp, 2506 PAGE_SIZE - (prp % PAGE_SIZE)); 2507 if (prp_list == NULL) { 2508 err = -1; 2509 goto out; 2510 } 2511 last = prp_list + (NVME_PRP2_ITEMS - 1); 2512 } 2513 2514 size = MIN(bytes, PAGE_SIZE); 2515 2516 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
size, 2517 offset)) { 2518 err = -1; 2519 goto out; 2520 } 2521 2522 offset += size; 2523 bytes -= size; 2524 2525 prp_list++; 2526 } 2527 } 2528 req->io_req.br_callback = pci_nvme_io_done; 2529 if (is_write) 2530 err = blockif_write(nvstore->ctx, &req->io_req); 2531 else 2532 err = blockif_read(nvstore->ctx, &req->io_req); 2533 out: 2534 if (err) 2535 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2536 2537 return (status); 2538 } 2539 2540 static bool 2541 nvme_opc_write_read(struct pci_nvme_softc *sc, 2542 struct nvme_command *cmd, 2543 struct pci_nvme_blockstore *nvstore, 2544 struct pci_nvme_ioreq *req, 2545 uint16_t *status) 2546 { 2547 uint64_t lba, nblocks, bytes; 2548 size_t offset; 2549 bool is_write = cmd->opc == NVME_OPC_WRITE; 2550 bool pending = false; 2551 2552 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2553 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2554 bytes = nblocks << nvstore->sectsz_bits; 2555 if (bytes > NVME_MAX_DATA_SIZE) { 2556 WPRINTF("%s command would exceed MDTS", __func__); 2557 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2558 goto out; 2559 } 2560 2561 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2562 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2563 __func__, lba, nblocks); 2564 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2565 goto out; 2566 } 2567 2568 offset = lba << nvstore->sectsz_bits; 2569 2570 req->bytes = bytes; 2571 req->io_req.br_offset = lba; 2572 2573 /* PRP bits 1:0 must be zero */ 2574 cmd->prp1 &= ~0x3UL; 2575 cmd->prp2 &= ~0x3UL; 2576 2577 if (nvstore->type == NVME_STOR_RAM) { 2578 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2579 cmd->prp2, offset, bytes, is_write); 2580 } else { 2581 *status = nvme_write_read_blockif(sc, nvstore, req, 2582 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2583 2584 if (*status == NVME_NO_STATUS) 2585 pending = true; 2586 } 2587 out: 2588 if (!pending) 2589 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2590 2591 return (pending); 2592 } 2593 2594 static void 2595 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2596 { 2597 struct pci_nvme_ioreq *req = br->br_param; 2598 struct pci_nvme_softc *sc = req->sc; 2599 bool done = true; 2600 uint16_t status; 2601 2602 status = 0; 2603 if (err) { 2604 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2605 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2606 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2607 } else { 2608 struct iovec *iov = req->io_req.br_iov; 2609 2610 req->prev_gpaddr++; 2611 iov += req->prev_gpaddr; 2612 2613 /* The iov_* values already include the sector size */ 2614 req->io_req.br_offset = (off_t)iov->iov_base; 2615 req->io_req.br_resid = iov->iov_len; 2616 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2617 pci_nvme_status_genc(&status, 2618 NVME_SC_INTERNAL_DEVICE_ERROR); 2619 } else 2620 done = false; 2621 } 2622 2623 if (done) { 2624 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2625 status); 2626 pci_nvme_release_ioreq(sc, req); 2627 } 2628 } 2629 2630 static bool 2631 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2632 struct nvme_command *cmd, 2633 struct pci_nvme_blockstore *nvstore, 2634 struct pci_nvme_ioreq *req, 2635 uint16_t *status) 2636 { 2637 struct nvme_dsm_range *range = NULL; 2638 uint32_t nr, r, non_zero, dr; 2639 int err; 2640 bool pending = false; 2641 2642 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2643 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2644 goto out; 2645 } 
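	/* Number of Ranges (NR) is in CDW10 bits 7:0 and is a zero-based value */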
2646 2647 nr = cmd->cdw10 & 0xff; 2648 2649 /* copy locally because a range entry could straddle PRPs */ 2650 range = calloc(1, NVME_MAX_DSM_TRIM); 2651 if (range == NULL) { 2652 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2653 goto out; 2654 } 2655 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2656 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2657 2658 /* Check for invalid ranges and the number of non-zero lengths */ 2659 non_zero = 0; 2660 for (r = 0; r <= nr; r++) { 2661 if (pci_nvme_out_of_range(nvstore, 2662 range[r].starting_lba, range[r].length)) { 2663 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2664 goto out; 2665 } 2666 if (range[r].length != 0) 2667 non_zero++; 2668 } 2669 2670 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2671 size_t offset, bytes; 2672 int sectsz_bits = sc->nvstore.sectsz_bits; 2673 2674 /* 2675 * DSM calls are advisory only, and compliant controllers 2676 * may choose to take no actions (i.e. return Success). 2677 */ 2678 if (!nvstore->deallocate) { 2679 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2680 goto out; 2681 } 2682 2683 /* If all ranges have a zero length, return Success */ 2684 if (non_zero == 0) { 2685 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2686 goto out; 2687 } 2688 2689 if (req == NULL) { 2690 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2691 goto out; 2692 } 2693 2694 offset = range[0].starting_lba << sectsz_bits; 2695 bytes = range[0].length << sectsz_bits; 2696 2697 /* 2698 * If the request is for more than a single range, store 2699 * the ranges in the br_iov. Optimize for the common case 2700 * of a single range. 2701 * 2702 * Note that NVMe Number of Ranges is a zero based value 2703 */ 2704 req->io_req.br_iovcnt = 0; 2705 req->io_req.br_offset = offset; 2706 req->io_req.br_resid = bytes; 2707 2708 if (nr == 0) { 2709 req->io_req.br_callback = pci_nvme_io_done; 2710 } else { 2711 struct iovec *iov = req->io_req.br_iov; 2712 2713 for (r = 0, dr = 0; r <= nr; r++) { 2714 offset = range[r].starting_lba << sectsz_bits; 2715 bytes = range[r].length << sectsz_bits; 2716 if (bytes == 0) 2717 continue; 2718 2719 if ((nvstore->size - offset) < bytes) { 2720 pci_nvme_status_genc(status, 2721 NVME_SC_LBA_OUT_OF_RANGE); 2722 goto out; 2723 } 2724 iov[dr].iov_base = (void *)offset; 2725 iov[dr].iov_len = bytes; 2726 dr++; 2727 } 2728 req->io_req.br_callback = pci_nvme_dealloc_sm; 2729 2730 /* 2731 * Use prev_gpaddr to track the current entry and 2732 * prev_size to track the number of entries 2733 */ 2734 req->prev_gpaddr = 0; 2735 req->prev_size = dr; 2736 } 2737 2738 err = blockif_delete(nvstore->ctx, &req->io_req); 2739 if (err) 2740 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2741 else 2742 pending = true; 2743 } 2744 out: 2745 free(range); 2746 return (pending); 2747 } 2748 2749 static void 2750 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2751 { 2752 struct nvme_submission_queue *sq; 2753 uint16_t status; 2754 uint16_t sqhead; 2755 2756 /* handle all submissions up to sq->tail index */ 2757 sq = &sc->submit_queues[idx]; 2758 2759 pthread_mutex_lock(&sq->mtx); 2760 2761 sqhead = sq->head; 2762 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2763 idx, sqhead, sq->tail, sq->qbase); 2764 2765 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2766 struct nvme_command *cmd; 2767 struct pci_nvme_ioreq *req; 2768 uint32_t nsid; 2769 bool pending; 2770 2771 pending = false; 2772 req = NULL; 2773 status = 0; 2774 2775 cmd = 
&sq->qbase[sqhead]; 2776 sqhead = (sqhead + 1) % sq->size; 2777 2778 nsid = le32toh(cmd->nsid); 2779 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2780 pci_nvme_status_genc(&status, 2781 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2782 status |= 2783 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2784 goto complete; 2785 } 2786 2787 req = pci_nvme_get_ioreq(sc); 2788 if (req == NULL) { 2789 pci_nvme_status_genc(&status, 2790 NVME_SC_INTERNAL_DEVICE_ERROR); 2791 WPRINTF("%s: unable to allocate IO req", __func__); 2792 goto complete; 2793 } 2794 req->nvme_sq = sq; 2795 req->sqid = idx; 2796 req->opc = cmd->opc; 2797 req->cid = cmd->cid; 2798 req->nsid = cmd->nsid; 2799 2800 switch (cmd->opc) { 2801 case NVME_OPC_FLUSH: 2802 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2803 req, &status); 2804 break; 2805 case NVME_OPC_WRITE: 2806 case NVME_OPC_READ: 2807 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2808 req, &status); 2809 break; 2810 case NVME_OPC_WRITE_ZEROES: 2811 /* TODO: write zeroes 2812 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2813 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2814 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2815 break; 2816 case NVME_OPC_DATASET_MANAGEMENT: 2817 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2818 req, &status); 2819 break; 2820 default: 2821 WPRINTF("%s unhandled io command 0x%x", 2822 __func__, cmd->opc); 2823 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2824 } 2825 complete: 2826 if (!pending) { 2827 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2828 if (req != NULL) 2829 pci_nvme_release_ioreq(sc, req); 2830 } 2831 } 2832 2833 sq->head = sqhead; 2834 2835 pthread_mutex_unlock(&sq->mtx); 2836 } 2837 2838 static void 2839 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2840 uint64_t idx, int is_sq, uint64_t value) 2841 { 2842 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2843 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2844 2845 if (is_sq) { 2846 if (idx > sc->num_squeues) { 2847 WPRINTF("%s queue index %lu overflow from " 2848 "guest (max %u)", 2849 __func__, idx, sc->num_squeues); 2850 return; 2851 } 2852 2853 atomic_store_short(&sc->submit_queues[idx].tail, 2854 (uint16_t)value); 2855 2856 if (idx == 0) { 2857 pci_nvme_handle_admin_cmd(sc, value); 2858 } else { 2859 /* submission queue; handle new entries in SQ */ 2860 if (idx > sc->num_squeues) { 2861 WPRINTF("%s SQ index %lu overflow from " 2862 "guest (max %u)", 2863 __func__, idx, sc->num_squeues); 2864 return; 2865 } 2866 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2867 } 2868 } else { 2869 if (idx > sc->num_cqueues) { 2870 WPRINTF("%s queue index %lu overflow from " 2871 "guest (max %u)", 2872 __func__, idx, sc->num_cqueues); 2873 return; 2874 } 2875 2876 atomic_store_short(&sc->compl_queues[idx].head, 2877 (uint16_t)value); 2878 } 2879 } 2880 2881 static void 2882 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2883 { 2884 const char *s = iswrite ? 
"WRITE" : "READ"; 2885 2886 switch (offset) { 2887 case NVME_CR_CAP_LOW: 2888 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2889 break; 2890 case NVME_CR_CAP_HI: 2891 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2892 break; 2893 case NVME_CR_VS: 2894 DPRINTF("%s %s NVME_CR_VS", func, s); 2895 break; 2896 case NVME_CR_INTMS: 2897 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2898 break; 2899 case NVME_CR_INTMC: 2900 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2901 break; 2902 case NVME_CR_CC: 2903 DPRINTF("%s %s NVME_CR_CC", func, s); 2904 break; 2905 case NVME_CR_CSTS: 2906 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2907 break; 2908 case NVME_CR_NSSR: 2909 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2910 break; 2911 case NVME_CR_AQA: 2912 DPRINTF("%s %s NVME_CR_AQA", func, s); 2913 break; 2914 case NVME_CR_ASQ_LOW: 2915 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2916 break; 2917 case NVME_CR_ASQ_HI: 2918 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2919 break; 2920 case NVME_CR_ACQ_LOW: 2921 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2922 break; 2923 case NVME_CR_ACQ_HI: 2924 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2925 break; 2926 default: 2927 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2928 } 2929 2930 } 2931 2932 static void 2933 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2934 uint64_t value) 2935 { 2936 uint32_t ccreg; 2937 2938 if (offset >= NVME_DOORBELL_OFFSET) { 2939 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2940 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2941 int is_sq = (belloffset % 8) < 4; 2942 2943 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2944 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2945 offset); 2946 return; 2947 } 2948 2949 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2950 WPRINTF("guest attempted an overflow write offset " 2951 "0x%lx, val 0x%lx in %s", 2952 offset, value, __func__); 2953 return; 2954 } 2955 2956 if (is_sq) { 2957 if (sc->submit_queues[idx].qbase == NULL) 2958 return; 2959 } else if (sc->compl_queues[idx].qbase == NULL) 2960 return; 2961 2962 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2963 return; 2964 } 2965 2966 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2967 offset, size, value); 2968 2969 if (size != 4) { 2970 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2971 "val 0x%lx) to bar0 in %s", 2972 size, offset, value, __func__); 2973 /* TODO: shutdown device */ 2974 return; 2975 } 2976 2977 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2978 2979 pthread_mutex_lock(&sc->mtx); 2980 2981 switch (offset) { 2982 case NVME_CR_CAP_LOW: 2983 case NVME_CR_CAP_HI: 2984 /* readonly */ 2985 break; 2986 case NVME_CR_VS: 2987 /* readonly */ 2988 break; 2989 case NVME_CR_INTMS: 2990 /* MSI-X, so ignore */ 2991 break; 2992 case NVME_CR_INTMC: 2993 /* MSI-X, so ignore */ 2994 break; 2995 case NVME_CR_CC: 2996 ccreg = (uint32_t)value; 2997 2998 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2999 "iocqes %u", 3000 __func__, 3001 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 3002 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 3003 NVME_CC_GET_IOCQES(ccreg)); 3004 3005 if (NVME_CC_GET_SHN(ccreg)) { 3006 /* perform shutdown - flush out data to backend */ 3007 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 3008 NVME_CSTS_REG_SHST_SHIFT); 3009 sc->regs.csts |= NVME_SHST_COMPLETE << 3010 NVME_CSTS_REG_SHST_SHIFT; 3011 } 3012 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 3013 if (NVME_CC_GET_EN(ccreg) == 0) 3014 /* transition 1-> causes controller reset */ 3015 
pci_nvme_reset_locked(sc); 3016 else 3017 pci_nvme_init_controller(sc); 3018 } 3019 3020 /* Insert the iocqes, iosqes and en bits from the write */ 3021 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3022 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3023 if (NVME_CC_GET_EN(ccreg) == 0) { 3024 /* Insert the ams, mps and css bit fields */ 3025 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3026 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3027 sc->regs.csts &= ~NVME_CSTS_RDY; 3028 } else if ((sc->pending_ios == 0) && 3029 !(sc->regs.csts & NVME_CSTS_CFS)) { 3030 sc->regs.csts |= NVME_CSTS_RDY; 3031 } 3032 break; 3033 case NVME_CR_CSTS: 3034 break; 3035 case NVME_CR_NSSR: 3036 /* ignore writes; don't support subsystem reset */ 3037 break; 3038 case NVME_CR_AQA: 3039 sc->regs.aqa = (uint32_t)value; 3040 break; 3041 case NVME_CR_ASQ_LOW: 3042 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3043 (0xFFFFF000 & value); 3044 break; 3045 case NVME_CR_ASQ_HI: 3046 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3047 (value << 32); 3048 break; 3049 case NVME_CR_ACQ_LOW: 3050 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3051 (0xFFFFF000 & value); 3052 break; 3053 case NVME_CR_ACQ_HI: 3054 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3055 (value << 32); 3056 break; 3057 default: 3058 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3059 __func__, offset, value, size); 3060 } 3061 pthread_mutex_unlock(&sc->mtx); 3062 } 3063 3064 static void 3065 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3066 uint64_t value) 3067 { 3068 struct pci_nvme_softc* sc = pi->pi_arg; 3069 3070 if (baridx == pci_msix_table_bar(pi) || 3071 baridx == pci_msix_pba_bar(pi)) { 3072 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3073 " value 0x%lx", baridx, offset, size, value); 3074 3075 pci_emul_msix_twrite(pi, offset, size, value); 3076 return; 3077 } 3078 3079 switch (baridx) { 3080 case 0: 3081 pci_nvme_write_bar_0(sc, offset, size, value); 3082 break; 3083 3084 default: 3085 DPRINTF("%s unknown baridx %d, val 0x%lx", 3086 __func__, baridx, value); 3087 } 3088 } 3089 3090 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3091 uint64_t offset, int size) 3092 { 3093 uint64_t value; 3094 3095 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3096 3097 if (offset < NVME_DOORBELL_OFFSET) { 3098 void *p = &(sc->regs); 3099 pthread_mutex_lock(&sc->mtx); 3100 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3101 pthread_mutex_unlock(&sc->mtx); 3102 } else { 3103 value = 0; 3104 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3105 } 3106 3107 switch (size) { 3108 case 1: 3109 value &= 0xFF; 3110 break; 3111 case 2: 3112 value &= 0xFFFF; 3113 break; 3114 case 4: 3115 value &= 0xFFFFFFFF; 3116 break; 3117 } 3118 3119 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3120 offset, size, (uint32_t)value); 3121 3122 return (value); 3123 } 3124 3125 3126 3127 static uint64_t 3128 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3129 { 3130 struct pci_nvme_softc* sc = pi->pi_arg; 3131 3132 if (baridx == pci_msix_table_bar(pi) || 3133 baridx == pci_msix_pba_bar(pi)) { 3134 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3135 baridx, offset, size); 3136 3137 return pci_emul_msix_tread(pi, offset, size); 3138 } 3139 3140 switch (baridx) { 3141 case 0: 3142 return pci_nvme_read_bar_0(sc, offset, size); 3143 3144 default: 3145 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3146 } 3147 3148 return (0); 3149 } 3150 
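/*
 * Parse the device's configuration node: apply controller defaults (queue
 * count, queue depth, ioslots, serial number), honor any user overrides,
 * and open the backing store (a RAM-backed namespace when "ram" is given,
 * otherwise a blockif-backed image or device). Returns 0 on success and
 * -1 on an invalid option or a backend open failure.
 */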
3151 static int 3152 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3153 { 3154 char bident[sizeof("XXX:XXX")]; 3155 const char *value; 3156 uint32_t sectsz; 3157 3158 sc->max_queues = NVME_QUEUES; 3159 sc->max_qentries = NVME_MAX_QENTRIES; 3160 sc->ioslots = NVME_IOSLOTS; 3161 sc->num_squeues = sc->max_queues; 3162 sc->num_cqueues = sc->max_queues; 3163 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3164 sectsz = 0; 3165 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3166 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3167 3168 value = get_config_value_node(nvl, "maxq"); 3169 if (value != NULL) 3170 sc->max_queues = atoi(value); 3171 value = get_config_value_node(nvl, "qsz"); 3172 if (value != NULL) { 3173 sc->max_qentries = atoi(value); 3174 if (sc->max_qentries <= 0) { 3175 EPRINTLN("nvme: Invalid qsz option %d", 3176 sc->max_qentries); 3177 return (-1); 3178 } 3179 } 3180 value = get_config_value_node(nvl, "ioslots"); 3181 if (value != NULL) { 3182 sc->ioslots = atoi(value); 3183 if (sc->ioslots <= 0) { 3184 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3185 return (-1); 3186 } 3187 } 3188 value = get_config_value_node(nvl, "sectsz"); 3189 if (value != NULL) 3190 sectsz = atoi(value); 3191 value = get_config_value_node(nvl, "ser"); 3192 if (value != NULL) { 3193 /* 3194 * This field indicates the Product Serial Number in 3195 * 7-bit ASCII, unused bytes should be space characters. 3196 * Ref: NVMe v1.3c. 3197 */ 3198 cpywithpad((char *)sc->ctrldata.sn, 3199 sizeof(sc->ctrldata.sn), value, ' '); 3200 } 3201 value = get_config_value_node(nvl, "eui64"); 3202 if (value != NULL) 3203 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3204 value = get_config_value_node(nvl, "dsm"); 3205 if (value != NULL) { 3206 if (strcmp(value, "auto") == 0) 3207 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3208 else if (strcmp(value, "enable") == 0) 3209 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3210 else if (strcmp(value, "disable") == 0) 3211 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3212 } 3213 3214 value = get_config_value_node(nvl, "ram"); 3215 if (value != NULL) { 3216 uint64_t sz = strtoull(value, NULL, 10); 3217 3218 sc->nvstore.type = NVME_STOR_RAM; 3219 sc->nvstore.size = sz * 1024 * 1024; 3220 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3221 sc->nvstore.sectsz = 4096; 3222 sc->nvstore.sectsz_bits = 12; 3223 if (sc->nvstore.ctx == NULL) { 3224 EPRINTLN("nvme: Unable to allocate RAM"); 3225 return (-1); 3226 } 3227 } else { 3228 snprintf(bident, sizeof(bident), "%u:%u", 3229 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3230 sc->nvstore.ctx = blockif_open(nvl, bident); 3231 if (sc->nvstore.ctx == NULL) { 3232 EPRINTLN("nvme: Could not open backing file: %s", 3233 strerror(errno)); 3234 return (-1); 3235 } 3236 sc->nvstore.type = NVME_STOR_BLOCKIF; 3237 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3238 } 3239 3240 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3241 sc->nvstore.sectsz = sectsz; 3242 else if (sc->nvstore.type != NVME_STOR_RAM) 3243 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3244 for (sc->nvstore.sectsz_bits = 9; 3245 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3246 sc->nvstore.sectsz_bits++); 3247 3248 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3249 sc->max_queues = NVME_QUEUES; 3250 3251 return (0); 3252 } 3253 3254 static void 3255 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3256 size_t new_size) 3257 { 3258 struct 
pci_nvme_softc *sc; 3259 struct pci_nvme_blockstore *nvstore; 3260 struct nvme_namespace_data *nd; 3261 3262 sc = arg; 3263 nvstore = &sc->nvstore; 3264 nd = &sc->nsdata; 3265 3266 nvstore->size = new_size; 3267 pci_nvme_init_nsdata_size(nvstore, nd); 3268 3269 /* Add changed NSID to list */ 3270 sc->ns_log.ns[0] = 1; 3271 sc->ns_log.ns[1] = 0; 3272 3273 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3274 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3275 } 3276 3277 static int 3278 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3279 { 3280 struct pci_nvme_softc *sc; 3281 uint32_t pci_membar_sz; 3282 int error; 3283 3284 error = 0; 3285 3286 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3287 pi->pi_arg = sc; 3288 sc->nsc_pi = pi; 3289 3290 error = pci_nvme_parse_config(sc, nvl); 3291 if (error < 0) 3292 goto done; 3293 else 3294 error = 0; 3295 3296 STAILQ_INIT(&sc->ioreqs_free); 3297 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3298 for (uint32_t i = 0; i < sc->ioslots; i++) { 3299 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3300 } 3301 3302 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3303 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3304 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3305 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3306 pci_set_cfgdata8(pi, PCIR_PROGIF, 3307 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3308 3309 /* 3310 * Allocate size of NVMe registers + doorbell space for all queues. 3311 * 3312 * The specification requires a minimum memory I/O window size of 16K. 3313 * The Windows driver will refuse to start a device with a smaller 3314 * window. 3315 */ 3316 pci_membar_sz = sizeof(struct nvme_registers) + 3317 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3318 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3319 3320 DPRINTF("nvme membar size: %u", pci_membar_sz); 3321 3322 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3323 if (error) { 3324 WPRINTF("%s pci alloc mem bar failed", __func__); 3325 goto done; 3326 } 3327 3328 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3329 if (error) { 3330 WPRINTF("%s pci add msixcap failed", __func__); 3331 goto done; 3332 } 3333 3334 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3335 if (error) { 3336 WPRINTF("%s pci add Express capability failed", __func__); 3337 goto done; 3338 } 3339 3340 pthread_mutex_init(&sc->mtx, NULL); 3341 sem_init(&sc->iosemlock, 0, sc->ioslots); 3342 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3343 3344 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3345 /* 3346 * Controller data depends on Namespace data so initialize Namespace 3347 * data first. 
3348 */ 3349 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3350 pci_nvme_init_ctrldata(sc); 3351 pci_nvme_init_logpages(sc); 3352 pci_nvme_init_features(sc); 3353 3354 pci_nvme_aer_init(sc); 3355 pci_nvme_aen_init(sc); 3356 3357 pci_nvme_reset(sc); 3358 3359 pci_lintr_request(pi); 3360 3361 done: 3362 return (error); 3363 } 3364 3365 static int 3366 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3367 { 3368 char *cp, *ram; 3369 3370 if (opts == NULL) 3371 return (0); 3372 3373 if (strncmp(opts, "ram=", 4) == 0) { 3374 cp = strchr(opts, ','); 3375 if (cp == NULL) { 3376 set_config_value_node(nvl, "ram", opts + 4); 3377 return (0); 3378 } 3379 ram = strndup(opts + 4, cp - opts - 4); 3380 set_config_value_node(nvl, "ram", ram); 3381 free(ram); 3382 return (pci_parse_legacy_config(nvl, cp + 1)); 3383 } else 3384 return (blockif_legacy_config(nvl, opts)); 3385 } 3386 3387 static const struct pci_devemu pci_de_nvme = { 3388 .pe_emu = "nvme", 3389 .pe_init = pci_nvme_init, 3390 .pe_legacy_config = pci_nvme_legacy_config, 3391 .pe_barwrite = pci_nvme_write, 3392 .pe_barread = pci_nvme_read 3393 }; 3394 PCI_EMUL_SET(pci_de_nvme); 3395