/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
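
/*
 * Example configurations using the options documented at the top of this
 * file. The device paths, sizes, and serial numbers below are illustrative
 * only, not values required by the emulation:
 *
 *   -s 4,nvme,/dev/zvol/tank/vmdisk0,maxq=8,qsz=1024,ioslots=32,ser=BHYVE0001
 *   -s 4,nvme,ram=1024,dsm=enable
 */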

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE	296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
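
/*
 * For reference, with the values above: NVME_MDTS of 9 gives
 * NVME_MAX_IOVEC = (1 << 9) + 1 = 513 descriptors and
 * NVME_MAX_DATA_SIZE = 512 * 4096 bytes = 2 MiB per transfer. If, for
 * example, BLOCKIF_IOV_MAX (from block_if.h) were 128, MDTS_PAD_SIZE
 * would work out to 513 - 128 = 385 extra iovec entries per request.
 */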

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define PCI_NVME_AEI_NOTICE_SHIFT	8
#define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *     SMART / Health Critical Warnings
 *     Namespace Attribute Notices
 */
#define PCI_NVME_AEN_DEFAULT_MASK	0x11f

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t
iosemlock; 334 335 /* 336 * Memory mapped Submission and Completion queues 337 * Each array includes both Admin and IO queues 338 */ 339 struct nvme_completion_queue *compl_queues; 340 struct nvme_submission_queue *submit_queues; 341 342 struct nvme_feature_obj feat[NVME_FID_MAX]; 343 344 enum nvme_dsm_type dataset_management; 345 346 /* Accounting for SMART data */ 347 __uint128_t read_data_units; 348 __uint128_t write_data_units; 349 __uint128_t read_commands; 350 __uint128_t write_commands; 351 uint32_t read_dunits_remainder; 352 uint32_t write_dunits_remainder; 353 354 STAILQ_HEAD(, pci_nvme_aer) aer_list; 355 pthread_mutex_t aer_mtx; 356 uint32_t aer_count; 357 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 358 pthread_t aen_tid; 359 pthread_mutex_t aen_mtx; 360 pthread_cond_t aen_cond; 361 }; 362 363 364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 365 struct nvme_completion_queue *cq, 366 uint32_t cdw0, 367 uint16_t cid, 368 uint16_t sqid, 369 uint16_t status); 370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 372 static void pci_nvme_io_done(struct blockif_req *, int); 373 374 /* Controller Configuration utils */ 375 #define NVME_CC_GET_EN(cc) \ 376 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 377 #define NVME_CC_GET_CSS(cc) \ 378 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 379 #define NVME_CC_GET_SHN(cc) \ 380 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 381 #define NVME_CC_GET_IOSQES(cc) \ 382 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 383 #define NVME_CC_GET_IOCQES(cc) \ 384 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 385 386 #define NVME_CC_WRITE_MASK \ 387 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 388 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 389 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 390 391 #define NVME_CC_NEN_WRITE_MASK \ 392 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 393 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 394 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 395 396 /* Controller Status utils */ 397 #define NVME_CSTS_GET_RDY(sts) \ 398 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 399 400 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 401 402 /* Completion Queue status word utils */ 403 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 404 #define NVME_STATUS_MASK \ 405 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 406 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 407 408 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 409 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 410 411 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 412 struct nvme_feature_obj *, 413 struct nvme_command *, 414 struct nvme_completion *); 415 static void nvme_feature_temperature(struct pci_nvme_softc *, 416 struct nvme_feature_obj *, 417 struct nvme_command *, 418 struct nvme_completion *); 419 static void nvme_feature_num_queues(struct pci_nvme_softc *, 420 struct nvme_feature_obj *, 421 struct nvme_command *, 422 struct nvme_completion *); 423 static void nvme_feature_iv_config(struct pci_nvme_softc *, 424 struct nvme_feature_obj *, 425 struct nvme_command *, 426 struct nvme_completion *); 427 static void nvme_feature_async_event(struct pci_nvme_softc *, 428 struct nvme_feature_obj *, 429 struct nvme_command *, 430 struct nvme_completion *); 431 432 static void *aen_thr(void *arg); 433 434 static __inline 
void 435 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 436 { 437 size_t len; 438 439 len = strnlen(src, dst_size); 440 memset(dst, pad, dst_size); 441 memcpy(dst, src, len); 442 } 443 444 static __inline void 445 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 446 { 447 448 *status &= ~NVME_STATUS_MASK; 449 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 450 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 451 } 452 453 static __inline void 454 pci_nvme_status_genc(uint16_t *status, uint16_t code) 455 { 456 457 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 458 } 459 460 /* 461 * Initialize the requested number or IO Submission and Completion Queues. 462 * Admin queues are allocated implicitly. 463 */ 464 static void 465 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 466 { 467 uint32_t i; 468 469 /* 470 * Allocate and initialize the Submission Queues 471 */ 472 if (nsq > NVME_QUEUES) { 473 WPRINTF("%s: clamping number of SQ from %u to %u", 474 __func__, nsq, NVME_QUEUES); 475 nsq = NVME_QUEUES; 476 } 477 478 sc->num_squeues = nsq; 479 480 sc->submit_queues = calloc(sc->num_squeues + 1, 481 sizeof(struct nvme_submission_queue)); 482 if (sc->submit_queues == NULL) { 483 WPRINTF("%s: SQ allocation failed", __func__); 484 sc->num_squeues = 0; 485 } else { 486 struct nvme_submission_queue *sq = sc->submit_queues; 487 488 for (i = 0; i < sc->num_squeues + 1; i++) 489 pthread_mutex_init(&sq[i].mtx, NULL); 490 } 491 492 /* 493 * Allocate and initialize the Completion Queues 494 */ 495 if (ncq > NVME_QUEUES) { 496 WPRINTF("%s: clamping number of CQ from %u to %u", 497 __func__, ncq, NVME_QUEUES); 498 ncq = NVME_QUEUES; 499 } 500 501 sc->num_cqueues = ncq; 502 503 sc->compl_queues = calloc(sc->num_cqueues + 1, 504 sizeof(struct nvme_completion_queue)); 505 if (sc->compl_queues == NULL) { 506 WPRINTF("%s: CQ allocation failed", __func__); 507 sc->num_cqueues = 0; 508 } else { 509 struct nvme_completion_queue *cq = sc->compl_queues; 510 511 for (i = 0; i < sc->num_cqueues + 1; i++) 512 pthread_mutex_init(&cq[i].mtx, NULL); 513 } 514 } 515 516 static void 517 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 518 { 519 struct nvme_controller_data *cd = &sc->ctrldata; 520 521 cd->vid = 0xFB5D; 522 cd->ssvid = 0x0000; 523 524 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 525 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 526 527 /* Num of submission commands that we can handle at a time (2^rab) */ 528 cd->rab = 4; 529 530 /* FreeBSD OUI */ 531 cd->ieee[0] = 0x58; 532 cd->ieee[1] = 0x9c; 533 cd->ieee[2] = 0xfc; 534 535 cd->mic = 0; 536 537 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 538 539 cd->ver = NVME_REV(1,4); 540 541 cd->cntrltype = NVME_CNTRLTYPE_IO; 542 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 543 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); 544 cd->acl = 2; 545 cd->aerl = 4; 546 547 /* Advertise 1, Read-only firmware slot */ 548 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 549 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 550 cd->lpa = 0; /* TODO: support some simple things like SMART */ 551 cd->elpe = 0; /* max error log page entries */ 552 cd->npss = 1; /* number of power states support */ 553 554 /* Warning Composite Temperature Threshold */ 555 cd->wctemp = 0x0157; 556 cd->cctemp = 0x0157; 557 558 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 559 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 560 cd->cqes = (4 << 
NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 561 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 562 cd->nn = 1; /* number of namespaces */ 563 564 cd->oncs = 0; 565 switch (sc->dataset_management) { 566 case NVME_DATASET_MANAGEMENT_AUTO: 567 if (sc->nvstore.deallocate) 568 cd->oncs |= NVME_ONCS_DSM; 569 break; 570 case NVME_DATASET_MANAGEMENT_ENABLE: 571 cd->oncs |= NVME_ONCS_DSM; 572 break; 573 default: 574 break; 575 } 576 577 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 578 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 579 580 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 581 582 cd->power_state[0].mp = 10; 583 } 584 585 /* 586 * Calculate the CRC-16 of the given buffer 587 * See copyright attribution at top of file 588 */ 589 static uint16_t 590 crc16(uint16_t crc, const void *buffer, unsigned int len) 591 { 592 const unsigned char *cp = buffer; 593 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 594 static uint16_t const crc16_table[256] = { 595 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 596 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 597 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 598 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 599 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 600 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 601 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 602 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 603 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 604 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 605 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 606 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 607 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 608 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 609 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 610 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 611 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 612 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 613 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 614 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 615 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 616 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 617 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 618 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 619 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 620 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 621 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 622 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 623 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 624 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 625 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 626 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 627 }; 628 629 while (len--) 630 crc = (((crc >> 8) & 0xffU) ^ 631 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 632 return crc; 633 } 634 635 static void 636 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 637 struct nvme_namespace_data *nd) 638 { 639 640 /* Get capacity and block size information from backing store */ 641 nd->nsze = nvstore->size / nvstore->sectsz; 
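	/*
	 * This emulation keeps the namespace fully allocated, so the
	 * capacity (ncap) and utilization (nuse) reported to the guest
	 * always match the namespace size (nsze) computed above.
	 */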
642 nd->ncap = nd->nsze; 643 nd->nuse = nd->nsze; 644 } 645 646 static void 647 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 648 struct nvme_namespace_data *nd, uint32_t nsid, 649 struct pci_nvme_blockstore *nvstore) 650 { 651 652 pci_nvme_init_nsdata_size(nvstore, nd); 653 654 if (nvstore->type == NVME_STOR_BLOCKIF) 655 nvstore->deallocate = blockif_candelete(nvstore->ctx); 656 657 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 658 nd->flbas = 0; 659 660 /* Create an EUI-64 if user did not provide one */ 661 if (nvstore->eui64 == 0) { 662 char *data = NULL; 663 uint64_t eui64 = nvstore->eui64; 664 665 asprintf(&data, "%s%u%u%u", get_config_value("name"), 666 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 667 sc->nsc_pi->pi_func); 668 669 if (data != NULL) { 670 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 671 free(data); 672 } 673 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 674 } 675 be64enc(nd->eui64, nvstore->eui64); 676 677 /* LBA data-sz = 2^lbads */ 678 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 679 } 680 681 static void 682 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 683 { 684 685 memset(&sc->err_log, 0, sizeof(sc->err_log)); 686 memset(&sc->health_log, 0, sizeof(sc->health_log)); 687 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 688 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 689 690 /* Set read/write remainder to round up according to spec */ 691 sc->read_dunits_remainder = 999; 692 sc->write_dunits_remainder = 999; 693 694 /* Set nominal Health values checked by implementations */ 695 sc->health_log.temperature = NVME_TEMPERATURE; 696 sc->health_log.available_spare = 100; 697 sc->health_log.available_spare_threshold = 10; 698 } 699 700 static void 701 pci_nvme_init_features(struct pci_nvme_softc *sc) 702 { 703 enum nvme_feature fid; 704 705 for (fid = 0; fid < NVME_FID_MAX; fid++) { 706 switch (fid) { 707 case NVME_FEAT_ARBITRATION: 708 case NVME_FEAT_POWER_MANAGEMENT: 709 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 710 case NVME_FEAT_WRITE_ATOMICITY: 711 /* Mandatory but no special handling required */ 712 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 713 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 714 // this returns a data buffer 715 break; 716 case NVME_FEAT_TEMPERATURE_THRESHOLD: 717 sc->feat[fid].set = nvme_feature_temperature; 718 break; 719 case NVME_FEAT_ERROR_RECOVERY: 720 sc->feat[fid].namespace_specific = true; 721 break; 722 case NVME_FEAT_NUMBER_OF_QUEUES: 723 sc->feat[fid].set = nvme_feature_num_queues; 724 break; 725 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 726 sc->feat[fid].set = nvme_feature_iv_config; 727 break; 728 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 729 sc->feat[fid].set = nvme_feature_async_event; 730 /* Enable all AENs by default */ 731 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 732 break; 733 default: 734 sc->feat[fid].set = nvme_feature_invalid_cb; 735 sc->feat[fid].get = nvme_feature_invalid_cb; 736 } 737 } 738 } 739 740 static void 741 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 742 { 743 744 STAILQ_INIT(&sc->aer_list); 745 sc->aer_count = 0; 746 } 747 748 static void 749 pci_nvme_aer_init(struct pci_nvme_softc *sc) 750 { 751 752 pthread_mutex_init(&sc->aer_mtx, NULL); 753 pci_nvme_aer_reset(sc); 754 } 755 756 static void 757 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 758 { 759 struct pci_nvme_aer *aer = NULL; 760 761 pthread_mutex_lock(&sc->aer_mtx); 762 while (!STAILQ_EMPTY(&sc->aer_list)) { 763 aer = STAILQ_FIRST(&sc->aer_list); 764 
STAILQ_REMOVE_HEAD(&sc->aer_list, link); 765 free(aer); 766 } 767 pthread_mutex_unlock(&sc->aer_mtx); 768 769 pci_nvme_aer_reset(sc); 770 } 771 772 static bool 773 pci_nvme_aer_available(struct pci_nvme_softc *sc) 774 { 775 776 return (sc->aer_count != 0); 777 } 778 779 static bool 780 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 781 { 782 struct nvme_controller_data *cd = &sc->ctrldata; 783 784 /* AERL is a zero based value while aer_count is one's based */ 785 return (sc->aer_count == (cd->aerl + 1)); 786 } 787 788 /* 789 * Add an Async Event Request 790 * 791 * Stores an AER to be returned later if the Controller needs to notify the 792 * host of an event. 793 * Note that while the NVMe spec doesn't require Controllers to return AER's 794 * in order, this implementation does preserve the order. 795 */ 796 static int 797 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 798 { 799 struct pci_nvme_aer *aer = NULL; 800 801 aer = calloc(1, sizeof(struct pci_nvme_aer)); 802 if (aer == NULL) 803 return (-1); 804 805 /* Save the Command ID for use in the completion message */ 806 aer->cid = cid; 807 808 pthread_mutex_lock(&sc->aer_mtx); 809 sc->aer_count++; 810 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 811 pthread_mutex_unlock(&sc->aer_mtx); 812 813 return (0); 814 } 815 816 /* 817 * Get an Async Event Request structure 818 * 819 * Returns a pointer to an AER previously submitted by the host or NULL if 820 * no AER's exist. Caller is responsible for freeing the returned struct. 821 */ 822 static struct pci_nvme_aer * 823 pci_nvme_aer_get(struct pci_nvme_softc *sc) 824 { 825 struct pci_nvme_aer *aer = NULL; 826 827 pthread_mutex_lock(&sc->aer_mtx); 828 aer = STAILQ_FIRST(&sc->aer_list); 829 if (aer != NULL) { 830 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 831 sc->aer_count--; 832 } 833 pthread_mutex_unlock(&sc->aer_mtx); 834 835 return (aer); 836 } 837 838 static void 839 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 840 { 841 uint32_t atype; 842 843 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 844 845 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 846 sc->aen[atype].atype = atype; 847 } 848 } 849 850 static void 851 pci_nvme_aen_init(struct pci_nvme_softc *sc) 852 { 853 char nstr[80]; 854 855 pci_nvme_aen_reset(sc); 856 857 pthread_mutex_init(&sc->aen_mtx, NULL); 858 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 859 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 860 sc->nsc_pi->pi_func); 861 pthread_set_name_np(sc->aen_tid, nstr); 862 } 863 864 static void 865 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 866 { 867 868 pci_nvme_aen_reset(sc); 869 } 870 871 /* Notify the AEN thread of pending work */ 872 static void 873 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 874 { 875 876 pthread_cond_signal(&sc->aen_cond); 877 } 878 879 /* 880 * Post an Asynchronous Event Notification 881 */ 882 static int32_t 883 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 884 uint32_t event_data) 885 { 886 struct pci_nvme_aen *aen; 887 888 if (atype >= PCI_NVME_AE_TYPE_MAX) { 889 return(EINVAL); 890 } 891 892 pthread_mutex_lock(&sc->aen_mtx); 893 aen = &sc->aen[atype]; 894 895 /* Has the controller already posted an event of this type? 
*/ 896 if (aen->posted) { 897 pthread_mutex_unlock(&sc->aen_mtx); 898 return(EALREADY); 899 } 900 901 aen->event_data = event_data; 902 aen->posted = true; 903 pthread_mutex_unlock(&sc->aen_mtx); 904 905 pci_nvme_aen_notify(sc); 906 907 return(0); 908 } 909 910 static void 911 pci_nvme_aen_process(struct pci_nvme_softc *sc) 912 { 913 struct pci_nvme_aer *aer; 914 struct pci_nvme_aen *aen; 915 pci_nvme_async_type atype; 916 uint32_t mask; 917 uint16_t status; 918 uint8_t lid; 919 920 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 921 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 922 aen = &sc->aen[atype]; 923 /* Previous iterations may have depleted the available AER's */ 924 if (!pci_nvme_aer_available(sc)) { 925 DPRINTF("%s: no AER", __func__); 926 break; 927 } 928 929 if (!aen->posted) { 930 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 931 continue; 932 } 933 934 status = NVME_SC_SUCCESS; 935 936 /* Is the event masked? */ 937 mask = 938 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 939 940 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 941 switch (atype) { 942 case PCI_NVME_AE_TYPE_ERROR: 943 lid = NVME_LOG_ERROR; 944 break; 945 case PCI_NVME_AE_TYPE_SMART: 946 mask &= 0xff; 947 if ((mask & aen->event_data) == 0) 948 continue; 949 lid = NVME_LOG_HEALTH_INFORMATION; 950 break; 951 case PCI_NVME_AE_TYPE_NOTICE: 952 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 953 EPRINTLN("%s unknown AEN notice type %u", 954 __func__, aen->event_data); 955 status = NVME_SC_INTERNAL_DEVICE_ERROR; 956 break; 957 } 958 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 959 continue; 960 switch (aen->event_data) { 961 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 962 lid = NVME_LOG_CHANGED_NAMESPACE; 963 break; 964 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 965 lid = NVME_LOG_FIRMWARE_SLOT; 966 break; 967 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 968 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 969 break; 970 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 971 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 972 break; 973 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 974 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 975 break; 976 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 977 lid = NVME_LOG_LBA_STATUS_INFORMATION; 978 break; 979 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 980 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 981 break; 982 default: 983 lid = 0; 984 } 985 break; 986 default: 987 /* bad type?!? 
*/ 988 EPRINTLN("%s unknown AEN type %u", __func__, atype); 989 status = NVME_SC_INTERNAL_DEVICE_ERROR; 990 break; 991 } 992 993 aer = pci_nvme_aer_get(sc); 994 assert(aer != NULL); 995 996 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 997 pci_nvme_cq_update(sc, &sc->compl_queues[0], 998 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 999 aer->cid, 1000 0, /* SQID */ 1001 status); 1002 1003 aen->event_data = 0; 1004 aen->posted = false; 1005 1006 pci_generate_msix(sc->nsc_pi, 0); 1007 } 1008 } 1009 1010 static void * 1011 aen_thr(void *arg) 1012 { 1013 struct pci_nvme_softc *sc; 1014 1015 sc = arg; 1016 1017 pthread_mutex_lock(&sc->aen_mtx); 1018 for (;;) { 1019 pci_nvme_aen_process(sc); 1020 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1021 } 1022 pthread_mutex_unlock(&sc->aen_mtx); 1023 1024 pthread_exit(NULL); 1025 return (NULL); 1026 } 1027 1028 static void 1029 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1030 { 1031 uint32_t i; 1032 1033 DPRINTF("%s", __func__); 1034 1035 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1036 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1037 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1038 1039 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1040 1041 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1042 1043 sc->regs.cc = 0; 1044 1045 assert(sc->submit_queues != NULL); 1046 1047 for (i = 0; i < sc->num_squeues + 1; i++) { 1048 sc->submit_queues[i].qbase = NULL; 1049 sc->submit_queues[i].size = 0; 1050 sc->submit_queues[i].cqid = 0; 1051 sc->submit_queues[i].tail = 0; 1052 sc->submit_queues[i].head = 0; 1053 } 1054 1055 assert(sc->compl_queues != NULL); 1056 1057 for (i = 0; i < sc->num_cqueues + 1; i++) { 1058 sc->compl_queues[i].qbase = NULL; 1059 sc->compl_queues[i].size = 0; 1060 sc->compl_queues[i].tail = 0; 1061 sc->compl_queues[i].head = 0; 1062 } 1063 1064 sc->num_q_is_set = false; 1065 1066 pci_nvme_aer_destroy(sc); 1067 pci_nvme_aen_destroy(sc); 1068 1069 /* 1070 * Clear CSTS.RDY last to prevent the host from enabling Controller 1071 * before cleanup completes 1072 */ 1073 sc->regs.csts = 0; 1074 } 1075 1076 static void 1077 pci_nvme_reset(struct pci_nvme_softc *sc) 1078 { 1079 pthread_mutex_lock(&sc->mtx); 1080 pci_nvme_reset_locked(sc); 1081 pthread_mutex_unlock(&sc->mtx); 1082 } 1083 1084 static void 1085 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1086 { 1087 uint16_t acqs, asqs; 1088 1089 DPRINTF("%s", __func__); 1090 1091 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 1092 sc->submit_queues[0].size = asqs; 1093 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1094 sizeof(struct nvme_command) * asqs); 1095 1096 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1097 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1098 1099 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1100 NVME_AQA_REG_ACQS_MASK) + 1; 1101 sc->compl_queues[0].size = acqs; 1102 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1103 sizeof(struct nvme_completion) * acqs); 1104 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1105 1106 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1107 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1108 } 1109 1110 static int 1111 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1112 size_t len, enum nvme_copy_dir dir) 1113 { 1114 uint8_t *p; 1115 size_t bytes; 1116 1117 if (len > (8 * 1024)) { 1118 return (-1); 1119 } 1120 1121 /* Copy from the start of prp1 to 
the end of the physical page */ 1122 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1123 bytes = MIN(bytes, len); 1124 1125 p = vm_map_gpa(ctx, prp1, bytes); 1126 if (p == NULL) { 1127 return (-1); 1128 } 1129 1130 if (dir == NVME_COPY_TO_PRP) 1131 memcpy(p, b, bytes); 1132 else 1133 memcpy(b, p, bytes); 1134 1135 b += bytes; 1136 1137 len -= bytes; 1138 if (len == 0) { 1139 return (0); 1140 } 1141 1142 len = MIN(len, PAGE_SIZE); 1143 1144 p = vm_map_gpa(ctx, prp2, len); 1145 if (p == NULL) { 1146 return (-1); 1147 } 1148 1149 if (dir == NVME_COPY_TO_PRP) 1150 memcpy(p, b, len); 1151 else 1152 memcpy(b, p, len); 1153 1154 return (0); 1155 } 1156 1157 /* 1158 * Write a Completion Queue Entry update 1159 * 1160 * Write the completion and update the doorbell value 1161 */ 1162 static void 1163 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1164 struct nvme_completion_queue *cq, 1165 uint32_t cdw0, 1166 uint16_t cid, 1167 uint16_t sqid, 1168 uint16_t status) 1169 { 1170 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1171 struct nvme_completion *cqe; 1172 1173 assert(cq->qbase != NULL); 1174 1175 pthread_mutex_lock(&cq->mtx); 1176 1177 cqe = &cq->qbase[cq->tail]; 1178 1179 /* Flip the phase bit */ 1180 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1181 1182 cqe->cdw0 = cdw0; 1183 cqe->sqhd = sq->head; 1184 cqe->sqid = sqid; 1185 cqe->cid = cid; 1186 cqe->status = status; 1187 1188 cq->tail++; 1189 if (cq->tail >= cq->size) { 1190 cq->tail = 0; 1191 } 1192 1193 pthread_mutex_unlock(&cq->mtx); 1194 } 1195 1196 static int 1197 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1198 struct nvme_completion* compl) 1199 { 1200 uint16_t qid = command->cdw10 & 0xffff; 1201 1202 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1203 if (qid == 0 || qid > sc->num_squeues || 1204 (sc->submit_queues[qid].qbase == NULL)) { 1205 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1206 __func__, qid, sc->num_squeues); 1207 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1208 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1209 return (1); 1210 } 1211 1212 sc->submit_queues[qid].qbase = NULL; 1213 sc->submit_queues[qid].cqid = 0; 1214 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1215 return (1); 1216 } 1217 1218 static int 1219 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1220 struct nvme_completion* compl) 1221 { 1222 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1223 uint16_t qid = command->cdw10 & 0xffff; 1224 struct nvme_submission_queue *nsq; 1225 1226 if ((qid == 0) || (qid > sc->num_squeues) || 1227 (sc->submit_queues[qid].qbase != NULL)) { 1228 WPRINTF("%s queue index %u > num_squeues %u", 1229 __func__, qid, sc->num_squeues); 1230 pci_nvme_status_tc(&compl->status, 1231 NVME_SCT_COMMAND_SPECIFIC, 1232 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1233 return (1); 1234 } 1235 1236 nsq = &sc->submit_queues[qid]; 1237 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1238 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1239 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1240 /* 1241 * Queues must specify at least two entries 1242 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1243 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1244 */ 1245 pci_nvme_status_tc(&compl->status, 1246 NVME_SCT_COMMAND_SPECIFIC, 1247 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1248 return (1); 1249 } 1250 nsq->head = nsq->tail = 0; 1251 1252 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1253 if ((nsq->cqid 
== 0) || (nsq->cqid > sc->num_cqueues)) { 1254 pci_nvme_status_tc(&compl->status, 1255 NVME_SCT_COMMAND_SPECIFIC, 1256 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1257 return (1); 1258 } 1259 1260 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1261 pci_nvme_status_tc(&compl->status, 1262 NVME_SCT_COMMAND_SPECIFIC, 1263 NVME_SC_COMPLETION_QUEUE_INVALID); 1264 return (1); 1265 } 1266 1267 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1268 1269 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1270 sizeof(struct nvme_command) * (size_t)nsq->size); 1271 1272 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1273 qid, nsq->size, nsq->qbase, nsq->cqid); 1274 1275 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1276 1277 DPRINTF("%s completed creating IOSQ qid %u", 1278 __func__, qid); 1279 } else { 1280 /* 1281 * Guest sent non-cont submission queue request. 1282 * This setting is unsupported by this emulation. 1283 */ 1284 WPRINTF("%s unsupported non-contig (list-based) " 1285 "create i/o submission queue", __func__); 1286 1287 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1288 } 1289 return (1); 1290 } 1291 1292 static int 1293 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1294 struct nvme_completion* compl) 1295 { 1296 uint16_t qid = command->cdw10 & 0xffff; 1297 uint16_t sqid; 1298 1299 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1300 if (qid == 0 || qid > sc->num_cqueues || 1301 (sc->compl_queues[qid].qbase == NULL)) { 1302 WPRINTF("%s queue index %u / num_cqueues %u", 1303 __func__, qid, sc->num_cqueues); 1304 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1305 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1306 return (1); 1307 } 1308 1309 /* Deleting an Active CQ is an error */ 1310 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1311 if (sc->submit_queues[sqid].cqid == qid) { 1312 pci_nvme_status_tc(&compl->status, 1313 NVME_SCT_COMMAND_SPECIFIC, 1314 NVME_SC_INVALID_QUEUE_DELETION); 1315 return (1); 1316 } 1317 1318 sc->compl_queues[qid].qbase = NULL; 1319 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1320 return (1); 1321 } 1322 1323 static int 1324 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1325 struct nvme_completion* compl) 1326 { 1327 struct nvme_completion_queue *ncq; 1328 uint16_t qid = command->cdw10 & 0xffff; 1329 1330 /* Only support Physically Contiguous queues */ 1331 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1332 WPRINTF("%s unsupported non-contig (list-based) " 1333 "create i/o completion queue", 1334 __func__); 1335 1336 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1337 return (1); 1338 } 1339 1340 if ((qid == 0) || (qid > sc->num_cqueues) || 1341 (sc->compl_queues[qid].qbase != NULL)) { 1342 WPRINTF("%s queue index %u > num_cqueues %u", 1343 __func__, qid, sc->num_cqueues); 1344 pci_nvme_status_tc(&compl->status, 1345 NVME_SCT_COMMAND_SPECIFIC, 1346 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1347 return (1); 1348 } 1349 1350 ncq = &sc->compl_queues[qid]; 1351 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1352 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1353 if (ncq->intr_vec > (sc->max_queues + 1)) { 1354 pci_nvme_status_tc(&compl->status, 1355 NVME_SCT_COMMAND_SPECIFIC, 1356 NVME_SC_INVALID_INTERRUPT_VECTOR); 1357 return (1); 1358 } 1359 1360 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1361 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1362 /* 1363 * Queues must specify at least two entries 1364 * 
NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
1476 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1477 struct nvme_completion* compl) 1478 { 1479 void *dest; 1480 uint16_t status; 1481 1482 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1483 command->cdw10 & 0xFF, command->nsid); 1484 1485 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1486 1487 switch (command->cdw10 & 0xFF) { 1488 case 0x00: /* return Identify Namespace data structure */ 1489 /* Global NS only valid with NS Management */ 1490 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1491 pci_nvme_status_genc(&status, 1492 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1493 break; 1494 } 1495 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1496 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1497 NVME_COPY_TO_PRP); 1498 break; 1499 case 0x01: /* return Identify Controller data structure */ 1500 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1501 command->prp2, (uint8_t *)&sc->ctrldata, 1502 sizeof(sc->ctrldata), 1503 NVME_COPY_TO_PRP); 1504 break; 1505 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1506 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1507 sizeof(uint32_t) * 1024); 1508 /* All unused entries shall be zero */ 1509 bzero(dest, sizeof(uint32_t) * 1024); 1510 ((uint32_t *)dest)[0] = 1; 1511 break; 1512 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1513 if (command->nsid != 1) { 1514 pci_nvme_status_genc(&status, 1515 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1516 break; 1517 } 1518 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1519 sizeof(uint32_t) * 1024); 1520 /* All bytes after the descriptor shall be zero */ 1521 bzero(dest, sizeof(uint32_t) * 1024); 1522 1523 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1524 ((uint8_t *)dest)[0] = 1; 1525 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1526 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1527 break; 1528 default: 1529 DPRINTF("%s unsupported identify command requested 0x%x", 1530 __func__, command->cdw10 & 0xFF); 1531 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1532 break; 1533 } 1534 1535 compl->status = status; 1536 return (1); 1537 } 1538 1539 static const char * 1540 nvme_fid_to_name(uint8_t fid) 1541 { 1542 const char *name; 1543 1544 switch (fid) { 1545 case NVME_FEAT_ARBITRATION: 1546 name = "Arbitration"; 1547 break; 1548 case NVME_FEAT_POWER_MANAGEMENT: 1549 name = "Power Management"; 1550 break; 1551 case NVME_FEAT_LBA_RANGE_TYPE: 1552 name = "LBA Range Type"; 1553 break; 1554 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1555 name = "Temperature Threshold"; 1556 break; 1557 case NVME_FEAT_ERROR_RECOVERY: 1558 name = "Error Recovery"; 1559 break; 1560 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1561 name = "Volatile Write Cache"; 1562 break; 1563 case NVME_FEAT_NUMBER_OF_QUEUES: 1564 name = "Number of Queues"; 1565 break; 1566 case NVME_FEAT_INTERRUPT_COALESCING: 1567 name = "Interrupt Coalescing"; 1568 break; 1569 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1570 name = "Interrupt Vector Configuration"; 1571 break; 1572 case NVME_FEAT_WRITE_ATOMICITY: 1573 name = "Write Atomicity Normal"; 1574 break; 1575 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1576 name = "Asynchronous Event Configuration"; 1577 break; 1578 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1579 name = "Autonomous Power State Transition"; 1580 break; 1581 case NVME_FEAT_HOST_MEMORY_BUFFER: 1582 name = "Host Memory Buffer"; 1583 break; 1584 case NVME_FEAT_TIMESTAMP: 1585 name = "Timestamp"; 1586 break; 1587 case NVME_FEAT_KEEP_ALIVE_TIMER: 
1588 name = "Keep Alive Timer"; 1589 break; 1590 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1591 name = "Host Controlled Thermal Management"; 1592 break; 1593 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1594 name = "Non-Operation Power State Config"; 1595 break; 1596 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1597 name = "Read Recovery Level Config"; 1598 break; 1599 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1600 name = "Predictable Latency Mode Config"; 1601 break; 1602 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1603 name = "Predictable Latency Mode Window"; 1604 break; 1605 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1606 name = "LBA Status Information Report Interval"; 1607 break; 1608 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1609 name = "Host Behavior Support"; 1610 break; 1611 case NVME_FEAT_SANITIZE_CONFIG: 1612 name = "Sanitize Config"; 1613 break; 1614 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1615 name = "Endurance Group Event Configuration"; 1616 break; 1617 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1618 name = "Software Progress Marker"; 1619 break; 1620 case NVME_FEAT_HOST_IDENTIFIER: 1621 name = "Host Identifier"; 1622 break; 1623 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1624 name = "Reservation Notification Mask"; 1625 break; 1626 case NVME_FEAT_RESERVATION_PERSISTENCE: 1627 name = "Reservation Persistence"; 1628 break; 1629 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1630 name = "Namespace Write Protection Config"; 1631 break; 1632 default: 1633 name = "Unknown"; 1634 break; 1635 } 1636 1637 return (name); 1638 } 1639 1640 static void 1641 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1642 struct nvme_feature_obj *feat, 1643 struct nvme_command *command, 1644 struct nvme_completion *compl) 1645 { 1646 1647 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1648 } 1649 1650 static void 1651 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1652 struct nvme_feature_obj *feat, 1653 struct nvme_command *command, 1654 struct nvme_completion *compl) 1655 { 1656 uint32_t i; 1657 uint32_t cdw11 = command->cdw11; 1658 uint16_t iv; 1659 bool cd; 1660 1661 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1662 1663 iv = cdw11 & 0xffff; 1664 cd = cdw11 & (1 << 16); 1665 1666 if (iv > (sc->max_queues + 1)) { 1667 return; 1668 } 1669 1670 /* No Interrupt Coalescing (i.e. 
not Coalescing Disable) for Admin Q */ 1671 if ((iv == 0) && !cd) 1672 return; 1673 1674 /* Requested Interrupt Vector must be used by a CQ */ 1675 for (i = 0; i < sc->num_cqueues + 1; i++) { 1676 if (sc->compl_queues[i].intr_vec == iv) { 1677 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1678 } 1679 } 1680 } 1681 1682 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1683 static void 1684 nvme_feature_async_event(struct pci_nvme_softc *sc, 1685 struct nvme_feature_obj *feat, 1686 struct nvme_command *command, 1687 struct nvme_completion *compl) 1688 { 1689 1690 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1691 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1692 } 1693 1694 #define NVME_TEMP_THRESH_OVER 0 1695 #define NVME_TEMP_THRESH_UNDER 1 1696 static void 1697 nvme_feature_temperature(struct pci_nvme_softc *sc, 1698 struct nvme_feature_obj *feat, 1699 struct nvme_command *command, 1700 struct nvme_completion *compl) 1701 { 1702 uint16_t tmpth; /* Temperature Threshold */ 1703 uint8_t tmpsel; /* Threshold Temperature Select */ 1704 uint8_t thsel; /* Threshold Type Select */ 1705 bool set_crit = false; 1706 1707 tmpth = command->cdw11 & 0xffff; 1708 tmpsel = (command->cdw11 >> 16) & 0xf; 1709 thsel = (command->cdw11 >> 20) & 0x3; 1710 1711 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1712 1713 /* Check for unsupported values */ 1714 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1715 (thsel > NVME_TEMP_THRESH_UNDER)) { 1716 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1717 return; 1718 } 1719 1720 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1721 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1722 set_crit = true; 1723 1724 pthread_mutex_lock(&sc->mtx); 1725 if (set_crit) 1726 sc->health_log.critical_warning |= 1727 NVME_CRIT_WARN_ST_TEMPERATURE; 1728 else 1729 sc->health_log.critical_warning &= 1730 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1731 pthread_mutex_unlock(&sc->mtx); 1732 1733 if (set_crit) 1734 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1735 sc->health_log.critical_warning); 1736 1737 1738 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 
'T':'F', sc->health_log.critical_warning, compl->status); 1739 } 1740 1741 static void 1742 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1743 struct nvme_feature_obj *feat, 1744 struct nvme_command *command, 1745 struct nvme_completion *compl) 1746 { 1747 uint16_t nqr; /* Number of Queues Requested */ 1748 1749 if (sc->num_q_is_set) { 1750 WPRINTF("%s: Number of Queues already set", __func__); 1751 pci_nvme_status_genc(&compl->status, 1752 NVME_SC_COMMAND_SEQUENCE_ERROR); 1753 return; 1754 } 1755 1756 nqr = command->cdw11 & 0xFFFF; 1757 if (nqr == 0xffff) { 1758 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1759 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1760 return; 1761 } 1762 1763 sc->num_squeues = ONE_BASED(nqr); 1764 if (sc->num_squeues > sc->max_queues) { 1765 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1766 sc->max_queues); 1767 sc->num_squeues = sc->max_queues; 1768 } 1769 1770 nqr = (command->cdw11 >> 16) & 0xFFFF; 1771 if (nqr == 0xffff) { 1772 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1773 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1774 return; 1775 } 1776 1777 sc->num_cqueues = ONE_BASED(nqr); 1778 if (sc->num_cqueues > sc->max_queues) { 1779 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1780 sc->max_queues); 1781 sc->num_cqueues = sc->max_queues; 1782 } 1783 1784 /* Patch the command value which will be saved on callback's return */ 1785 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1786 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1787 1788 sc->num_q_is_set = true; 1789 } 1790 1791 static int 1792 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1793 struct nvme_completion *compl) 1794 { 1795 struct nvme_feature_obj *feat; 1796 uint32_t nsid = command->nsid; 1797 uint8_t fid = command->cdw10 & 0xFF; 1798 1799 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1800 1801 if (fid >= NVME_FID_MAX) { 1802 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1803 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1804 return (1); 1805 } 1806 feat = &sc->feat[fid]; 1807 1808 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1809 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1810 return (1); 1811 } 1812 1813 if (!feat->namespace_specific && 1814 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1815 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1816 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1817 return (1); 1818 } 1819 1820 compl->cdw0 = 0; 1821 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1822 1823 if (feat->set) 1824 feat->set(sc, feat, command, compl); 1825 1826 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1827 if (compl->status == NVME_SC_SUCCESS) { 1828 feat->cdw11 = command->cdw11; 1829 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1830 (command->cdw11 != 0)) 1831 pci_nvme_aen_notify(sc); 1832 } 1833 1834 return (0); 1835 } 1836 1837 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1838 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1839 1840 static int 1841 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1842 struct nvme_completion* compl) 1843 { 1844 struct nvme_feature_obj *feat; 1845 uint8_t fid = command->cdw10 & 0xFF; 1846 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1847 1848 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1849 1850 if (fid >= NVME_FID_MAX) { 1851 DPRINTF("%s invalid 
feature 0x%x", __func__, fid); 1852 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1853 return (1); 1854 } 1855 1856 compl->cdw0 = 0; 1857 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1858 1859 feat = &sc->feat[fid]; 1860 if (feat->get) { 1861 feat->get(sc, feat, command, compl); 1862 } 1863 1864 if (compl->status == NVME_SC_SUCCESS) { 1865 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1866 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1867 else 1868 compl->cdw0 = feat->cdw11; 1869 } 1870 1871 return (0); 1872 } 1873 1874 static int 1875 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1876 struct nvme_completion* compl) 1877 { 1878 uint8_t ses, lbaf, pi; 1879 1880 /* Only supports Secure Erase Setting - User Data Erase */ 1881 ses = (command->cdw10 >> 9) & 0x7; 1882 if (ses > 0x1) { 1883 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1884 return (1); 1885 } 1886 1887 /* Only supports a single LBA Format */ 1888 lbaf = command->cdw10 & 0xf; 1889 if (lbaf != 0) { 1890 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1891 NVME_SC_INVALID_FORMAT); 1892 return (1); 1893 } 1894 1895 /* Doesn't support Protection Infomation */ 1896 pi = (command->cdw10 >> 5) & 0x7; 1897 if (pi != 0) { 1898 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1899 return (1); 1900 } 1901 1902 if (sc->nvstore.type == NVME_STOR_RAM) { 1903 if (sc->nvstore.ctx) 1904 free(sc->nvstore.ctx); 1905 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1906 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1907 } else { 1908 struct pci_nvme_ioreq *req; 1909 int err; 1910 1911 req = pci_nvme_get_ioreq(sc); 1912 if (req == NULL) { 1913 pci_nvme_status_genc(&compl->status, 1914 NVME_SC_INTERNAL_DEVICE_ERROR); 1915 WPRINTF("%s: unable to allocate IO req", __func__); 1916 return (1); 1917 } 1918 req->nvme_sq = &sc->submit_queues[0]; 1919 req->sqid = 0; 1920 req->opc = command->opc; 1921 req->cid = command->cid; 1922 req->nsid = command->nsid; 1923 1924 req->io_req.br_offset = 0; 1925 req->io_req.br_resid = sc->nvstore.size; 1926 req->io_req.br_callback = pci_nvme_io_done; 1927 1928 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1929 if (err) { 1930 pci_nvme_status_genc(&compl->status, 1931 NVME_SC_INTERNAL_DEVICE_ERROR); 1932 pci_nvme_release_ioreq(sc, req); 1933 } else 1934 compl->status = NVME_NO_STATUS; 1935 } 1936 1937 return (1); 1938 } 1939 1940 static int 1941 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1942 struct nvme_completion* compl) 1943 { 1944 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1945 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1946 1947 /* TODO: search for the command ID and abort it */ 1948 1949 compl->cdw0 = 1; 1950 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1951 return (1); 1952 } 1953 1954 static int 1955 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1956 struct nvme_command* command, struct nvme_completion* compl) 1957 { 1958 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 1959 sc->aer_count, sc->ctrldata.aerl, command->cid); 1960 1961 /* Don't exceed the Async Event Request Limit (AERL). 
*/ 1962 if (pci_nvme_aer_limit_reached(sc)) { 1963 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1964 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1965 return (1); 1966 } 1967 1968 if (pci_nvme_aer_add(sc, command->cid)) { 1969 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1970 NVME_SC_INTERNAL_DEVICE_ERROR); 1971 return (1); 1972 } 1973 1974 /* 1975 * Raise events when they happen based on the Set Features cmd. 1976 * These events happen async, so only set completion successful if 1977 * there is an event reflective of the request to get event. 1978 */ 1979 compl->status = NVME_NO_STATUS; 1980 pci_nvme_aen_notify(sc); 1981 1982 return (0); 1983 } 1984 1985 static void 1986 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1987 { 1988 struct nvme_completion compl; 1989 struct nvme_command *cmd; 1990 struct nvme_submission_queue *sq; 1991 struct nvme_completion_queue *cq; 1992 uint16_t sqhead; 1993 1994 DPRINTF("%s index %u", __func__, (uint32_t)value); 1995 1996 sq = &sc->submit_queues[0]; 1997 cq = &sc->compl_queues[0]; 1998 1999 pthread_mutex_lock(&sq->mtx); 2000 2001 sqhead = sq->head; 2002 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2003 2004 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2005 cmd = &(sq->qbase)[sqhead]; 2006 compl.cdw0 = 0; 2007 compl.status = 0; 2008 2009 switch (cmd->opc) { 2010 case NVME_OPC_DELETE_IO_SQ: 2011 DPRINTF("%s command DELETE_IO_SQ", __func__); 2012 nvme_opc_delete_io_sq(sc, cmd, &compl); 2013 break; 2014 case NVME_OPC_CREATE_IO_SQ: 2015 DPRINTF("%s command CREATE_IO_SQ", __func__); 2016 nvme_opc_create_io_sq(sc, cmd, &compl); 2017 break; 2018 case NVME_OPC_DELETE_IO_CQ: 2019 DPRINTF("%s command DELETE_IO_CQ", __func__); 2020 nvme_opc_delete_io_cq(sc, cmd, &compl); 2021 break; 2022 case NVME_OPC_CREATE_IO_CQ: 2023 DPRINTF("%s command CREATE_IO_CQ", __func__); 2024 nvme_opc_create_io_cq(sc, cmd, &compl); 2025 break; 2026 case NVME_OPC_GET_LOG_PAGE: 2027 DPRINTF("%s command GET_LOG_PAGE", __func__); 2028 nvme_opc_get_log_page(sc, cmd, &compl); 2029 break; 2030 case NVME_OPC_IDENTIFY: 2031 DPRINTF("%s command IDENTIFY", __func__); 2032 nvme_opc_identify(sc, cmd, &compl); 2033 break; 2034 case NVME_OPC_ABORT: 2035 DPRINTF("%s command ABORT", __func__); 2036 nvme_opc_abort(sc, cmd, &compl); 2037 break; 2038 case NVME_OPC_SET_FEATURES: 2039 DPRINTF("%s command SET_FEATURES", __func__); 2040 nvme_opc_set_features(sc, cmd, &compl); 2041 break; 2042 case NVME_OPC_GET_FEATURES: 2043 DPRINTF("%s command GET_FEATURES", __func__); 2044 nvme_opc_get_features(sc, cmd, &compl); 2045 break; 2046 case NVME_OPC_FIRMWARE_ACTIVATE: 2047 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2048 pci_nvme_status_tc(&compl.status, 2049 NVME_SCT_COMMAND_SPECIFIC, 2050 NVME_SC_INVALID_FIRMWARE_SLOT); 2051 break; 2052 case NVME_OPC_ASYNC_EVENT_REQUEST: 2053 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2054 nvme_opc_async_event_req(sc, cmd, &compl); 2055 break; 2056 case NVME_OPC_FORMAT_NVM: 2057 DPRINTF("%s command FORMAT_NVM", __func__); 2058 if ((sc->ctrldata.oacs & 2059 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2060 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2061 break; 2062 } 2063 nvme_opc_format_nvm(sc, cmd, &compl); 2064 break; 2065 case NVME_OPC_SECURITY_SEND: 2066 case NVME_OPC_SECURITY_RECEIVE: 2067 case NVME_OPC_SANITIZE: 2068 case NVME_OPC_GET_LBA_STATUS: 2069 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2070 cmd->opc); 2071 /* Valid but unsupported opcodes */ 2072 
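			/*
			 * Security Send/Receive, Sanitize, and Get LBA Status
			 * are optional commands which this emulation does not
			 * implement, so they complete with a generic Invalid
			 * Field status below.
			 */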
pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2073 break; 2074 default: 2075 DPRINTF("%s command OPC=%#X (not implemented)", 2076 __func__, 2077 cmd->opc); 2078 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2079 } 2080 sqhead = (sqhead + 1) % sq->size; 2081 2082 if (NVME_COMPLETION_VALID(compl)) { 2083 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2084 compl.cdw0, 2085 cmd->cid, 2086 0, /* SQID */ 2087 compl.status); 2088 } 2089 } 2090 2091 DPRINTF("setting sqhead %u", sqhead); 2092 sq->head = sqhead; 2093 2094 if (cq->head != cq->tail) 2095 pci_generate_msix(sc->nsc_pi, 0); 2096 2097 pthread_mutex_unlock(&sq->mtx); 2098 } 2099 2100 /* 2101 * Update the Write and Read statistics reported in SMART data 2102 * 2103 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2104 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2105 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 2106 */ 2107 static void 2108 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2109 size_t bytes, uint16_t status) 2110 { 2111 2112 pthread_mutex_lock(&sc->mtx); 2113 switch (opc) { 2114 case NVME_OPC_WRITE: 2115 sc->write_commands++; 2116 if (status != NVME_SC_SUCCESS) 2117 break; 2118 sc->write_dunits_remainder += (bytes / 512); 2119 while (sc->write_dunits_remainder >= 1000) { 2120 sc->write_data_units++; 2121 sc->write_dunits_remainder -= 1000; 2122 } 2123 break; 2124 case NVME_OPC_READ: 2125 sc->read_commands++; 2126 if (status != NVME_SC_SUCCESS) 2127 break; 2128 sc->read_dunits_remainder += (bytes / 512); 2129 while (sc->read_dunits_remainder >= 1000) { 2130 sc->read_data_units++; 2131 sc->read_dunits_remainder -= 1000; 2132 } 2133 break; 2134 default: 2135 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2136 break; 2137 } 2138 pthread_mutex_unlock(&sc->mtx); 2139 } 2140 2141 /* 2142 * Check if the combination of Starting LBA (slba) and number of blocks 2143 * exceeds the range of the underlying storage. 2144 * 2145 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2146 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2147 * overflow. 2148 */ 2149 static bool 2150 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2151 uint32_t nblocks) 2152 { 2153 size_t offset, bytes; 2154 2155 /* Overflow check of multiplying Starting LBA by the sector size */ 2156 if (slba >> (64 - nvstore->sectsz_bits)) 2157 return (true); 2158 2159 offset = slba << nvstore->sectsz_bits; 2160 bytes = nblocks << nvstore->sectsz_bits; 2161 2162 /* Overflow check of Number of Logical Blocks */ 2163 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2164 return (true); 2165 2166 return (false); 2167 } 2168 2169 static int 2170 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2171 uint64_t gpaddr, size_t size, int do_write, uint64_t offset) 2172 { 2173 int iovidx; 2174 bool range_is_contiguous; 2175 2176 if (req == NULL) 2177 return (-1); 2178 2179 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2180 return (-1); 2181 } 2182 2183 /* 2184 * Minimize the number of IOVs by concatenating contiguous address 2185 * ranges. If the IOV count is zero, there is no previous range to 2186 * concatenate. 
2187 */ 2188 if (req->io_req.br_iovcnt == 0) 2189 range_is_contiguous = false; 2190 else 2191 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2192 2193 if (range_is_contiguous) { 2194 iovidx = req->io_req.br_iovcnt - 1; 2195 2196 req->io_req.br_iov[iovidx].iov_base = 2197 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2198 req->prev_gpaddr, size); 2199 2200 req->prev_size += size; 2201 req->io_req.br_resid += size; 2202 2203 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2204 } else { 2205 iovidx = req->io_req.br_iovcnt; 2206 if (iovidx == 0) { 2207 req->io_req.br_offset = offset; 2208 req->io_req.br_resid = 0; 2209 req->io_req.br_param = req; 2210 } 2211 2212 req->io_req.br_iov[iovidx].iov_base = 2213 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2214 gpaddr, size); 2215 2216 req->io_req.br_iov[iovidx].iov_len = size; 2217 2218 req->prev_gpaddr = gpaddr; 2219 req->prev_size = size; 2220 req->io_req.br_resid += size; 2221 2222 req->io_req.br_iovcnt++; 2223 } 2224 2225 return (0); 2226 } 2227 2228 static void 2229 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2230 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 2231 uint32_t cdw0, uint16_t status) 2232 { 2233 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2234 2235 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2236 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2237 NVME_STATUS_GET_SC(status)); 2238 2239 pci_nvme_cq_update(sc, cq, 2240 0, /* CDW0 */ 2241 cid, 2242 sqid, 2243 status); 2244 2245 if (cq->head != cq->tail) { 2246 if (cq->intr_en & NVME_CQ_INTEN) { 2247 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2248 } else { 2249 DPRINTF("%s: CQ%u interrupt disabled", 2250 __func__, sq->cqid); 2251 } 2252 } 2253 } 2254 2255 static void 2256 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2257 { 2258 req->sc = NULL; 2259 req->nvme_sq = NULL; 2260 req->sqid = 0; 2261 2262 pthread_mutex_lock(&sc->mtx); 2263 2264 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2265 sc->pending_ios--; 2266 2267 /* when no more IO pending, can set to ready if device reset/enabled */ 2268 if (sc->pending_ios == 0 && 2269 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2270 sc->regs.csts |= NVME_CSTS_RDY; 2271 2272 pthread_mutex_unlock(&sc->mtx); 2273 2274 sem_post(&sc->iosemlock); 2275 } 2276 2277 static struct pci_nvme_ioreq * 2278 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2279 { 2280 struct pci_nvme_ioreq *req = NULL; 2281 2282 sem_wait(&sc->iosemlock); 2283 pthread_mutex_lock(&sc->mtx); 2284 2285 req = STAILQ_FIRST(&sc->ioreqs_free); 2286 assert(req != NULL); 2287 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2288 2289 req->sc = sc; 2290 2291 sc->pending_ios++; 2292 2293 pthread_mutex_unlock(&sc->mtx); 2294 2295 req->io_req.br_iovcnt = 0; 2296 req->io_req.br_offset = 0; 2297 req->io_req.br_resid = 0; 2298 req->io_req.br_param = req; 2299 req->prev_gpaddr = 0; 2300 req->prev_size = 0; 2301 2302 return req; 2303 } 2304 2305 static void 2306 pci_nvme_io_done(struct blockif_req *br, int err) 2307 { 2308 struct pci_nvme_ioreq *req = br->br_param; 2309 struct nvme_submission_queue *sq = req->nvme_sq; 2310 uint16_t code, status; 2311 2312 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2313 2314 /* TODO return correct error */ 2315 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2316 pci_nvme_status_genc(&status, code); 2317 2318 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 2319 pci_nvme_stats_write_read_update(req->sc, req->opc, 2320 req->bytes, status); 2321 pci_nvme_release_ioreq(req->sc, req); 2322 } 2323 2324 /* 2325 * Implements the Flush command. The specification states: 2326 * If a volatile write cache is not present, Flush commands complete 2327 * successfully and have no effect 2328 * in the description of the Volatile Write Cache (VWC) field of the Identify 2329 * Controller data. Therefore, set status to Success if the command is 2330 * not supported (i.e. RAM or as indicated by the blockif). 2331 */ 2332 static bool 2333 nvme_opc_flush(struct pci_nvme_softc *sc, 2334 struct nvme_command *cmd, 2335 struct pci_nvme_blockstore *nvstore, 2336 struct pci_nvme_ioreq *req, 2337 uint16_t *status) 2338 { 2339 bool pending = false; 2340 2341 if (nvstore->type == NVME_STOR_RAM) { 2342 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2343 } else { 2344 int err; 2345 2346 req->io_req.br_callback = pci_nvme_io_done; 2347 2348 err = blockif_flush(nvstore->ctx, &req->io_req); 2349 switch (err) { 2350 case 0: 2351 pending = true; 2352 break; 2353 case EOPNOTSUPP: 2354 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2355 break; 2356 default: 2357 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2358 } 2359 } 2360 2361 return (pending); 2362 } 2363 2364 static uint16_t 2365 nvme_write_read_ram(struct pci_nvme_softc *sc, 2366 struct pci_nvme_blockstore *nvstore, 2367 uint64_t prp1, uint64_t prp2, 2368 size_t offset, uint64_t bytes, 2369 bool is_write) 2370 { 2371 uint8_t *buf = nvstore->ctx; 2372 enum nvme_copy_dir dir; 2373 uint16_t status; 2374 2375 if (is_write) 2376 dir = NVME_COPY_TO_PRP; 2377 else 2378 dir = NVME_COPY_FROM_PRP; 2379 2380 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2381 buf + offset, bytes, dir)) 2382 pci_nvme_status_genc(&status, 2383 NVME_SC_DATA_TRANSFER_ERROR); 2384 else 2385 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2386 2387 return (status); 2388 } 2389 2390 static uint16_t 2391 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2392 struct pci_nvme_blockstore *nvstore, 2393 struct pci_nvme_ioreq *req, 2394 uint64_t prp1, uint64_t prp2, 2395 size_t offset, uint64_t bytes, 2396 bool is_write) 2397 { 2398 uint64_t size; 2399 int err; 2400 uint16_t status = NVME_NO_STATUS; 2401 2402 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2403 if (pci_nvme_append_iov_req(sc, req, prp1, 2404 size, is_write, offset)) { 2405 pci_nvme_status_genc(&status, 2406 NVME_SC_DATA_TRANSFER_ERROR); 2407 goto out; 2408 } 2409 2410 offset += size; 2411 bytes -= size; 2412 2413 if (bytes == 0) { 2414 ; 2415 } else if (bytes <= PAGE_SIZE) { 2416 size = bytes; 2417 if (pci_nvme_append_iov_req(sc, req, prp2, 2418 size, is_write, offset)) { 2419 pci_nvme_status_genc(&status, 2420 NVME_SC_DATA_TRANSFER_ERROR); 2421 goto out; 2422 } 2423 } else { 2424 void *vmctx = sc->nsc_pi->pi_vmctx; 2425 uint64_t *prp_list = &prp2; 2426 uint64_t *last = prp_list; 2427 2428 /* PRP2 is pointer to a physical region page list */ 2429 while (bytes) { 2430 /* Last entry in list points to the next list */ 2431 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2432 uint64_t prp = *prp_list; 2433 2434 prp_list = paddr_guest2host(vmctx, prp, 2435 PAGE_SIZE - (prp % PAGE_SIZE)); 2436 last = prp_list + (NVME_PRP2_ITEMS - 1); 2437 } 2438 2439 size = MIN(bytes, PAGE_SIZE); 2440 2441 if 
(pci_nvme_append_iov_req(sc, req, *prp_list, 2442 size, is_write, offset)) { 2443 pci_nvme_status_genc(&status, 2444 NVME_SC_DATA_TRANSFER_ERROR); 2445 goto out; 2446 } 2447 2448 offset += size; 2449 bytes -= size; 2450 2451 prp_list++; 2452 } 2453 } 2454 req->io_req.br_callback = pci_nvme_io_done; 2455 if (is_write) 2456 err = blockif_write(nvstore->ctx, &req->io_req); 2457 else 2458 err = blockif_read(nvstore->ctx, &req->io_req); 2459 2460 if (err) 2461 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2462 out: 2463 return (status); 2464 } 2465 2466 static bool 2467 nvme_opc_write_read(struct pci_nvme_softc *sc, 2468 struct nvme_command *cmd, 2469 struct pci_nvme_blockstore *nvstore, 2470 struct pci_nvme_ioreq *req, 2471 uint16_t *status) 2472 { 2473 uint64_t lba, nblocks, bytes; 2474 size_t offset; 2475 bool is_write = cmd->opc == NVME_OPC_WRITE; 2476 bool pending = false; 2477 2478 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2479 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2480 2481 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2482 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2483 __func__, lba, nblocks); 2484 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2485 goto out; 2486 } 2487 2488 bytes = nblocks << nvstore->sectsz_bits; 2489 if (bytes > NVME_MAX_DATA_SIZE) { 2490 WPRINTF("%s command would exceed MDTS", __func__); 2491 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2492 goto out; 2493 } 2494 2495 offset = lba << nvstore->sectsz_bits; 2496 2497 req->bytes = bytes; 2498 req->io_req.br_offset = lba; 2499 2500 /* PRP bits 1:0 must be zero */ 2501 cmd->prp1 &= ~0x3UL; 2502 cmd->prp2 &= ~0x3UL; 2503 2504 if (nvstore->type == NVME_STOR_RAM) { 2505 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2506 cmd->prp2, offset, bytes, is_write); 2507 } else { 2508 *status = nvme_write_read_blockif(sc, nvstore, req, 2509 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2510 2511 if (*status == NVME_NO_STATUS) 2512 pending = true; 2513 } 2514 out: 2515 if (!pending) 2516 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2517 2518 return (pending); 2519 } 2520 2521 static void 2522 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2523 { 2524 struct pci_nvme_ioreq *req = br->br_param; 2525 struct pci_nvme_softc *sc = req->sc; 2526 bool done = true; 2527 uint16_t status; 2528 2529 if (err) { 2530 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2531 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2532 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2533 } else { 2534 struct iovec *iov = req->io_req.br_iov; 2535 2536 req->prev_gpaddr++; 2537 iov += req->prev_gpaddr; 2538 2539 /* The iov_* values already include the sector size */ 2540 req->io_req.br_offset = (off_t)iov->iov_base; 2541 req->io_req.br_resid = iov->iov_len; 2542 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2543 pci_nvme_status_genc(&status, 2544 NVME_SC_INTERNAL_DEVICE_ERROR); 2545 } else 2546 done = false; 2547 } 2548 2549 if (done) { 2550 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2551 req->cid, 0, status); 2552 pci_nvme_release_ioreq(sc, req); 2553 } 2554 } 2555 2556 static bool 2557 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2558 struct nvme_command *cmd, 2559 struct pci_nvme_blockstore *nvstore, 2560 struct pci_nvme_ioreq *req, 2561 uint16_t *status) 2562 { 2563 struct nvme_dsm_range *range; 2564 uint32_t nr, r, non_zero, dr; 2565 int err; 2566 bool pending = false; 2567 2568 if ((sc->ctrldata.oncs & 
NVME_ONCS_DSM) == 0) { 2569 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2570 goto out; 2571 } 2572 2573 nr = cmd->cdw10 & 0xff; 2574 2575 /* copy locally because a range entry could straddle PRPs */ 2576 range = calloc(1, NVME_MAX_DSM_TRIM); 2577 if (range == NULL) { 2578 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2579 goto out; 2580 } 2581 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2582 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2583 2584 /* Check for invalid ranges and the number of non-zero lengths */ 2585 non_zero = 0; 2586 for (r = 0; r <= nr; r++) { 2587 if (pci_nvme_out_of_range(nvstore, 2588 range[r].starting_lba, range[r].length)) { 2589 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2590 goto out; 2591 } 2592 if (range[r].length != 0) 2593 non_zero++; 2594 } 2595 2596 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2597 size_t offset, bytes; 2598 int sectsz_bits = sc->nvstore.sectsz_bits; 2599 2600 /* 2601 * DSM calls are advisory only, and compliant controllers 2602 * may choose to take no actions (i.e. return Success). 2603 */ 2604 if (!nvstore->deallocate) { 2605 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2606 goto out; 2607 } 2608 2609 /* If all ranges have a zero length, return Success */ 2610 if (non_zero == 0) { 2611 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2612 goto out; 2613 } 2614 2615 if (req == NULL) { 2616 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2617 goto out; 2618 } 2619 2620 offset = range[0].starting_lba << sectsz_bits; 2621 bytes = range[0].length << sectsz_bits; 2622 2623 /* 2624 * If the request is for more than a single range, store 2625 * the ranges in the br_iov. Optimize for the common case 2626 * of a single range. 2627 * 2628 * Note that NVMe Number of Ranges is a zero based value 2629 */ 2630 req->io_req.br_iovcnt = 0; 2631 req->io_req.br_offset = offset; 2632 req->io_req.br_resid = bytes; 2633 2634 if (nr == 0) { 2635 req->io_req.br_callback = pci_nvme_io_done; 2636 } else { 2637 struct iovec *iov = req->io_req.br_iov; 2638 2639 for (r = 0, dr = 0; r <= nr; r++) { 2640 offset = range[r].starting_lba << sectsz_bits; 2641 bytes = range[r].length << sectsz_bits; 2642 if (bytes == 0) 2643 continue; 2644 2645 if ((nvstore->size - offset) < bytes) { 2646 pci_nvme_status_genc(status, 2647 NVME_SC_LBA_OUT_OF_RANGE); 2648 goto out; 2649 } 2650 iov[dr].iov_base = (void *)offset; 2651 iov[dr].iov_len = bytes; 2652 dr++; 2653 } 2654 req->io_req.br_callback = pci_nvme_dealloc_sm; 2655 2656 /* 2657 * Use prev_gpaddr to track the current entry and 2658 * prev_size to track the number of entries 2659 */ 2660 req->prev_gpaddr = 0; 2661 req->prev_size = dr; 2662 } 2663 2664 err = blockif_delete(nvstore->ctx, &req->io_req); 2665 if (err) 2666 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2667 else 2668 pending = true; 2669 } 2670 out: 2671 free(range); 2672 return (pending); 2673 } 2674 2675 static void 2676 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2677 { 2678 struct nvme_submission_queue *sq; 2679 uint16_t status; 2680 uint16_t sqhead; 2681 2682 /* handle all submissions up to sq->tail index */ 2683 sq = &sc->submit_queues[idx]; 2684 2685 pthread_mutex_lock(&sq->mtx); 2686 2687 sqhead = sq->head; 2688 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2689 idx, sqhead, sq->tail, sq->qbase); 2690 2691 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2692 struct nvme_command *cmd; 2693 struct pci_nvme_ioreq *req; 2694 uint32_t 
nsid; 2695 bool pending; 2696 2697 pending = false; 2698 req = NULL; 2699 status = 0; 2700 2701 cmd = &sq->qbase[sqhead]; 2702 sqhead = (sqhead + 1) % sq->size; 2703 2704 nsid = le32toh(cmd->nsid); 2705 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2706 pci_nvme_status_genc(&status, 2707 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2708 status |= 2709 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2710 goto complete; 2711 } 2712 2713 req = pci_nvme_get_ioreq(sc); 2714 if (req == NULL) { 2715 pci_nvme_status_genc(&status, 2716 NVME_SC_INTERNAL_DEVICE_ERROR); 2717 WPRINTF("%s: unable to allocate IO req", __func__); 2718 goto complete; 2719 } 2720 req->nvme_sq = sq; 2721 req->sqid = idx; 2722 req->opc = cmd->opc; 2723 req->cid = cmd->cid; 2724 req->nsid = cmd->nsid; 2725 2726 switch (cmd->opc) { 2727 case NVME_OPC_FLUSH: 2728 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2729 req, &status); 2730 break; 2731 case NVME_OPC_WRITE: 2732 case NVME_OPC_READ: 2733 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2734 req, &status); 2735 break; 2736 case NVME_OPC_WRITE_ZEROES: 2737 /* TODO: write zeroes 2738 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2739 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2740 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2741 break; 2742 case NVME_OPC_DATASET_MANAGEMENT: 2743 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2744 req, &status); 2745 break; 2746 default: 2747 WPRINTF("%s unhandled io command 0x%x", 2748 __func__, cmd->opc); 2749 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2750 } 2751 complete: 2752 if (!pending) { 2753 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2754 status); 2755 if (req != NULL) 2756 pci_nvme_release_ioreq(sc, req); 2757 } 2758 } 2759 2760 sq->head = sqhead; 2761 2762 pthread_mutex_unlock(&sq->mtx); 2763 } 2764 2765 static void 2766 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2767 uint64_t idx, int is_sq, uint64_t value) 2768 { 2769 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2770 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2771 2772 if (is_sq) { 2773 if (idx > sc->num_squeues) { 2774 WPRINTF("%s queue index %lu overflow from " 2775 "guest (max %u)", 2776 __func__, idx, sc->num_squeues); 2777 return; 2778 } 2779 2780 atomic_store_short(&sc->submit_queues[idx].tail, 2781 (uint16_t)value); 2782 2783 if (idx == 0) { 2784 pci_nvme_handle_admin_cmd(sc, value); 2785 } else { 2786 /* submission queue; handle new entries in SQ */ 2787 if (idx > sc->num_squeues) { 2788 WPRINTF("%s SQ index %lu overflow from " 2789 "guest (max %u)", 2790 __func__, idx, sc->num_squeues); 2791 return; 2792 } 2793 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2794 } 2795 } else { 2796 if (idx > sc->num_cqueues) { 2797 WPRINTF("%s queue index %lu overflow from " 2798 "guest (max %u)", 2799 __func__, idx, sc->num_cqueues); 2800 return; 2801 } 2802 2803 atomic_store_short(&sc->compl_queues[idx].head, 2804 (uint16_t)value); 2805 } 2806 } 2807 2808 static void 2809 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2810 { 2811 const char *s = iswrite ? 
"WRITE" : "READ"; 2812 2813 switch (offset) { 2814 case NVME_CR_CAP_LOW: 2815 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2816 break; 2817 case NVME_CR_CAP_HI: 2818 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2819 break; 2820 case NVME_CR_VS: 2821 DPRINTF("%s %s NVME_CR_VS", func, s); 2822 break; 2823 case NVME_CR_INTMS: 2824 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2825 break; 2826 case NVME_CR_INTMC: 2827 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2828 break; 2829 case NVME_CR_CC: 2830 DPRINTF("%s %s NVME_CR_CC", func, s); 2831 break; 2832 case NVME_CR_CSTS: 2833 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2834 break; 2835 case NVME_CR_NSSR: 2836 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2837 break; 2838 case NVME_CR_AQA: 2839 DPRINTF("%s %s NVME_CR_AQA", func, s); 2840 break; 2841 case NVME_CR_ASQ_LOW: 2842 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2843 break; 2844 case NVME_CR_ASQ_HI: 2845 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2846 break; 2847 case NVME_CR_ACQ_LOW: 2848 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2849 break; 2850 case NVME_CR_ACQ_HI: 2851 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2852 break; 2853 default: 2854 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2855 } 2856 2857 } 2858 2859 static void 2860 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2861 uint64_t offset, int size, uint64_t value) 2862 { 2863 uint32_t ccreg; 2864 2865 if (offset >= NVME_DOORBELL_OFFSET) { 2866 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2867 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2868 int is_sq = (belloffset % 8) < 4; 2869 2870 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2871 WPRINTF("guest attempted an overflow write offset " 2872 "0x%lx, val 0x%lx in %s", 2873 offset, value, __func__); 2874 return; 2875 } 2876 2877 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2878 return; 2879 } 2880 2881 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2882 offset, size, value); 2883 2884 if (size != 4) { 2885 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2886 "val 0x%lx) to bar0 in %s", 2887 size, offset, value, __func__); 2888 /* TODO: shutdown device */ 2889 return; 2890 } 2891 2892 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2893 2894 pthread_mutex_lock(&sc->mtx); 2895 2896 switch (offset) { 2897 case NVME_CR_CAP_LOW: 2898 case NVME_CR_CAP_HI: 2899 /* readonly */ 2900 break; 2901 case NVME_CR_VS: 2902 /* readonly */ 2903 break; 2904 case NVME_CR_INTMS: 2905 /* MSI-X, so ignore */ 2906 break; 2907 case NVME_CR_INTMC: 2908 /* MSI-X, so ignore */ 2909 break; 2910 case NVME_CR_CC: 2911 ccreg = (uint32_t)value; 2912 2913 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2914 "iocqes %u", 2915 __func__, 2916 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2917 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2918 NVME_CC_GET_IOCQES(ccreg)); 2919 2920 if (NVME_CC_GET_SHN(ccreg)) { 2921 /* perform shutdown - flush out data to backend */ 2922 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2923 NVME_CSTS_REG_SHST_SHIFT); 2924 sc->regs.csts |= NVME_SHST_COMPLETE << 2925 NVME_CSTS_REG_SHST_SHIFT; 2926 } 2927 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2928 if (NVME_CC_GET_EN(ccreg) == 0) 2929 /* transition 1-> causes controller reset */ 2930 pci_nvme_reset_locked(sc); 2931 else 2932 pci_nvme_init_controller(ctx, sc); 2933 } 2934 2935 /* Insert the iocqes, iosqes and en bits from the write */ 2936 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2937 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2938 if (NVME_CC_GET_EN(ccreg) 
== 0) { 2939 /* Insert the ams, mps and css bit fields */ 2940 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2941 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2942 sc->regs.csts &= ~NVME_CSTS_RDY; 2943 } else if (sc->pending_ios == 0) { 2944 sc->regs.csts |= NVME_CSTS_RDY; 2945 } 2946 break; 2947 case NVME_CR_CSTS: 2948 break; 2949 case NVME_CR_NSSR: 2950 /* ignore writes; don't support subsystem reset */ 2951 break; 2952 case NVME_CR_AQA: 2953 sc->regs.aqa = (uint32_t)value; 2954 break; 2955 case NVME_CR_ASQ_LOW: 2956 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2957 (0xFFFFF000 & value); 2958 break; 2959 case NVME_CR_ASQ_HI: 2960 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2961 (value << 32); 2962 break; 2963 case NVME_CR_ACQ_LOW: 2964 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2965 (0xFFFFF000 & value); 2966 break; 2967 case NVME_CR_ACQ_HI: 2968 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2969 (value << 32); 2970 break; 2971 default: 2972 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2973 __func__, offset, value, size); 2974 } 2975 pthread_mutex_unlock(&sc->mtx); 2976 } 2977 2978 static void 2979 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2980 int baridx, uint64_t offset, int size, uint64_t value) 2981 { 2982 struct pci_nvme_softc* sc = pi->pi_arg; 2983 2984 if (baridx == pci_msix_table_bar(pi) || 2985 baridx == pci_msix_pba_bar(pi)) { 2986 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2987 " value 0x%lx", baridx, offset, size, value); 2988 2989 pci_emul_msix_twrite(pi, offset, size, value); 2990 return; 2991 } 2992 2993 switch (baridx) { 2994 case 0: 2995 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2996 break; 2997 2998 default: 2999 DPRINTF("%s unknown baridx %d, val 0x%lx", 3000 __func__, baridx, value); 3001 } 3002 } 3003 3004 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3005 uint64_t offset, int size) 3006 { 3007 uint64_t value; 3008 3009 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3010 3011 if (offset < NVME_DOORBELL_OFFSET) { 3012 void *p = &(sc->regs); 3013 pthread_mutex_lock(&sc->mtx); 3014 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3015 pthread_mutex_unlock(&sc->mtx); 3016 } else { 3017 value = 0; 3018 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3019 } 3020 3021 switch (size) { 3022 case 1: 3023 value &= 0xFF; 3024 break; 3025 case 2: 3026 value &= 0xFFFF; 3027 break; 3028 case 4: 3029 value &= 0xFFFFFFFF; 3030 break; 3031 } 3032 3033 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3034 offset, size, (uint32_t)value); 3035 3036 return (value); 3037 } 3038 3039 3040 3041 static uint64_t 3042 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 3043 uint64_t offset, int size) 3044 { 3045 struct pci_nvme_softc* sc = pi->pi_arg; 3046 3047 if (baridx == pci_msix_table_bar(pi) || 3048 baridx == pci_msix_pba_bar(pi)) { 3049 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3050 baridx, offset, size); 3051 3052 return pci_emul_msix_tread(pi, offset, size); 3053 } 3054 3055 switch (baridx) { 3056 case 0: 3057 return pci_nvme_read_bar_0(sc, offset, size); 3058 3059 default: 3060 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3061 } 3062 3063 return (0); 3064 } 3065 3066 static int 3067 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3068 { 3069 char bident[sizeof("XX:X:X")]; 3070 const char *value; 3071 uint32_t sectsz; 3072 3073 sc->max_queues = NVME_QUEUES; 3074 sc->max_qentries = 
NVME_MAX_QENTRIES; 3075 sc->ioslots = NVME_IOSLOTS; 3076 sc->num_squeues = sc->max_queues; 3077 sc->num_cqueues = sc->max_queues; 3078 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3079 sectsz = 0; 3080 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3081 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3082 3083 value = get_config_value_node(nvl, "maxq"); 3084 if (value != NULL) 3085 sc->max_queues = atoi(value); 3086 value = get_config_value_node(nvl, "qsz"); 3087 if (value != NULL) { 3088 sc->max_qentries = atoi(value); 3089 if (sc->max_qentries <= 0) { 3090 EPRINTLN("nvme: Invalid qsz option %d", 3091 sc->max_qentries); 3092 return (-1); 3093 } 3094 } 3095 value = get_config_value_node(nvl, "ioslots"); 3096 if (value != NULL) { 3097 sc->ioslots = atoi(value); 3098 if (sc->ioslots <= 0) { 3099 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3100 return (-1); 3101 } 3102 } 3103 value = get_config_value_node(nvl, "sectsz"); 3104 if (value != NULL) 3105 sectsz = atoi(value); 3106 value = get_config_value_node(nvl, "ser"); 3107 if (value != NULL) { 3108 /* 3109 * This field indicates the Product Serial Number in 3110 * 7-bit ASCII, unused bytes should be space characters. 3111 * Ref: NVMe v1.3c. 3112 */ 3113 cpywithpad((char *)sc->ctrldata.sn, 3114 sizeof(sc->ctrldata.sn), value, ' '); 3115 } 3116 value = get_config_value_node(nvl, "eui64"); 3117 if (value != NULL) 3118 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3119 value = get_config_value_node(nvl, "dsm"); 3120 if (value != NULL) { 3121 if (strcmp(value, "auto") == 0) 3122 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3123 else if (strcmp(value, "enable") == 0) 3124 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3125 else if (strcmp(value, "disable") == 0) 3126 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3127 } 3128 3129 value = get_config_value_node(nvl, "ram"); 3130 if (value != NULL) { 3131 uint64_t sz = strtoull(value, NULL, 10); 3132 3133 sc->nvstore.type = NVME_STOR_RAM; 3134 sc->nvstore.size = sz * 1024 * 1024; 3135 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3136 sc->nvstore.sectsz = 4096; 3137 sc->nvstore.sectsz_bits = 12; 3138 if (sc->nvstore.ctx == NULL) { 3139 EPRINTLN("nvme: Unable to allocate RAM"); 3140 return (-1); 3141 } 3142 } else { 3143 snprintf(bident, sizeof(bident), "%d:%d", 3144 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3145 sc->nvstore.ctx = blockif_open(nvl, bident); 3146 if (sc->nvstore.ctx == NULL) { 3147 EPRINTLN("nvme: Could not open backing file: %s", 3148 strerror(errno)); 3149 return (-1); 3150 } 3151 sc->nvstore.type = NVME_STOR_BLOCKIF; 3152 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3153 } 3154 3155 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3156 sc->nvstore.sectsz = sectsz; 3157 else if (sc->nvstore.type != NVME_STOR_RAM) 3158 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3159 for (sc->nvstore.sectsz_bits = 9; 3160 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3161 sc->nvstore.sectsz_bits++); 3162 3163 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3164 sc->max_queues = NVME_QUEUES; 3165 3166 return (0); 3167 } 3168 3169 static void 3170 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size) 3171 { 3172 struct pci_nvme_softc *sc; 3173 struct pci_nvme_blockstore *nvstore; 3174 struct nvme_namespace_data *nd; 3175 3176 sc = arg; 3177 nvstore = &sc->nvstore; 3178 nd = &sc->nsdata; 3179 3180 nvstore->size = new_size; 3181 pci_nvme_init_nsdata_size(nvstore, nd); 3182 3183 
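	/*
	 * Surface the new backing-store size to the guest: record the changed
	 * NSID in the Changed Namespace List log page and raise a Namespace
	 * Attribute Changed asynchronous event so the guest re-reads the
	 * namespace data.
	 */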
/* Add changed NSID to list */ 3184 sc->ns_log.ns[0] = 1; 3185 sc->ns_log.ns[1] = 0; 3186 3187 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3188 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3189 } 3190 3191 static int 3192 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) 3193 { 3194 struct pci_nvme_softc *sc; 3195 uint32_t pci_membar_sz; 3196 int error; 3197 3198 error = 0; 3199 3200 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3201 pi->pi_arg = sc; 3202 sc->nsc_pi = pi; 3203 3204 error = pci_nvme_parse_config(sc, nvl); 3205 if (error < 0) 3206 goto done; 3207 else 3208 error = 0; 3209 3210 STAILQ_INIT(&sc->ioreqs_free); 3211 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3212 for (int i = 0; i < sc->ioslots; i++) { 3213 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3214 } 3215 3216 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3217 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3218 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3219 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3220 pci_set_cfgdata8(pi, PCIR_PROGIF, 3221 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3222 3223 /* 3224 * Allocate size of NVMe registers + doorbell space for all queues. 3225 * 3226 * The specification requires a minimum memory I/O window size of 16K. 3227 * The Windows driver will refuse to start a device with a smaller 3228 * window. 3229 */ 3230 pci_membar_sz = sizeof(struct nvme_registers) + 3231 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3232 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3233 3234 DPRINTF("nvme membar size: %u", pci_membar_sz); 3235 3236 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3237 if (error) { 3238 WPRINTF("%s pci alloc mem bar failed", __func__); 3239 goto done; 3240 } 3241 3242 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3243 if (error) { 3244 WPRINTF("%s pci add msixcap failed", __func__); 3245 goto done; 3246 } 3247 3248 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3249 if (error) { 3250 WPRINTF("%s pci add Express capability failed", __func__); 3251 goto done; 3252 } 3253 3254 pthread_mutex_init(&sc->mtx, NULL); 3255 sem_init(&sc->iosemlock, 0, sc->ioslots); 3256 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3257 3258 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3259 /* 3260 * Controller data depends on Namespace data so initialize Namespace 3261 * data first. 
3262 */ 3263 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3264 pci_nvme_init_ctrldata(sc); 3265 pci_nvme_init_logpages(sc); 3266 pci_nvme_init_features(sc); 3267 3268 pci_nvme_aer_init(sc); 3269 pci_nvme_aen_init(sc); 3270 3271 pci_nvme_reset(sc); 3272 3273 pci_lintr_request(pi); 3274 3275 done: 3276 return (error); 3277 } 3278 3279 static int 3280 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3281 { 3282 char *cp, *ram; 3283 3284 if (opts == NULL) 3285 return (0); 3286 3287 if (strncmp(opts, "ram=", 4) == 0) { 3288 cp = strchr(opts, ','); 3289 if (cp == NULL) { 3290 set_config_value_node(nvl, "ram", opts + 4); 3291 return (0); 3292 } 3293 ram = strndup(opts + 4, cp - opts - 4); 3294 set_config_value_node(nvl, "ram", ram); 3295 free(ram); 3296 return (pci_parse_legacy_config(nvl, cp + 1)); 3297 } else 3298 return (blockif_legacy_config(nvl, opts)); 3299 } 3300 3301 struct pci_devemu pci_de_nvme = { 3302 .pe_emu = "nvme", 3303 .pe_init = pci_nvme_init, 3304 .pe_legacy_config = pci_nvme_legacy_config, 3305 .pe_barwrite = pci_nvme_write, 3306 .pe_barread = pci_nvme_read 3307 }; 3308 PCI_EMUL_SET(pci_de_nvme); 3309
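
/*
 * Illustrative note (not part of the emulation): a BAR0 doorbell write decodes
 * into a queue update using the arithmetic in pci_nvme_write_bar_0() and
 * pci_nvme_handle_doorbell() above, assuming the 4-byte doorbell stride that
 * the 8-bytes-per-queue-pair layout implies (CAP.DSTRD = 0):
 *
 *	uint64_t bell = offset - NVME_DOORBELL_OFFSET;
 *	uint64_t qid  = bell / 8;          (SQ[qid] tail and CQ[qid] head pair)
 *	bool sq_tail  = (bell % 8) < 4;    (low word: SQ tail; high word: CQ head)
 *
 * For example, a 4-byte write at BAR0 offset NVME_DOORBELL_OFFSET + 0x8
 * updates the tail doorbell of I/O submission queue 1, while a write at
 * NVME_DOORBELL_OFFSET + 0xc updates the head doorbell of completion queue 1.
 */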