1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <semaphore.h> 69 #include <stdbool.h> 70 #include <stddef.h> 71 #include <stdint.h> 72 #include <stdio.h> 73 #include <stdlib.h> 74 #include <string.h> 75 76 #include <machine/atomic.h> 77 #include <machine/vmm.h> 78 #include <vmmapi.h> 79 80 #include <dev/nvme/nvme.h> 81 82 #include "bhyverun.h" 83 #include "block_if.h" 84 #include "debug.h" 85 #include "pci_emul.h" 86 87 88 static int nvme_debug = 0; 89 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 91 92 /* defaults; can be overridden */ 93 #define NVME_MSIX_BAR 4 94 95 #define NVME_IOSLOTS 8 96 97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 98 #define NVME_MMIO_SPACE_MIN (1 << 14) 99 100 #define NVME_QUEUES 16 101 #define NVME_MAX_QENTRIES 2048 102 /* Memory Page size Minimum reported in CAP register */ 103 #define NVME_MPSMIN 0 104 /* MPSMIN converted to bytes */ 105 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 106 107 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 108 #define NVME_MDTS 9 109 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 110 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 111 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 112 113 /* This is a synthetic status code to indicate there is no status */ 114 #define NVME_NO_STATUS 0xffff 115 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 116 117 /* helpers */ 118 119 /* Convert a zero-based value into a one-based value */ 120 #define ONE_BASED(zero) ((zero) + 1) 121 /* Convert a one-based value into a zero-based value */ 122 #define ZERO_BASED(one) ((one) - 1) 123 124 /* Encode number of SQ's and CQ's for Set/Get Features */ 125 #define NVME_FEATURE_NUM_QUEUES(sc) \ 126 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 127 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 128 129 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 130 131 enum nvme_controller_register_offsets { 132 NVME_CR_CAP_LOW = 0x00, 133 NVME_CR_CAP_HI = 0x04, 134 NVME_CR_VS = 0x08, 135 NVME_CR_INTMS = 0x0c, 136 NVME_CR_INTMC = 0x10, 137 NVME_CR_CC = 0x14, 138 NVME_CR_CSTS = 0x1c, 139 NVME_CR_NSSR = 0x20, 140 NVME_CR_AQA = 0x24, 141 NVME_CR_ASQ_LOW = 0x28, 142 NVME_CR_ASQ_HI = 0x2c, 143 NVME_CR_ACQ_LOW = 0x30, 144 NVME_CR_ACQ_HI = 0x34, 145 }; 146 147 enum nvme_cmd_cdw11 { 148 NVME_CMD_CDW11_PC = 0x0001, 149 NVME_CMD_CDW11_IEN = 0x0002, 150 NVME_CMD_CDW11_IV = 0xFFFF0000, 151 }; 152 153 enum nvme_copy_dir { 154 NVME_COPY_TO_PRP, 155 NVME_COPY_FROM_PRP, 156 }; 157 158 #define NVME_CQ_INTEN 0x01 159 #define NVME_CQ_INTCOAL 0x02 160 161 struct nvme_completion_queue { 162 struct nvme_completion *qbase; 163 pthread_mutex_t mtx; 164 uint32_t size; 165 uint16_t tail; /* nvme progress */ 166 uint16_t head; /* guest progress */ 167 uint16_t intr_vec; 168 uint32_t intr_en; 169 }; 170 171 struct nvme_submission_queue { 172 struct nvme_command *qbase; 173 pthread_mutex_t mtx; 174 uint32_t size; 175 uint16_t head; /* nvme progress */ 176 uint16_t tail; /* guest progress */ 177 uint16_t cqid; /* completion queue id */ 178 int qpriority; 179 }; 180 181 enum nvme_storage_type { 182 NVME_STOR_BLOCKIF = 0, 183 NVME_STOR_RAM = 1, 184 }; 185 186 struct pci_nvme_blockstore { 187 enum nvme_storage_type type; 188 void *ctx; 189 uint64_t size; 190 uint32_t sectsz; 191 uint32_t sectsz_bits; 192 uint64_t eui64; 193 uint32_t deallocate:1; 194 }; 195 196 /* 197 * Calculate the number of additional page descriptors for guest IO requests 198 * based on the advertised Max Data Transfer (MDTS) and given the number of 199 * default iovec's in a struct blockif_req. 200 * 201 * Note the + 1 allows for the initial descriptor to not be page aligned. 202 */ 203 #define MDTS_PAD_SIZE \ 204 NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 205 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 206 0 207 208 struct pci_nvme_ioreq { 209 struct pci_nvme_softc *sc; 210 STAILQ_ENTRY(pci_nvme_ioreq) link; 211 struct nvme_submission_queue *nvme_sq; 212 uint16_t sqid; 213 214 /* command information */ 215 uint16_t opc; 216 uint16_t cid; 217 uint32_t nsid; 218 219 uint64_t prev_gpaddr; 220 size_t prev_size; 221 size_t bytes; 222 223 struct blockif_req io_req; 224 225 struct iovec iovpadding[MDTS_PAD_SIZE]; 226 }; 227 228 enum nvme_dsm_type { 229 /* Dataset Management bit in ONCS reflects backing storage capability */ 230 NVME_DATASET_MANAGEMENT_AUTO, 231 /* Unconditionally set Dataset Management bit in ONCS */ 232 NVME_DATASET_MANAGEMENT_ENABLE, 233 /* Unconditionally clear Dataset Management bit in ONCS */ 234 NVME_DATASET_MANAGEMENT_DISABLE, 235 }; 236 237 struct pci_nvme_softc; 238 struct nvme_feature_obj; 239 240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 241 struct nvme_feature_obj *, 242 struct nvme_command *, 243 struct nvme_completion *); 244 245 struct nvme_feature_obj { 246 uint32_t cdw11; 247 nvme_feature_cb set; 248 nvme_feature_cb get; 249 bool namespace_specific; 250 }; 251 252 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 253 254 struct pci_nvme_aer { 255 STAILQ_ENTRY(pci_nvme_aer) link; 256 uint16_t cid; /* Command ID of the submitted AER */ 257 }; 258 259 struct pci_nvme_softc { 260 struct pci_devinst *nsc_pi; 261 262 pthread_mutex_t mtx; 263 264 struct nvme_registers regs; 265 266 struct nvme_namespace_data nsdata; 267 struct nvme_controller_data ctrldata; 268 struct nvme_error_information_entry err_log; 269 struct nvme_health_information_page health_log; 270 struct nvme_firmware_page fw_log; 271 272 struct pci_nvme_blockstore nvstore; 273 274 uint16_t max_qentries; /* max entries per queue */ 275 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 276 uint32_t num_cqueues; 277 uint32_t num_squeues; 278 bool num_q_is_set; /* Has host set Number of Queues */ 279 280 struct pci_nvme_ioreq *ioreqs; 281 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 282 uint32_t pending_ios; 283 uint32_t ioslots; 284 sem_t iosemlock; 285 286 /* 287 * Memory mapped Submission and Completion queues 288 * Each array includes both Admin and IO queues 289 */ 290 struct nvme_completion_queue *compl_queues; 291 struct nvme_submission_queue *submit_queues; 292 293 struct nvme_feature_obj feat[NVME_FID_MAX]; 294 295 enum nvme_dsm_type dataset_management; 296 297 /* Accounting for SMART data */ 298 __uint128_t read_data_units; 299 __uint128_t write_data_units; 300 __uint128_t read_commands; 301 __uint128_t write_commands; 302 uint32_t read_dunits_remainder; 303 uint32_t write_dunits_remainder; 304 305 STAILQ_HEAD(, pci_nvme_aer) aer_list; 306 uint32_t aer_count; 307 }; 308 309 310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 312 static void pci_nvme_io_done(struct blockif_req *, int); 313 314 /* Controller Configuration utils */ 315 #define NVME_CC_GET_EN(cc) \ 316 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 317 #define NVME_CC_GET_CSS(cc) \ 318 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 319 #define NVME_CC_GET_SHN(cc) \ 320 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 321 #define NVME_CC_GET_IOSQES(cc) \ 322 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 323 #define NVME_CC_GET_IOCQES(cc) \ 324 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 325 326 #define NVME_CC_WRITE_MASK \ 327 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 328 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 329 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 330 331 #define NVME_CC_NEN_WRITE_MASK \ 332 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 333 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 334 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 335 336 /* Controller Status utils */ 337 #define NVME_CSTS_GET_RDY(sts) \ 338 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 339 340 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 341 342 /* Completion Queue status word utils */ 343 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 344 #define NVME_STATUS_MASK \ 345 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 346 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 347 348 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 349 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 350 351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 352 struct nvme_feature_obj *, 353 struct nvme_command *, 354 struct nvme_completion *); 355 static void nvme_feature_num_queues(struct pci_nvme_softc *, 356 struct nvme_feature_obj *, 357 struct nvme_command *, 358 struct nvme_completion *); 359 static void nvme_feature_iv_config(struct pci_nvme_softc *, 360 struct nvme_feature_obj *, 361 struct nvme_command *, 362 struct nvme_completion *); 363 364 static __inline void 365 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 366 { 367 size_t len; 368 369 len = strnlen(src, dst_size); 370 memset(dst, pad, dst_size); 371 memcpy(dst, src, len); 372 } 373 374 static __inline void 375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 376 { 377 378 *status &= ~NVME_STATUS_MASK; 379 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 380 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 381 } 382 383 static __inline void 384 pci_nvme_status_genc(uint16_t *status, uint16_t code) 385 { 386 387 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 388 } 389 390 /* 391 * Initialize the requested number or IO Submission and Completion Queues. 392 * Admin queues are allocated implicitly. 393 */ 394 static void 395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 396 { 397 uint32_t i; 398 399 /* 400 * Allocate and initialize the Submission Queues 401 */ 402 if (nsq > NVME_QUEUES) { 403 WPRINTF("%s: clamping number of SQ from %u to %u", 404 __func__, nsq, NVME_QUEUES); 405 nsq = NVME_QUEUES; 406 } 407 408 sc->num_squeues = nsq; 409 410 sc->submit_queues = calloc(sc->num_squeues + 1, 411 sizeof(struct nvme_submission_queue)); 412 if (sc->submit_queues == NULL) { 413 WPRINTF("%s: SQ allocation failed", __func__); 414 sc->num_squeues = 0; 415 } else { 416 struct nvme_submission_queue *sq = sc->submit_queues; 417 418 for (i = 0; i < sc->num_squeues; i++) 419 pthread_mutex_init(&sq[i].mtx, NULL); 420 } 421 422 /* 423 * Allocate and initialize the Completion Queues 424 */ 425 if (ncq > NVME_QUEUES) { 426 WPRINTF("%s: clamping number of CQ from %u to %u", 427 __func__, ncq, NVME_QUEUES); 428 ncq = NVME_QUEUES; 429 } 430 431 sc->num_cqueues = ncq; 432 433 sc->compl_queues = calloc(sc->num_cqueues + 1, 434 sizeof(struct nvme_completion_queue)); 435 if (sc->compl_queues == NULL) { 436 WPRINTF("%s: CQ allocation failed", __func__); 437 sc->num_cqueues = 0; 438 } else { 439 struct nvme_completion_queue *cq = sc->compl_queues; 440 441 for (i = 0; i < sc->num_cqueues; i++) 442 pthread_mutex_init(&cq[i].mtx, NULL); 443 } 444 } 445 446 static void 447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 448 { 449 struct nvme_controller_data *cd = &sc->ctrldata; 450 451 cd->vid = 0xFB5D; 452 cd->ssvid = 0x0000; 453 454 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 455 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 456 457 /* Num of submission commands that we can handle at a time (2^rab) */ 458 cd->rab = 4; 459 460 /* FreeBSD OUI */ 461 cd->ieee[0] = 0x58; 462 cd->ieee[1] = 0x9c; 463 cd->ieee[2] = 0xfc; 464 465 cd->mic = 0; 466 467 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 468 469 cd->ver = 0x00010300; 470 471 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 472 cd->acl = 2; 473 cd->aerl = 4; 474 475 /* Advertise 1, Read-only firmware slot */ 476 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 477 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 478 cd->lpa = 0; /* TODO: support some simple things like SMART */ 479 cd->elpe = 0; /* max error log page entries */ 480 cd->npss = 1; /* number of power states support */ 481 482 /* Warning Composite Temperature Threshold */ 483 cd->wctemp = 0x0157; 484 485 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 486 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 487 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 488 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 489 cd->nn = 1; /* number of namespaces */ 490 491 cd->oncs = 0; 492 switch (sc->dataset_management) { 493 case NVME_DATASET_MANAGEMENT_AUTO: 494 if (sc->nvstore.deallocate) 495 cd->oncs |= NVME_ONCS_DSM; 496 break; 497 case NVME_DATASET_MANAGEMENT_ENABLE: 498 cd->oncs |= NVME_ONCS_DSM; 499 break; 500 default: 501 break; 502 } 503 504 cd->fna = 0x03; 505 506 cd->power_state[0].mp = 10; 507 } 508 509 /* 510 * Calculate the CRC-16 of the given buffer 511 * See copyright attribution at top of file 512 */ 513 static uint16_t 514 crc16(uint16_t crc, const void *buffer, unsigned int len) 515 { 516 const unsigned char *cp = buffer; 517 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 518 static uint16_t const crc16_table[256] = { 519 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 520 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 521 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 522 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 523 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 524 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 525 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 526 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 527 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 528 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 529 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 530 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 531 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 532 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 533 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 534 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 535 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 536 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 537 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 538 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 539 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 540 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 541 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 542 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 543 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 544 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 545 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 546 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 547 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 548 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 549 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 550 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 551 }; 552 553 while (len--) 554 crc = (((crc >> 8) & 0xffU) ^ 555 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 556 return crc; 557 } 558 559 static void 560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 561 struct nvme_namespace_data *nd, uint32_t nsid, 562 struct pci_nvme_blockstore *nvstore) 563 { 564 565 /* Get capacity and block size information from backing store */ 566 nd->nsze = nvstore->size / nvstore->sectsz; 567 nd->ncap = nd->nsze; 568 nd->nuse = nd->nsze; 569 570 if (nvstore->type == NVME_STOR_BLOCKIF) 571 nvstore->deallocate = blockif_candelete(nvstore->ctx); 572 573 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 574 nd->flbas = 0; 575 576 /* Create an EUI-64 if user did not provide one */ 577 if (nvstore->eui64 == 0) { 578 char *data = NULL; 579 uint64_t eui64 = nvstore->eui64; 580 581 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus, 582 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 583 584 if (data != NULL) { 585 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 586 free(data); 587 } 588 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 589 } 590 be64enc(nd->eui64, nvstore->eui64); 591 592 /* LBA data-sz = 2^lbads */ 593 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 594 } 595 596 static void 597 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 598 { 599 600 memset(&sc->err_log, 0, sizeof(sc->err_log)); 601 memset(&sc->health_log, 0, sizeof(sc->health_log)); 602 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 603 604 /* Set read/write remainder to round up according to spec */ 605 sc->read_dunits_remainder = 999; 606 sc->write_dunits_remainder = 999; 607 608 /* Set nominal Health values checked by implementations */ 609 sc->health_log.temperature = 310; 610 sc->health_log.available_spare = 100; 611 sc->health_log.available_spare_threshold = 10; 612 } 613 614 static void 615 pci_nvme_init_features(struct pci_nvme_softc *sc) 616 { 617 618 sc->feat[0].set = nvme_feature_invalid_cb; 619 sc->feat[0].get = nvme_feature_invalid_cb; 620 621 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; 622 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; 623 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; 624 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = 625 nvme_feature_iv_config; 626 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = 627 nvme_feature_invalid_cb; 628 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = 629 nvme_feature_invalid_cb; 630 } 631 632 static void 633 pci_nvme_aer_init(struct pci_nvme_softc *sc) 634 { 635 636 STAILQ_INIT(&sc->aer_list); 637 sc->aer_count = 0; 638 } 639 640 static void 641 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 642 { 643 struct pci_nvme_aer *aer = NULL; 644 645 while (!STAILQ_EMPTY(&sc->aer_list)) { 646 aer = STAILQ_FIRST(&sc->aer_list); 647 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 648 free(aer); 649 } 650 651 pci_nvme_aer_init(sc); 652 } 653 654 static bool 655 pci_nvme_aer_available(struct pci_nvme_softc *sc) 656 { 657 658 return (!STAILQ_EMPTY(&sc->aer_list)); 659 } 660 661 static bool 662 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 663 { 664 struct nvme_controller_data *cd = &sc->ctrldata; 665 666 /* AERL is a zero based value while aer_count is one's based */ 667 return (sc->aer_count == (cd->aerl + 1)); 668 } 669 670 /* 671 * Add an Async Event Request 672 * 673 * Stores an AER to be returned later if the Controller needs to notify the 674 * host of an event. 675 * Note that while the NVMe spec doesn't require Controllers to return AER's 676 * in order, this implementation does preserve the order. 677 */ 678 static int 679 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 680 { 681 struct pci_nvme_aer *aer = NULL; 682 683 if (pci_nvme_aer_limit_reached(sc)) 684 return (-1); 685 686 aer = calloc(1, sizeof(struct pci_nvme_aer)); 687 if (aer == NULL) 688 return (-1); 689 690 sc->aer_count++; 691 692 /* Save the Command ID for use in the completion message */ 693 aer->cid = cid; 694 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 695 696 return (0); 697 } 698 699 /* 700 * Get an Async Event Request structure 701 * 702 * Returns a pointer to an AER previously submitted by the host or NULL if 703 * no AER's exist. Caller is responsible for freeing the returned struct. 704 */ 705 static struct pci_nvme_aer * 706 pci_nvme_aer_get(struct pci_nvme_softc *sc) 707 { 708 struct pci_nvme_aer *aer = NULL; 709 710 aer = STAILQ_FIRST(&sc->aer_list); 711 if (aer != NULL) { 712 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 713 sc->aer_count--; 714 } 715 716 return (aer); 717 } 718 719 static void 720 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 721 { 722 uint32_t i; 723 724 DPRINTF("%s", __func__); 725 726 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 727 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 728 (60 << NVME_CAP_LO_REG_TO_SHIFT); 729 730 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 731 732 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 733 734 sc->regs.cc = 0; 735 sc->regs.csts = 0; 736 737 assert(sc->submit_queues != NULL); 738 739 for (i = 0; i < sc->num_squeues + 1; i++) { 740 sc->submit_queues[i].qbase = NULL; 741 sc->submit_queues[i].size = 0; 742 sc->submit_queues[i].cqid = 0; 743 sc->submit_queues[i].tail = 0; 744 sc->submit_queues[i].head = 0; 745 } 746 747 assert(sc->compl_queues != NULL); 748 749 for (i = 0; i < sc->num_cqueues + 1; i++) { 750 sc->compl_queues[i].qbase = NULL; 751 sc->compl_queues[i].size = 0; 752 sc->compl_queues[i].tail = 0; 753 sc->compl_queues[i].head = 0; 754 } 755 756 sc->num_q_is_set = false; 757 758 pci_nvme_aer_destroy(sc); 759 } 760 761 static void 762 pci_nvme_reset(struct pci_nvme_softc *sc) 763 { 764 pthread_mutex_lock(&sc->mtx); 765 pci_nvme_reset_locked(sc); 766 pthread_mutex_unlock(&sc->mtx); 767 } 768 769 static void 770 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 771 { 772 uint16_t acqs, asqs; 773 774 DPRINTF("%s", __func__); 775 776 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 777 sc->submit_queues[0].size = asqs; 778 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 779 sizeof(struct nvme_command) * asqs); 780 781 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 782 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 783 784 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 785 NVME_AQA_REG_ACQS_MASK) + 1; 786 sc->compl_queues[0].size = acqs; 787 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 788 sizeof(struct nvme_completion) * acqs); 789 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 790 791 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 792 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 793 } 794 795 static int 796 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 797 size_t len, enum nvme_copy_dir dir) 798 { 799 uint8_t *p; 800 size_t bytes; 801 802 if (len > (8 * 1024)) { 803 return (-1); 804 } 805 806 /* Copy from the start of prp1 to the end of the physical page */ 807 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 808 bytes = MIN(bytes, len); 809 810 p = vm_map_gpa(ctx, prp1, bytes); 811 if (p == NULL) { 812 return (-1); 813 } 814 815 if (dir == NVME_COPY_TO_PRP) 816 memcpy(p, b, bytes); 817 else 818 memcpy(b, p, bytes); 819 820 b += bytes; 821 822 len -= bytes; 823 if (len == 0) { 824 return (0); 825 } 826 827 len = MIN(len, PAGE_SIZE); 828 829 p = vm_map_gpa(ctx, prp2, len); 830 if (p == NULL) { 831 return (-1); 832 } 833 834 if (dir == NVME_COPY_TO_PRP) 835 memcpy(p, b, len); 836 else 837 memcpy(b, p, len); 838 839 return (0); 840 } 841 842 /* 843 * Write a Completion Queue Entry update 844 * 845 * Write the completion and update the doorbell value 846 */ 847 static void 848 pci_nvme_cq_update(struct pci_nvme_softc *sc, 849 struct nvme_completion_queue *cq, 850 uint32_t cdw0, 851 uint16_t cid, 852 uint16_t sqid, 853 uint16_t status) 854 { 855 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 856 struct nvme_completion *cqe; 857 858 assert(cq->qbase != NULL); 859 860 pthread_mutex_lock(&cq->mtx); 861 862 cqe = &cq->qbase[cq->tail]; 863 864 /* Flip the phase bit */ 865 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 866 867 cqe->cdw0 = cdw0; 868 cqe->sqhd = sq->head; 869 cqe->sqid = sqid; 870 cqe->cid = cid; 871 cqe->status = status; 872 873 cq->tail++; 874 if (cq->tail >= cq->size) { 875 cq->tail = 0; 876 } 877 878 pthread_mutex_unlock(&cq->mtx); 879 } 880 881 static int 882 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 883 struct nvme_completion* compl) 884 { 885 uint16_t qid = command->cdw10 & 0xffff; 886 887 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 888 if (qid == 0 || qid > sc->num_squeues || 889 (sc->submit_queues[qid].qbase == NULL)) { 890 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 891 __func__, qid, sc->num_squeues); 892 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 893 NVME_SC_INVALID_QUEUE_IDENTIFIER); 894 return (1); 895 } 896 897 sc->submit_queues[qid].qbase = NULL; 898 sc->submit_queues[qid].cqid = 0; 899 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 900 return (1); 901 } 902 903 static int 904 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 905 struct nvme_completion* compl) 906 { 907 if (command->cdw11 & NVME_CMD_CDW11_PC) { 908 uint16_t qid = command->cdw10 & 0xffff; 909 struct nvme_submission_queue *nsq; 910 911 if ((qid == 0) || (qid > sc->num_squeues) || 912 (sc->submit_queues[qid].qbase != NULL)) { 913 WPRINTF("%s queue index %u > num_squeues %u", 914 __func__, qid, sc->num_squeues); 915 pci_nvme_status_tc(&compl->status, 916 NVME_SCT_COMMAND_SPECIFIC, 917 NVME_SC_INVALID_QUEUE_IDENTIFIER); 918 return (1); 919 } 920 921 nsq = &sc->submit_queues[qid]; 922 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 923 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 924 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 925 /* 926 * Queues must specify at least two entries 927 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 928 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 929 */ 930 pci_nvme_status_tc(&compl->status, 931 NVME_SCT_COMMAND_SPECIFIC, 932 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 933 return (1); 934 } 935 nsq->head = nsq->tail = 0; 936 937 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 938 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 939 pci_nvme_status_tc(&compl->status, 940 NVME_SCT_COMMAND_SPECIFIC, 941 NVME_SC_INVALID_QUEUE_IDENTIFIER); 942 return (1); 943 } 944 945 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 946 pci_nvme_status_tc(&compl->status, 947 NVME_SCT_COMMAND_SPECIFIC, 948 NVME_SC_COMPLETION_QUEUE_INVALID); 949 return (1); 950 } 951 952 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 953 954 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 955 sizeof(struct nvme_command) * (size_t)nsq->size); 956 957 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 958 qid, nsq->size, nsq->qbase, nsq->cqid); 959 960 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 961 962 DPRINTF("%s completed creating IOSQ qid %u", 963 __func__, qid); 964 } else { 965 /* 966 * Guest sent non-cont submission queue request. 967 * This setting is unsupported by this emulation. 968 */ 969 WPRINTF("%s unsupported non-contig (list-based) " 970 "create i/o submission queue", __func__); 971 972 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 973 } 974 return (1); 975 } 976 977 static int 978 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 979 struct nvme_completion* compl) 980 { 981 uint16_t qid = command->cdw10 & 0xffff; 982 uint16_t sqid; 983 984 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 985 if (qid == 0 || qid > sc->num_cqueues || 986 (sc->compl_queues[qid].qbase == NULL)) { 987 WPRINTF("%s queue index %u / num_cqueues %u", 988 __func__, qid, sc->num_cqueues); 989 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 990 NVME_SC_INVALID_QUEUE_IDENTIFIER); 991 return (1); 992 } 993 994 /* Deleting an Active CQ is an error */ 995 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 996 if (sc->submit_queues[sqid].cqid == qid) { 997 pci_nvme_status_tc(&compl->status, 998 NVME_SCT_COMMAND_SPECIFIC, 999 NVME_SC_INVALID_QUEUE_DELETION); 1000 return (1); 1001 } 1002 1003 sc->compl_queues[qid].qbase = NULL; 1004 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1005 return (1); 1006 } 1007 1008 static int 1009 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1010 struct nvme_completion* compl) 1011 { 1012 struct nvme_completion_queue *ncq; 1013 uint16_t qid = command->cdw10 & 0xffff; 1014 1015 /* Only support Physically Contiguous queues */ 1016 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1017 WPRINTF("%s unsupported non-contig (list-based) " 1018 "create i/o completion queue", 1019 __func__); 1020 1021 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1022 return (1); 1023 } 1024 1025 if ((qid == 0) || (qid > sc->num_cqueues) || 1026 (sc->compl_queues[qid].qbase != NULL)) { 1027 WPRINTF("%s queue index %u > num_cqueues %u", 1028 __func__, qid, sc->num_cqueues); 1029 pci_nvme_status_tc(&compl->status, 1030 NVME_SCT_COMMAND_SPECIFIC, 1031 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1032 return (1); 1033 } 1034 1035 ncq = &sc->compl_queues[qid]; 1036 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1037 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1038 if (ncq->intr_vec > (sc->max_queues + 1)) { 1039 pci_nvme_status_tc(&compl->status, 1040 NVME_SCT_COMMAND_SPECIFIC, 1041 NVME_SC_INVALID_INTERRUPT_VECTOR); 1042 return (1); 1043 } 1044 1045 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1046 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1047 /* 1048 * Queues must specify at least two entries 1049 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1050 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1051 */ 1052 pci_nvme_status_tc(&compl->status, 1053 NVME_SCT_COMMAND_SPECIFIC, 1054 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1055 return (1); 1056 } 1057 ncq->head = ncq->tail = 0; 1058 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1059 command->prp1, 1060 sizeof(struct nvme_command) * (size_t)ncq->size); 1061 1062 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1063 1064 1065 return (1); 1066 } 1067 1068 static int 1069 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1070 struct nvme_completion* compl) 1071 { 1072 uint32_t logsize; 1073 uint8_t logpage = command->cdw10 & 0xFF; 1074 1075 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1076 1077 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1078 1079 /* 1080 * Command specifies the number of dwords to return in fields NUMDU 1081 * and NUMDL. This is a zero-based value. 1082 */ 1083 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1084 logsize *= sizeof(uint32_t); 1085 1086 switch (logpage) { 1087 case NVME_LOG_ERROR: 1088 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1089 command->prp2, (uint8_t *)&sc->err_log, 1090 MIN(logsize, sizeof(sc->err_log)), 1091 NVME_COPY_TO_PRP); 1092 break; 1093 case NVME_LOG_HEALTH_INFORMATION: 1094 pthread_mutex_lock(&sc->mtx); 1095 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1096 sizeof(sc->health_log.data_units_read)); 1097 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1098 sizeof(sc->health_log.data_units_written)); 1099 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1100 sizeof(sc->health_log.host_read_commands)); 1101 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1102 sizeof(sc->health_log.host_write_commands)); 1103 pthread_mutex_unlock(&sc->mtx); 1104 1105 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1106 command->prp2, (uint8_t *)&sc->health_log, 1107 MIN(logsize, sizeof(sc->health_log)), 1108 NVME_COPY_TO_PRP); 1109 break; 1110 case NVME_LOG_FIRMWARE_SLOT: 1111 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1112 command->prp2, (uint8_t *)&sc->fw_log, 1113 MIN(logsize, sizeof(sc->fw_log)), 1114 NVME_COPY_TO_PRP); 1115 break; 1116 default: 1117 DPRINTF("%s get log page %x command not supported", 1118 __func__, logpage); 1119 1120 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1121 NVME_SC_INVALID_LOG_PAGE); 1122 } 1123 1124 return (1); 1125 } 1126 1127 static int 1128 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1129 struct nvme_completion* compl) 1130 { 1131 void *dest; 1132 uint16_t status; 1133 1134 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1135 command->cdw10 & 0xFF, command->nsid); 1136 1137 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1138 1139 switch (command->cdw10 & 0xFF) { 1140 case 0x00: /* return Identify Namespace data structure */ 1141 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1142 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1143 NVME_COPY_TO_PRP); 1144 break; 1145 case 0x01: /* return Identify Controller data structure */ 1146 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1147 command->prp2, (uint8_t *)&sc->ctrldata, 1148 sizeof(sc->ctrldata), 1149 NVME_COPY_TO_PRP); 1150 break; 1151 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1152 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1153 sizeof(uint32_t) * 1024); 1154 /* All unused entries shall be zero */ 1155 bzero(dest, sizeof(uint32_t) * 1024); 1156 ((uint32_t *)dest)[0] = 1; 1157 break; 1158 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1159 if (command->nsid != 1) { 1160 pci_nvme_status_genc(&status, 1161 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1162 break; 1163 } 1164 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1165 sizeof(uint32_t) * 1024); 1166 /* All bytes after the descriptor shall be zero */ 1167 bzero(dest, sizeof(uint32_t) * 1024); 1168 1169 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1170 ((uint8_t *)dest)[0] = 1; 1171 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1172 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1173 break; 1174 default: 1175 DPRINTF("%s unsupported identify command requested 0x%x", 1176 __func__, command->cdw10 & 0xFF); 1177 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1178 break; 1179 } 1180 1181 compl->status = status; 1182 return (1); 1183 } 1184 1185 static const char * 1186 nvme_fid_to_name(uint8_t fid) 1187 { 1188 const char *name; 1189 1190 switch (fid) { 1191 case NVME_FEAT_ARBITRATION: 1192 name = "Arbitration"; 1193 break; 1194 case NVME_FEAT_POWER_MANAGEMENT: 1195 name = "Power Management"; 1196 break; 1197 case NVME_FEAT_LBA_RANGE_TYPE: 1198 name = "LBA Range Type"; 1199 break; 1200 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1201 name = "Temperature Threshold"; 1202 break; 1203 case NVME_FEAT_ERROR_RECOVERY: 1204 name = "Error Recovery"; 1205 break; 1206 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1207 name = "Volatile Write Cache"; 1208 break; 1209 case NVME_FEAT_NUMBER_OF_QUEUES: 1210 name = "Number of Queues"; 1211 break; 1212 case NVME_FEAT_INTERRUPT_COALESCING: 1213 name = "Interrupt Coalescing"; 1214 break; 1215 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1216 name = "Interrupt Vector Configuration"; 1217 break; 1218 case NVME_FEAT_WRITE_ATOMICITY: 1219 name = "Write Atomicity Normal"; 1220 break; 1221 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1222 name = "Asynchronous Event Configuration"; 1223 break; 1224 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1225 name = "Autonomous Power State Transition"; 1226 break; 1227 case NVME_FEAT_HOST_MEMORY_BUFFER: 1228 name = "Host Memory Buffer"; 1229 break; 1230 case NVME_FEAT_TIMESTAMP: 1231 name = "Timestamp"; 1232 break; 1233 case NVME_FEAT_KEEP_ALIVE_TIMER: 1234 name = "Keep Alive Timer"; 1235 break; 1236 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1237 name = "Host Controlled Thermal Management"; 1238 break; 1239 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1240 name = "Non-Operation Power State Config"; 1241 break; 1242 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1243 name = "Read Recovery Level Config"; 1244 break; 1245 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1246 name = "Predictable Latency Mode Config"; 1247 break; 1248 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1249 name = "Predictable Latency Mode Window"; 1250 break; 1251 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1252 name = "LBA Status Information Report Interval"; 1253 break; 1254 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1255 name = "Host Behavior Support"; 1256 break; 1257 case NVME_FEAT_SANITIZE_CONFIG: 1258 name = "Sanitize Config"; 1259 break; 1260 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1261 name = "Endurance Group Event Configuration"; 1262 break; 1263 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1264 name = "Software Progress Marker"; 1265 break; 1266 case NVME_FEAT_HOST_IDENTIFIER: 1267 name = "Host Identifier"; 1268 break; 1269 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1270 name = "Reservation Notification Mask"; 1271 break; 1272 case NVME_FEAT_RESERVATION_PERSISTENCE: 1273 name = "Reservation Persistence"; 1274 break; 1275 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1276 name = "Namespace Write Protection Config"; 1277 break; 1278 default: 1279 name = "Unknown"; 1280 break; 1281 } 1282 1283 return (name); 1284 } 1285 1286 static void 1287 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1288 struct nvme_feature_obj *feat, 1289 struct nvme_command *command, 1290 struct nvme_completion *compl) 1291 { 1292 1293 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1294 } 1295 1296 static void 1297 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1298 struct nvme_feature_obj *feat, 1299 struct nvme_command *command, 1300 struct nvme_completion *compl) 1301 { 1302 uint32_t i; 1303 uint32_t cdw11 = command->cdw11; 1304 uint16_t iv; 1305 bool cd; 1306 1307 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1308 1309 iv = cdw11 & 0xffff; 1310 cd = cdw11 & (1 << 16); 1311 1312 if (iv > (sc->max_queues + 1)) { 1313 return; 1314 } 1315 1316 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1317 if ((iv == 0) && !cd) 1318 return; 1319 1320 /* Requested Interrupt Vector must be used by a CQ */ 1321 for (i = 0; i < sc->num_cqueues + 1; i++) { 1322 if (sc->compl_queues[i].intr_vec == iv) { 1323 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1324 } 1325 } 1326 1327 } 1328 1329 static void 1330 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1331 struct nvme_feature_obj *feat, 1332 struct nvme_command *command, 1333 struct nvme_completion *compl) 1334 { 1335 uint16_t nqr; /* Number of Queues Requested */ 1336 1337 if (sc->num_q_is_set) { 1338 WPRINTF("%s: Number of Queues already set", __func__); 1339 pci_nvme_status_genc(&compl->status, 1340 NVME_SC_COMMAND_SEQUENCE_ERROR); 1341 return; 1342 } 1343 1344 nqr = command->cdw11 & 0xFFFF; 1345 if (nqr == 0xffff) { 1346 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1347 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1348 return; 1349 } 1350 1351 sc->num_squeues = ONE_BASED(nqr); 1352 if (sc->num_squeues > sc->max_queues) { 1353 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1354 sc->max_queues); 1355 sc->num_squeues = sc->max_queues; 1356 } 1357 1358 nqr = (command->cdw11 >> 16) & 0xFFFF; 1359 if (nqr == 0xffff) { 1360 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1361 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1362 return; 1363 } 1364 1365 sc->num_cqueues = ONE_BASED(nqr); 1366 if (sc->num_cqueues > sc->max_queues) { 1367 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1368 sc->max_queues); 1369 sc->num_cqueues = sc->max_queues; 1370 } 1371 1372 /* Patch the command value which will be saved on callback's return */ 1373 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1374 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1375 1376 sc->num_q_is_set = true; 1377 } 1378 1379 static int 1380 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1381 struct nvme_completion *compl) 1382 { 1383 struct nvme_feature_obj *feat; 1384 uint32_t nsid = command->nsid; 1385 uint8_t fid = command->cdw10 & 0xFF; 1386 1387 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1388 1389 if (fid >= NVME_FID_MAX) { 1390 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1391 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1392 return (1); 1393 } 1394 feat = &sc->feat[fid]; 1395 1396 if (!feat->namespace_specific && 1397 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1398 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1399 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1400 return (1); 1401 } 1402 1403 compl->cdw0 = 0; 1404 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1405 1406 if (feat->set) 1407 feat->set(sc, feat, command, compl); 1408 1409 if (compl->status == NVME_SC_SUCCESS) 1410 feat->cdw11 = command->cdw11; 1411 1412 return (0); 1413 } 1414 1415 static int 1416 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1417 struct nvme_completion* compl) 1418 { 1419 struct nvme_feature_obj *feat; 1420 uint8_t fid = command->cdw10 & 0xFF; 1421 1422 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1423 1424 if (fid >= NVME_FID_MAX) { 1425 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1426 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1427 return (1); 1428 } 1429 1430 compl->cdw0 = 0; 1431 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1432 1433 feat = &sc->feat[fid]; 1434 if (feat->get) { 1435 feat->get(sc, feat, command, compl); 1436 } 1437 1438 if (compl->status == NVME_SC_SUCCESS) { 1439 compl->cdw0 = feat->cdw11; 1440 } 1441 1442 return (0); 1443 } 1444 1445 static int 1446 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1447 struct nvme_completion* compl) 1448 { 1449 uint8_t ses, lbaf, pi; 1450 1451 /* Only supports Secure Erase Setting - User Data Erase */ 1452 ses = (command->cdw10 >> 9) & 0x7; 1453 if (ses > 0x1) { 1454 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1455 return (1); 1456 } 1457 1458 /* Only supports a single LBA Format */ 1459 lbaf = command->cdw10 & 0xf; 1460 if (lbaf != 0) { 1461 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1462 NVME_SC_INVALID_FORMAT); 1463 return (1); 1464 } 1465 1466 /* Doesn't support Protection Infomation */ 1467 pi = (command->cdw10 >> 5) & 0x7; 1468 if (pi != 0) { 1469 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1470 return (1); 1471 } 1472 1473 if (sc->nvstore.type == NVME_STOR_RAM) { 1474 if (sc->nvstore.ctx) 1475 free(sc->nvstore.ctx); 1476 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1477 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1478 } else { 1479 struct pci_nvme_ioreq *req; 1480 int err; 1481 1482 req = pci_nvme_get_ioreq(sc); 1483 if (req == NULL) { 1484 pci_nvme_status_genc(&compl->status, 1485 NVME_SC_INTERNAL_DEVICE_ERROR); 1486 WPRINTF("%s: unable to allocate IO req", __func__); 1487 return (1); 1488 } 1489 req->nvme_sq = &sc->submit_queues[0]; 1490 req->sqid = 0; 1491 req->opc = command->opc; 1492 req->cid = command->cid; 1493 req->nsid = command->nsid; 1494 1495 req->io_req.br_offset = 0; 1496 req->io_req.br_resid = sc->nvstore.size; 1497 req->io_req.br_callback = pci_nvme_io_done; 1498 1499 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1500 if (err) { 1501 pci_nvme_status_genc(&compl->status, 1502 NVME_SC_INTERNAL_DEVICE_ERROR); 1503 pci_nvme_release_ioreq(sc, req); 1504 } 1505 } 1506 1507 return (1); 1508 } 1509 1510 static int 1511 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1512 struct nvme_completion* compl) 1513 { 1514 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1515 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1516 1517 /* TODO: search for the command ID and abort it */ 1518 1519 compl->cdw0 = 1; 1520 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1521 return (1); 1522 } 1523 1524 static int 1525 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1526 struct nvme_command* command, struct nvme_completion* compl) 1527 { 1528 DPRINTF("%s async event request 0x%x", __func__, command->cdw11); 1529 1530 /* Don't exceed the Async Event Request Limit (AERL). */ 1531 if (pci_nvme_aer_limit_reached(sc)) { 1532 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1533 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1534 return (1); 1535 } 1536 1537 if (pci_nvme_aer_add(sc, command->cid)) { 1538 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1539 NVME_SC_INTERNAL_DEVICE_ERROR); 1540 return (1); 1541 } 1542 1543 /* 1544 * Raise events when they happen based on the Set Features cmd. 1545 * These events happen async, so only set completion successful if 1546 * there is an event reflective of the request to get event. 1547 */ 1548 compl->status = NVME_NO_STATUS; 1549 1550 return (0); 1551 } 1552 1553 static void 1554 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1555 { 1556 struct nvme_completion compl; 1557 struct nvme_command *cmd; 1558 struct nvme_submission_queue *sq; 1559 struct nvme_completion_queue *cq; 1560 uint16_t sqhead; 1561 1562 DPRINTF("%s index %u", __func__, (uint32_t)value); 1563 1564 sq = &sc->submit_queues[0]; 1565 cq = &sc->compl_queues[0]; 1566 1567 pthread_mutex_lock(&sq->mtx); 1568 1569 sqhead = sq->head; 1570 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 1571 1572 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1573 cmd = &(sq->qbase)[sqhead]; 1574 compl.cdw0 = 0; 1575 compl.status = 0; 1576 1577 switch (cmd->opc) { 1578 case NVME_OPC_DELETE_IO_SQ: 1579 DPRINTF("%s command DELETE_IO_SQ", __func__); 1580 nvme_opc_delete_io_sq(sc, cmd, &compl); 1581 break; 1582 case NVME_OPC_CREATE_IO_SQ: 1583 DPRINTF("%s command CREATE_IO_SQ", __func__); 1584 nvme_opc_create_io_sq(sc, cmd, &compl); 1585 break; 1586 case NVME_OPC_DELETE_IO_CQ: 1587 DPRINTF("%s command DELETE_IO_CQ", __func__); 1588 nvme_opc_delete_io_cq(sc, cmd, &compl); 1589 break; 1590 case NVME_OPC_CREATE_IO_CQ: 1591 DPRINTF("%s command CREATE_IO_CQ", __func__); 1592 nvme_opc_create_io_cq(sc, cmd, &compl); 1593 break; 1594 case NVME_OPC_GET_LOG_PAGE: 1595 DPRINTF("%s command GET_LOG_PAGE", __func__); 1596 nvme_opc_get_log_page(sc, cmd, &compl); 1597 break; 1598 case NVME_OPC_IDENTIFY: 1599 DPRINTF("%s command IDENTIFY", __func__); 1600 nvme_opc_identify(sc, cmd, &compl); 1601 break; 1602 case NVME_OPC_ABORT: 1603 DPRINTF("%s command ABORT", __func__); 1604 nvme_opc_abort(sc, cmd, &compl); 1605 break; 1606 case NVME_OPC_SET_FEATURES: 1607 DPRINTF("%s command SET_FEATURES", __func__); 1608 nvme_opc_set_features(sc, cmd, &compl); 1609 break; 1610 case NVME_OPC_GET_FEATURES: 1611 DPRINTF("%s command GET_FEATURES", __func__); 1612 nvme_opc_get_features(sc, cmd, &compl); 1613 break; 1614 case NVME_OPC_FIRMWARE_ACTIVATE: 1615 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 1616 pci_nvme_status_tc(&compl.status, 1617 NVME_SCT_COMMAND_SPECIFIC, 1618 NVME_SC_INVALID_FIRMWARE_SLOT); 1619 break; 1620 case NVME_OPC_ASYNC_EVENT_REQUEST: 1621 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 1622 nvme_opc_async_event_req(sc, cmd, &compl); 1623 break; 1624 case NVME_OPC_FORMAT_NVM: 1625 DPRINTF("%s command FORMAT_NVM", __func__); 1626 if ((sc->ctrldata.oacs & 1627 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 1628 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1629 } 1630 compl.status = NVME_NO_STATUS; 1631 nvme_opc_format_nvm(sc, cmd, &compl); 1632 break; 1633 default: 1634 DPRINTF("0x%x command is not implemented", 1635 cmd->opc); 1636 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1637 } 1638 sqhead = (sqhead + 1) % sq->size; 1639 1640 if (NVME_COMPLETION_VALID(compl)) { 1641 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1642 compl.cdw0, 1643 cmd->cid, 1644 0, /* SQID */ 1645 compl.status); 1646 } 1647 } 1648 1649 DPRINTF("setting sqhead %u", sqhead); 1650 sq->head = sqhead; 1651 1652 if (cq->head != cq->tail) 1653 pci_generate_msix(sc->nsc_pi, 0); 1654 1655 pthread_mutex_unlock(&sq->mtx); 1656 } 1657 1658 /* 1659 * Update the Write and Read statistics reported in SMART data 1660 * 1661 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 1662 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 1663 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 1664 */ 1665 static void 1666 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 1667 size_t bytes, uint16_t status) 1668 { 1669 1670 pthread_mutex_lock(&sc->mtx); 1671 switch (opc) { 1672 case NVME_OPC_WRITE: 1673 sc->write_commands++; 1674 if (status != NVME_SC_SUCCESS) 1675 break; 1676 sc->write_dunits_remainder += (bytes / 512); 1677 while (sc->write_dunits_remainder >= 1000) { 1678 sc->write_data_units++; 1679 sc->write_dunits_remainder -= 1000; 1680 } 1681 break; 1682 case NVME_OPC_READ: 1683 sc->read_commands++; 1684 if (status != NVME_SC_SUCCESS) 1685 break; 1686 sc->read_dunits_remainder += (bytes / 512); 1687 while (sc->read_dunits_remainder >= 1000) { 1688 sc->read_data_units++; 1689 sc->read_dunits_remainder -= 1000; 1690 } 1691 break; 1692 default: 1693 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 1694 break; 1695 } 1696 pthread_mutex_unlock(&sc->mtx); 1697 } 1698 1699 /* 1700 * Check if the combination of Starting LBA (slba) and Number of Logical 1701 * Blocks (nlb) exceeds the range of the underlying storage. 1702 * 1703 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 1704 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 1705 * overflow. 1706 */ 1707 static bool 1708 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 1709 uint32_t nlb) 1710 { 1711 size_t offset, bytes; 1712 1713 /* Overflow check of multiplying Starting LBA by the sector size */ 1714 if (slba >> (64 - nvstore->sectsz_bits)) 1715 return (true); 1716 1717 offset = slba << nvstore->sectsz_bits; 1718 bytes = nlb << nvstore->sectsz_bits; 1719 1720 /* Overflow check of Number of Logical Blocks */ 1721 if ((nvstore->size - offset) < bytes) 1722 return (true); 1723 1724 return (false); 1725 } 1726 1727 static int 1728 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 1729 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 1730 { 1731 int iovidx; 1732 1733 if (req == NULL) 1734 return (-1); 1735 1736 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 1737 return (-1); 1738 } 1739 1740 /* concatenate contig block-iovs to minimize number of iovs */ 1741 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1742 iovidx = req->io_req.br_iovcnt - 1; 1743 1744 req->io_req.br_iov[iovidx].iov_base = 1745 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1746 req->prev_gpaddr, size); 1747 1748 req->prev_size += size; 1749 req->io_req.br_resid += size; 1750 1751 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1752 } else { 1753 iovidx = req->io_req.br_iovcnt; 1754 if (iovidx == 0) { 1755 req->io_req.br_offset = lba; 1756 req->io_req.br_resid = 0; 1757 req->io_req.br_param = req; 1758 } 1759 1760 req->io_req.br_iov[iovidx].iov_base = 1761 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1762 gpaddr, size); 1763 1764 req->io_req.br_iov[iovidx].iov_len = size; 1765 1766 req->prev_gpaddr = gpaddr; 1767 req->prev_size = size; 1768 req->io_req.br_resid += size; 1769 1770 req->io_req.br_iovcnt++; 1771 } 1772 1773 return (0); 1774 } 1775 1776 static void 1777 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1778 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1779 uint32_t cdw0, uint16_t status) 1780 { 1781 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1782 1783 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 1784 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1785 NVME_STATUS_GET_SC(status)); 1786 1787 pci_nvme_cq_update(sc, cq, 1788 0, /* CDW0 */ 1789 cid, 1790 sqid, 1791 status); 1792 1793 if (cq->head != cq->tail) { 1794 if (cq->intr_en & NVME_CQ_INTEN) { 1795 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1796 } else { 1797 DPRINTF("%s: CQ%u interrupt disabled", 1798 __func__, sq->cqid); 1799 } 1800 } 1801 } 1802 1803 static void 1804 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1805 { 1806 req->sc = NULL; 1807 req->nvme_sq = NULL; 1808 req->sqid = 0; 1809 1810 pthread_mutex_lock(&sc->mtx); 1811 1812 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 1813 sc->pending_ios--; 1814 1815 /* when no more IO pending, can set to ready if device reset/enabled */ 1816 if (sc->pending_ios == 0 && 1817 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 1818 sc->regs.csts |= NVME_CSTS_RDY; 1819 1820 pthread_mutex_unlock(&sc->mtx); 1821 1822 sem_post(&sc->iosemlock); 1823 } 1824 1825 static struct pci_nvme_ioreq * 1826 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 1827 { 1828 struct pci_nvme_ioreq *req = NULL;; 1829 1830 sem_wait(&sc->iosemlock); 1831 pthread_mutex_lock(&sc->mtx); 1832 1833 req = STAILQ_FIRST(&sc->ioreqs_free); 1834 assert(req != NULL); 1835 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 1836 1837 req->sc = sc; 1838 1839 sc->pending_ios++; 1840 1841 pthread_mutex_unlock(&sc->mtx); 1842 1843 req->io_req.br_iovcnt = 0; 1844 req->io_req.br_offset = 0; 1845 req->io_req.br_resid = 0; 1846 req->io_req.br_param = req; 1847 req->prev_gpaddr = 0; 1848 req->prev_size = 0; 1849 1850 return req; 1851 } 1852 1853 static void 1854 pci_nvme_io_done(struct blockif_req *br, int err) 1855 { 1856 struct pci_nvme_ioreq *req = br->br_param; 1857 struct nvme_submission_queue *sq = req->nvme_sq; 1858 uint16_t code, status; 1859 1860 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 1861 1862 /* TODO return correct error */ 1863 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1864 pci_nvme_status_genc(&status, code); 1865 1866 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 1867 pci_nvme_stats_write_read_update(req->sc, req->opc, 1868 req->bytes, status); 1869 pci_nvme_release_ioreq(req->sc, req); 1870 } 1871 1872 /* 1873 * Implements the Flush command. The specification states: 1874 * If a volatile write cache is not present, Flush commands complete 1875 * successfully and have no effect 1876 * in the description of the Volatile Write Cache (VWC) field of the Identify 1877 * Controller data. Therefore, set status to Success if the command is 1878 * not supported (i.e. RAM or as indicated by the blockif). 1879 */ 1880 static bool 1881 nvme_opc_flush(struct pci_nvme_softc *sc, 1882 struct nvme_command *cmd, 1883 struct pci_nvme_blockstore *nvstore, 1884 struct pci_nvme_ioreq *req, 1885 uint16_t *status) 1886 { 1887 bool pending = false; 1888 1889 if (nvstore->type == NVME_STOR_RAM) { 1890 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1891 } else { 1892 int err; 1893 1894 req->io_req.br_callback = pci_nvme_io_done; 1895 1896 err = blockif_flush(nvstore->ctx, &req->io_req); 1897 switch (err) { 1898 case 0: 1899 pending = true; 1900 break; 1901 case EOPNOTSUPP: 1902 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1903 break; 1904 default: 1905 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 1906 } 1907 } 1908 1909 return (pending); 1910 } 1911 1912 static uint16_t 1913 nvme_write_read_ram(struct pci_nvme_softc *sc, 1914 struct pci_nvme_blockstore *nvstore, 1915 uint64_t prp1, uint64_t prp2, 1916 size_t offset, uint64_t bytes, 1917 bool is_write) 1918 { 1919 uint8_t *buf = nvstore->ctx; 1920 enum nvme_copy_dir dir; 1921 uint16_t status; 1922 1923 if (is_write) 1924 dir = NVME_COPY_TO_PRP; 1925 else 1926 dir = NVME_COPY_FROM_PRP; 1927 1928 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 1929 buf + offset, bytes, dir)) 1930 pci_nvme_status_genc(&status, 1931 NVME_SC_DATA_TRANSFER_ERROR); 1932 else 1933 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1934 1935 return (status); 1936 } 1937 1938 static uint16_t 1939 nvme_write_read_blockif(struct pci_nvme_softc *sc, 1940 struct pci_nvme_blockstore *nvstore, 1941 struct pci_nvme_ioreq *req, 1942 uint64_t prp1, uint64_t prp2, 1943 size_t offset, uint64_t bytes, 1944 bool is_write) 1945 { 1946 uint64_t size; 1947 int err; 1948 uint16_t status = NVME_NO_STATUS; 1949 1950 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 1951 if (pci_nvme_append_iov_req(sc, req, prp1, 1952 size, is_write, offset)) { 1953 pci_nvme_status_genc(&status, 1954 NVME_SC_DATA_TRANSFER_ERROR); 1955 goto out; 1956 } 1957 1958 offset += size; 1959 bytes -= size; 1960 1961 if (bytes == 0) { 1962 ; 1963 } else if (bytes <= PAGE_SIZE) { 1964 size = bytes; 1965 if (pci_nvme_append_iov_req(sc, req, prp2, 1966 size, is_write, offset)) { 1967 pci_nvme_status_genc(&status, 1968 NVME_SC_DATA_TRANSFER_ERROR); 1969 goto out; 1970 } 1971 } else { 1972 void *vmctx = sc->nsc_pi->pi_vmctx; 1973 uint64_t *prp_list = &prp2; 1974 uint64_t *last = prp_list; 1975 1976 /* PRP2 is pointer to a physical region page list */ 1977 while (bytes) { 1978 /* Last entry in list points to the next list */ 1979 if (prp_list == last) { 1980 uint64_t prp = *prp_list; 1981 1982 prp_list = paddr_guest2host(vmctx, prp, 1983 PAGE_SIZE - (prp % PAGE_SIZE)); 1984 last = prp_list + (NVME_PRP2_ITEMS - 1); 1985 } 1986 1987 size = MIN(bytes, PAGE_SIZE); 1988 1989 if (pci_nvme_append_iov_req(sc, req, *prp_list, 1990 size, is_write, offset)) { 1991 pci_nvme_status_genc(&status, 1992 NVME_SC_DATA_TRANSFER_ERROR); 1993 goto out; 1994 } 1995 1996 offset += size; 1997 bytes -= size; 1998 1999 prp_list++; 2000 } 2001 } 2002 req->io_req.br_callback = pci_nvme_io_done; 2003 if (is_write) 2004 err = blockif_write(nvstore->ctx, &req->io_req); 2005 else 2006 err = blockif_read(nvstore->ctx, &req->io_req); 2007 2008 if (err) 2009 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2010 out: 2011 return (status); 2012 } 2013 2014 static bool 2015 nvme_opc_write_read(struct pci_nvme_softc *sc, 2016 struct nvme_command *cmd, 2017 struct pci_nvme_blockstore *nvstore, 2018 struct pci_nvme_ioreq *req, 2019 uint16_t *status) 2020 { 2021 uint64_t lba, nblocks, bytes; 2022 size_t offset; 2023 bool is_write = cmd->opc == NVME_OPC_WRITE; 2024 bool pending = false; 2025 2026 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2027 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2028 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2029 WPRINTF("%s command would exceed LBA range", __func__); 2030 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2031 goto out; 2032 } 2033 2034 bytes = nblocks << nvstore->sectsz_bits; 2035 if (bytes > NVME_MAX_DATA_SIZE) { 2036 WPRINTF("%s command would exceed MDTS", __func__); 2037 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2038 goto out; 2039 } 2040 2041 offset = lba << nvstore->sectsz_bits; 2042 2043 req->bytes = bytes; 2044 req->io_req.br_offset = lba; 2045 2046 /* PRP bits 1:0 must be zero */ 2047 cmd->prp1 &= ~0x3UL; 2048 cmd->prp2 &= ~0x3UL; 2049 2050 if (nvstore->type == NVME_STOR_RAM) { 2051 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2052 cmd->prp2, offset, bytes, is_write); 2053 } else { 2054 *status = nvme_write_read_blockif(sc, nvstore, req, 2055 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2056 2057 if (*status == NVME_NO_STATUS) 2058 pending = true; 2059 } 2060 out: 2061 if (!pending) 2062 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2063 2064 return (pending); 2065 } 2066 2067 static void 2068 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2069 { 2070 struct pci_nvme_ioreq *req = br->br_param; 2071 struct pci_nvme_softc *sc = req->sc; 2072 bool done = true; 2073 uint16_t status; 2074 2075 if (err) { 2076 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2077 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2078 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2079 } else { 2080 struct iovec *iov = req->io_req.br_iov; 2081 2082 req->prev_gpaddr++; 2083 iov += req->prev_gpaddr; 2084 2085 /* The iov_* values already include the sector size */ 2086 req->io_req.br_offset = (off_t)iov->iov_base; 2087 req->io_req.br_resid = iov->iov_len; 2088 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2089 pci_nvme_status_genc(&status, 2090 NVME_SC_INTERNAL_DEVICE_ERROR); 2091 } else 2092 done = false; 2093 } 2094 2095 if (done) { 2096 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2097 req->cid, 0, status); 2098 pci_nvme_release_ioreq(sc, req); 2099 } 2100 } 2101 2102 static bool 2103 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2104 struct nvme_command *cmd, 2105 struct pci_nvme_blockstore *nvstore, 2106 struct pci_nvme_ioreq *req, 2107 uint16_t *status) 2108 { 2109 struct nvme_dsm_range *range; 2110 uint32_t nr, r, non_zero, dr; 2111 int err; 2112 bool pending = false; 2113 2114 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2115 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2116 goto out; 2117 } 2118 2119 nr = cmd->cdw10 & 0xff; 2120 2121 /* copy locally because a range entry could straddle PRPs */ 2122 range = calloc(1, NVME_MAX_DSM_TRIM); 2123 if (range == NULL) { 2124 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2125 goto out; 2126 } 2127 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2128 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2129 2130 /* Check for invalid ranges and the number of non-zero lengths */ 2131 non_zero = 0; 2132 for (r = 0; r <= nr; r++) { 2133 if (pci_nvme_out_of_range(nvstore, 2134 range[r].starting_lba, range[r].length)) { 2135 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2136 goto out; 2137 } 2138 if (range[r].length != 0) 2139 non_zero++; 2140 } 2141 2142 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2143 size_t offset, bytes; 2144 int sectsz_bits = sc->nvstore.sectsz_bits; 2145 2146 /* 2147 * DSM calls are advisory only, and compliant controllers 2148 * may choose to take no actions (i.e. return Success). 2149 */ 2150 if (!nvstore->deallocate) { 2151 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2152 goto out; 2153 } 2154 2155 /* If all ranges have a zero length, return Success */ 2156 if (non_zero == 0) { 2157 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2158 goto out; 2159 } 2160 2161 if (req == NULL) { 2162 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2163 goto out; 2164 } 2165 2166 offset = range[0].starting_lba << sectsz_bits; 2167 bytes = range[0].length << sectsz_bits; 2168 2169 /* 2170 * If the request is for more than a single range, store 2171 * the ranges in the br_iov. Optimize for the common case 2172 * of a single range. 2173 * 2174 * Note that NVMe Number of Ranges is a zero based value 2175 */ 2176 req->io_req.br_iovcnt = 0; 2177 req->io_req.br_offset = offset; 2178 req->io_req.br_resid = bytes; 2179 2180 if (nr == 0) { 2181 req->io_req.br_callback = pci_nvme_io_done; 2182 } else { 2183 struct iovec *iov = req->io_req.br_iov; 2184 2185 for (r = 0, dr = 0; r <= nr; r++) { 2186 offset = range[r].starting_lba << sectsz_bits; 2187 bytes = range[r].length << sectsz_bits; 2188 if (bytes == 0) 2189 continue; 2190 2191 if ((nvstore->size - offset) < bytes) { 2192 pci_nvme_status_genc(status, 2193 NVME_SC_LBA_OUT_OF_RANGE); 2194 goto out; 2195 } 2196 iov[dr].iov_base = (void *)offset; 2197 iov[dr].iov_len = bytes; 2198 dr++; 2199 } 2200 req->io_req.br_callback = pci_nvme_dealloc_sm; 2201 2202 /* 2203 * Use prev_gpaddr to track the current entry and 2204 * prev_size to track the number of entries 2205 */ 2206 req->prev_gpaddr = 0; 2207 req->prev_size = dr; 2208 } 2209 2210 err = blockif_delete(nvstore->ctx, &req->io_req); 2211 if (err) 2212 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2213 else 2214 pending = true; 2215 } 2216 out: 2217 free(range); 2218 return (pending); 2219 } 2220 2221 static void 2222 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2223 { 2224 struct nvme_submission_queue *sq; 2225 uint16_t status; 2226 uint16_t sqhead; 2227 2228 /* handle all submissions up to sq->tail index */ 2229 sq = &sc->submit_queues[idx]; 2230 2231 pthread_mutex_lock(&sq->mtx); 2232 2233 sqhead = sq->head; 2234 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2235 idx, sqhead, sq->tail, sq->qbase); 2236 2237 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2238 struct nvme_command *cmd; 2239 struct pci_nvme_ioreq *req; 2240 uint32_t nsid; 2241 bool pending; 2242 2243 pending = false; 2244 req = NULL; 2245 status = 0; 2246 2247 cmd = &sq->qbase[sqhead]; 2248 sqhead = (sqhead + 1) % sq->size; 2249 2250 nsid = le32toh(cmd->nsid); 2251 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2252 pci_nvme_status_genc(&status, 2253 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2254 status |= 2255 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2256 goto complete; 2257 } 2258 2259 req = pci_nvme_get_ioreq(sc); 2260 if (req == NULL) { 2261 pci_nvme_status_genc(&status, 2262 NVME_SC_INTERNAL_DEVICE_ERROR); 2263 WPRINTF("%s: unable to allocate IO req", __func__); 2264 goto complete; 2265 } 2266 req->nvme_sq = sq; 2267 req->sqid = idx; 2268 req->opc = cmd->opc; 2269 req->cid = cmd->cid; 2270 req->nsid = cmd->nsid; 2271 2272 switch (cmd->opc) { 2273 case NVME_OPC_FLUSH: 2274 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2275 req, &status); 2276 break; 2277 case NVME_OPC_WRITE: 2278 case NVME_OPC_READ: 2279 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2280 req, &status); 2281 break; 2282 case NVME_OPC_WRITE_ZEROES: 2283 /* TODO: write zeroes 2284 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2285 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2286 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2287 break; 2288 case NVME_OPC_DATASET_MANAGEMENT: 2289 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2290 req, &status); 2291 break; 2292 default: 2293 WPRINTF("%s unhandled io command 0x%x", 2294 __func__, cmd->opc); 2295 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2296 } 2297 complete: 2298 if (!pending) { 2299 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2300 status); 2301 if (req != NULL) 2302 pci_nvme_release_ioreq(sc, req); 2303 } 2304 } 2305 2306 sq->head = sqhead; 2307 2308 pthread_mutex_unlock(&sq->mtx); 2309 } 2310 2311 static void 2312 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2313 uint64_t idx, int is_sq, uint64_t value) 2314 { 2315 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2316 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2317 2318 if (is_sq) { 2319 if (idx > sc->num_squeues) { 2320 WPRINTF("%s queue index %lu overflow from " 2321 "guest (max %u)", 2322 __func__, idx, sc->num_squeues); 2323 return; 2324 } 2325 2326 atomic_store_short(&sc->submit_queues[idx].tail, 2327 (uint16_t)value); 2328 2329 if (idx == 0) { 2330 pci_nvme_handle_admin_cmd(sc, value); 2331 } else { 2332 /* submission queue; handle new entries in SQ */ 2333 if (idx > sc->num_squeues) { 2334 WPRINTF("%s SQ index %lu overflow from " 2335 "guest (max %u)", 2336 __func__, idx, sc->num_squeues); 2337 return; 2338 } 2339 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2340 } 2341 } else { 2342 if (idx > sc->num_cqueues) { 2343 WPRINTF("%s queue index %lu overflow from " 2344 "guest (max %u)", 2345 __func__, idx, sc->num_cqueues); 2346 return; 2347 } 2348 2349 atomic_store_short(&sc->compl_queues[idx].head, 2350 (uint16_t)value); 2351 } 2352 } 2353 2354 static void 2355 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2356 { 2357 const char *s = iswrite ? "WRITE" : "READ"; 2358 2359 switch (offset) { 2360 case NVME_CR_CAP_LOW: 2361 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2362 break; 2363 case NVME_CR_CAP_HI: 2364 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2365 break; 2366 case NVME_CR_VS: 2367 DPRINTF("%s %s NVME_CR_VS", func, s); 2368 break; 2369 case NVME_CR_INTMS: 2370 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2371 break; 2372 case NVME_CR_INTMC: 2373 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2374 break; 2375 case NVME_CR_CC: 2376 DPRINTF("%s %s NVME_CR_CC", func, s); 2377 break; 2378 case NVME_CR_CSTS: 2379 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2380 break; 2381 case NVME_CR_NSSR: 2382 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2383 break; 2384 case NVME_CR_AQA: 2385 DPRINTF("%s %s NVME_CR_AQA", func, s); 2386 break; 2387 case NVME_CR_ASQ_LOW: 2388 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2389 break; 2390 case NVME_CR_ASQ_HI: 2391 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2392 break; 2393 case NVME_CR_ACQ_LOW: 2394 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2395 break; 2396 case NVME_CR_ACQ_HI: 2397 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2398 break; 2399 default: 2400 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2401 } 2402 2403 } 2404 2405 static void 2406 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2407 uint64_t offset, int size, uint64_t value) 2408 { 2409 uint32_t ccreg; 2410 2411 if (offset >= NVME_DOORBELL_OFFSET) { 2412 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2413 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2414 int is_sq = (belloffset % 8) < 4; 2415 2416 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2417 WPRINTF("guest attempted an overflow write offset " 2418 "0x%lx, val 0x%lx in %s", 2419 offset, value, __func__); 2420 return; 2421 } 2422 2423 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2424 return; 2425 } 2426 2427 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2428 offset, size, value); 2429 2430 if (size != 4) { 2431 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2432 "val 0x%lx) to bar0 in %s", 2433 size, offset, value, __func__); 2434 /* TODO: shutdown device */ 2435 return; 2436 } 2437 2438 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2439 2440 pthread_mutex_lock(&sc->mtx); 2441 2442 switch (offset) { 2443 case NVME_CR_CAP_LOW: 2444 case NVME_CR_CAP_HI: 2445 /* readonly */ 2446 break; 2447 case NVME_CR_VS: 2448 /* readonly */ 2449 break; 2450 case NVME_CR_INTMS: 2451 /* MSI-X, so ignore */ 2452 break; 2453 case NVME_CR_INTMC: 2454 /* MSI-X, so ignore */ 2455 break; 2456 case NVME_CR_CC: 2457 ccreg = (uint32_t)value; 2458 2459 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2460 "iocqes %u", 2461 __func__, 2462 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2463 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2464 NVME_CC_GET_IOCQES(ccreg)); 2465 2466 if (NVME_CC_GET_SHN(ccreg)) { 2467 /* perform shutdown - flush out data to backend */ 2468 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2469 NVME_CSTS_REG_SHST_SHIFT); 2470 sc->regs.csts |= NVME_SHST_COMPLETE << 2471 NVME_CSTS_REG_SHST_SHIFT; 2472 } 2473 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2474 if (NVME_CC_GET_EN(ccreg) == 0) 2475 /* transition 1-> causes controller reset */ 2476 pci_nvme_reset_locked(sc); 2477 else 2478 pci_nvme_init_controller(ctx, sc); 2479 } 2480 2481 /* Insert the iocqes, iosqes and en bits from the write */ 2482 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2483 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2484 if (NVME_CC_GET_EN(ccreg) == 0) { 2485 /* Insert the ams, mps and css bit fields */ 2486 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2487 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2488 sc->regs.csts &= ~NVME_CSTS_RDY; 2489 } else if (sc->pending_ios == 0) { 2490 sc->regs.csts |= NVME_CSTS_RDY; 2491 } 2492 break; 2493 case NVME_CR_CSTS: 2494 break; 2495 case NVME_CR_NSSR: 2496 /* ignore writes; don't support subsystem reset */ 2497 break; 2498 case NVME_CR_AQA: 2499 sc->regs.aqa = (uint32_t)value; 2500 break; 2501 case NVME_CR_ASQ_LOW: 2502 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2503 (0xFFFFF000 & value); 2504 break; 2505 case NVME_CR_ASQ_HI: 2506 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2507 (value << 32); 2508 break; 2509 case NVME_CR_ACQ_LOW: 2510 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2511 (0xFFFFF000 & value); 2512 break; 2513 case NVME_CR_ACQ_HI: 2514 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2515 (value << 32); 2516 break; 2517 default: 2518 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2519 __func__, offset, value, size); 2520 } 2521 pthread_mutex_unlock(&sc->mtx); 2522 } 2523 2524 static void 2525 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2526 int baridx, uint64_t offset, int size, uint64_t value) 2527 { 2528 struct pci_nvme_softc* sc = pi->pi_arg; 2529 2530 if (baridx == pci_msix_table_bar(pi) || 2531 baridx == pci_msix_pba_bar(pi)) { 2532 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2533 " value 0x%lx", baridx, offset, size, value); 2534 2535 pci_emul_msix_twrite(pi, offset, size, value); 2536 return; 2537 } 2538 2539 switch (baridx) { 2540 case 0: 2541 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2542 break; 2543 2544 default: 2545 DPRINTF("%s unknown baridx %d, val 0x%lx", 2546 __func__, baridx, value); 2547 } 2548 } 2549 2550 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 2551 uint64_t offset, int size) 2552 { 2553 uint64_t value; 2554 2555 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 2556 2557 if (offset < NVME_DOORBELL_OFFSET) { 2558 void *p = &(sc->regs); 2559 pthread_mutex_lock(&sc->mtx); 2560 memcpy(&value, (void *)((uintptr_t)p + offset), size); 2561 pthread_mutex_unlock(&sc->mtx); 2562 } else { 2563 value = 0; 2564 WPRINTF("pci_nvme: read invalid offset %ld", offset); 2565 } 2566 2567 switch (size) { 2568 case 1: 2569 value &= 0xFF; 2570 break; 2571 case 2: 2572 value &= 0xFFFF; 2573 break; 2574 case 4: 2575 value &= 0xFFFFFFFF; 2576 break; 2577 } 2578 2579 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 2580 offset, size, (uint32_t)value); 2581 2582 return (value); 2583 } 2584 2585 2586 2587 static uint64_t 2588 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 2589 uint64_t offset, int size) 2590 { 2591 struct pci_nvme_softc* sc = pi->pi_arg; 2592 2593 if (baridx == pci_msix_table_bar(pi) || 2594 baridx == pci_msix_pba_bar(pi)) { 2595 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2596 baridx, offset, size); 2597 2598 return pci_emul_msix_tread(pi, offset, size); 2599 } 2600 2601 switch (baridx) { 2602 case 0: 2603 return pci_nvme_read_bar_0(sc, offset, size); 2604 2605 default: 2606 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 2607 } 2608 2609 return (0); 2610 } 2611 2612 2613 static int 2614 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) 2615 { 2616 char bident[sizeof("XX:X:X")]; 2617 char *uopt, *xopts, *config; 2618 uint32_t sectsz; 2619 int optidx; 2620 2621 sc->max_queues = NVME_QUEUES; 2622 sc->max_qentries = NVME_MAX_QENTRIES; 2623 sc->ioslots = NVME_IOSLOTS; 2624 sc->num_squeues = sc->max_queues; 2625 sc->num_cqueues = sc->max_queues; 2626 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2627 sectsz = 0; 2628 2629 uopt = strdup(opts); 2630 optidx = 0; 2631 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 2632 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2633 for (xopts = strtok(uopt, ","); 2634 xopts != NULL; 2635 xopts = strtok(NULL, ",")) { 2636 2637 if ((config = strchr(xopts, '=')) != NULL) 2638 *config++ = '\0'; 2639 2640 if (!strcmp("maxq", xopts)) { 2641 sc->max_queues = atoi(config); 2642 } else if (!strcmp("qsz", xopts)) { 2643 sc->max_qentries = atoi(config); 2644 } else if (!strcmp("ioslots", xopts)) { 2645 sc->ioslots = atoi(config); 2646 } else if (!strcmp("sectsz", xopts)) { 2647 sectsz = atoi(config); 2648 } else if (!strcmp("ser", xopts)) { 2649 /* 2650 * This field indicates the Product Serial Number in 2651 * 7-bit ASCII, unused bytes should be space characters. 2652 * Ref: NVMe v1.3c. 2653 */ 2654 cpywithpad((char *)sc->ctrldata.sn, 2655 sizeof(sc->ctrldata.sn), config, ' '); 2656 } else if (!strcmp("ram", xopts)) { 2657 uint64_t sz = strtoull(&xopts[4], NULL, 10); 2658 2659 sc->nvstore.type = NVME_STOR_RAM; 2660 sc->nvstore.size = sz * 1024 * 1024; 2661 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 2662 sc->nvstore.sectsz = 4096; 2663 sc->nvstore.sectsz_bits = 12; 2664 if (sc->nvstore.ctx == NULL) { 2665 perror("Unable to allocate RAM"); 2666 free(uopt); 2667 return (-1); 2668 } 2669 } else if (!strcmp("eui64", xopts)) { 2670 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 2671 } else if (!strcmp("dsm", xopts)) { 2672 if (!strcmp("auto", config)) 2673 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2674 else if (!strcmp("enable", config)) 2675 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 2676 else if (!strcmp("disable", config)) 2677 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 2678 } else if (optidx == 0) { 2679 snprintf(bident, sizeof(bident), "%d:%d", 2680 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2681 sc->nvstore.ctx = blockif_open(xopts, bident); 2682 if (sc->nvstore.ctx == NULL) { 2683 perror("Could not open backing file"); 2684 free(uopt); 2685 return (-1); 2686 } 2687 sc->nvstore.type = NVME_STOR_BLOCKIF; 2688 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 2689 } else { 2690 EPRINTLN("Invalid option %s", xopts); 2691 free(uopt); 2692 return (-1); 2693 } 2694 2695 optidx++; 2696 } 2697 free(uopt); 2698 2699 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 2700 EPRINTLN("backing store not specified"); 2701 return (-1); 2702 } 2703 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 2704 sc->nvstore.sectsz = sectsz; 2705 else if (sc->nvstore.type != NVME_STOR_RAM) 2706 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 2707 for (sc->nvstore.sectsz_bits = 9; 2708 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 2709 sc->nvstore.sectsz_bits++); 2710 2711 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 2712 sc->max_queues = NVME_QUEUES; 2713 2714 if (sc->max_qentries <= 0) { 2715 EPRINTLN("Invalid qsz option"); 2716 return (-1); 2717 } 2718 if (sc->ioslots <= 0) { 2719 EPRINTLN("Invalid ioslots option"); 2720 return (-1); 2721 } 2722 2723 return (0); 2724 } 2725 2726 static int 2727 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 2728 { 2729 struct pci_nvme_softc *sc; 2730 uint32_t pci_membar_sz; 2731 int error; 2732 2733 error = 0; 2734 2735 sc = calloc(1, sizeof(struct pci_nvme_softc)); 2736 pi->pi_arg = sc; 2737 sc->nsc_pi = pi; 2738 2739 error = pci_nvme_parse_opts(sc, opts); 2740 if (error < 0) 2741 goto done; 2742 else 2743 error = 0; 2744 2745 STAILQ_INIT(&sc->ioreqs_free); 2746 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 2747 for (int i = 0; i < sc->ioslots; i++) { 2748 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 2749 } 2750 2751 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 2752 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 2753 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 2754 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 2755 pci_set_cfgdata8(pi, PCIR_PROGIF, 2756 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 2757 2758 /* 2759 * Allocate size of NVMe registers + doorbell space for all queues. 2760 * 2761 * The specification requires a minimum memory I/O window size of 16K. 2762 * The Windows driver will refuse to start a device with a smaller 2763 * window. 2764 */ 2765 pci_membar_sz = sizeof(struct nvme_registers) + 2766 2 * sizeof(uint32_t) * (sc->max_queues + 1); 2767 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 2768 2769 DPRINTF("nvme membar size: %u", pci_membar_sz); 2770 2771 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 2772 if (error) { 2773 WPRINTF("%s pci alloc mem bar failed", __func__); 2774 goto done; 2775 } 2776 2777 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2778 if (error) { 2779 WPRINTF("%s pci add msixcap failed", __func__); 2780 goto done; 2781 } 2782 2783 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2784 if (error) { 2785 WPRINTF("%s pci add Express capability failed", __func__); 2786 goto done; 2787 } 2788 2789 pthread_mutex_init(&sc->mtx, NULL); 2790 sem_init(&sc->iosemlock, 0, sc->ioslots); 2791 2792 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 2793 /* 2794 * Controller data depends on Namespace data so initialize Namespace 2795 * data first. 2796 */ 2797 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 2798 pci_nvme_init_ctrldata(sc); 2799 pci_nvme_init_logpages(sc); 2800 pci_nvme_init_features(sc); 2801 2802 pci_nvme_aer_init(sc); 2803 2804 pci_nvme_reset(sc); 2805 2806 pci_lintr_request(pi); 2807 2808 done: 2809 return (error); 2810 } 2811 2812 2813 struct pci_devemu pci_de_nvme = { 2814 .pe_emu = "nvme", 2815 .pe_init = pci_nvme_init, 2816 .pe_barwrite = pci_nvme_write, 2817 .pe_barread = pci_nvme_read 2818 }; 2819 PCI_EMUL_SET(pci_de_nvme); 2820