1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <semaphore.h> 69 #include <stdbool.h> 70 #include <stddef.h> 71 #include <stdint.h> 72 #include <stdio.h> 73 #include <stdlib.h> 74 #include <string.h> 75 76 #include <machine/atomic.h> 77 #include <machine/vmm.h> 78 #include <vmmapi.h> 79 80 #include <dev/nvme/nvme.h> 81 82 #include "bhyverun.h" 83 #include "block_if.h" 84 #include "config.h" 85 #include "debug.h" 86 #include "pci_emul.h" 87 88 89 static int nvme_debug = 0; 90 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 91 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 92 93 /* defaults; can be overridden */ 94 #define NVME_MSIX_BAR 4 95 96 #define NVME_IOSLOTS 8 97 98 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 99 #define NVME_MMIO_SPACE_MIN (1 << 14) 100 101 #define NVME_QUEUES 16 102 #define NVME_MAX_QENTRIES 2048 103 /* Memory Page size Minimum reported in CAP register */ 104 #define NVME_MPSMIN 0 105 /* MPSMIN converted to bytes */ 106 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 107 108 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 109 #define NVME_MDTS 9 110 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 111 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 112 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 113 114 /* This is a synthetic status code to indicate there is no status */ 115 #define NVME_NO_STATUS 0xffff 116 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 117 118 /* helpers */ 119 120 /* Convert a zero-based value into a one-based value */ 121 #define ONE_BASED(zero) ((zero) + 1) 122 /* Convert a one-based value into a zero-based value */ 123 #define ZERO_BASED(one) ((one) - 1) 124 125 /* Encode number of SQ's and CQ's for Set/Get Features */ 126 #define NVME_FEATURE_NUM_QUEUES(sc) \ 127 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 128 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 129 130 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 131 132 enum nvme_controller_register_offsets { 133 NVME_CR_CAP_LOW = 0x00, 134 NVME_CR_CAP_HI = 0x04, 135 NVME_CR_VS = 0x08, 136 NVME_CR_INTMS = 0x0c, 137 NVME_CR_INTMC = 0x10, 138 NVME_CR_CC = 0x14, 139 NVME_CR_CSTS = 0x1c, 140 NVME_CR_NSSR = 0x20, 141 NVME_CR_AQA = 0x24, 142 NVME_CR_ASQ_LOW = 0x28, 143 NVME_CR_ASQ_HI = 0x2c, 144 NVME_CR_ACQ_LOW = 0x30, 145 NVME_CR_ACQ_HI = 0x34, 146 }; 147 148 enum nvme_cmd_cdw11 { 149 NVME_CMD_CDW11_PC = 0x0001, 150 NVME_CMD_CDW11_IEN = 0x0002, 151 NVME_CMD_CDW11_IV = 0xFFFF0000, 152 }; 153 154 enum nvme_copy_dir { 155 NVME_COPY_TO_PRP, 156 NVME_COPY_FROM_PRP, 157 }; 158 159 #define NVME_CQ_INTEN 0x01 160 #define NVME_CQ_INTCOAL 0x02 161 162 struct nvme_completion_queue { 163 struct nvme_completion *qbase; 164 pthread_mutex_t mtx; 165 uint32_t size; 166 uint16_t tail; /* nvme progress */ 167 uint16_t head; /* guest progress */ 168 uint16_t intr_vec; 169 uint32_t intr_en; 170 }; 171 172 struct nvme_submission_queue { 173 struct nvme_command *qbase; 174 pthread_mutex_t mtx; 175 uint32_t size; 176 uint16_t head; /* nvme progress */ 177 uint16_t tail; /* guest progress */ 178 uint16_t cqid; /* completion queue id */ 179 int qpriority; 180 }; 181 182 enum nvme_storage_type { 183 NVME_STOR_BLOCKIF = 0, 184 NVME_STOR_RAM = 1, 185 }; 186 187 struct pci_nvme_blockstore { 188 enum nvme_storage_type type; 189 void *ctx; 190 uint64_t size; 191 uint32_t sectsz; 192 uint32_t sectsz_bits; 193 uint64_t eui64; 194 uint32_t deallocate:1; 195 }; 196 197 /* 198 * Calculate the number of additional page descriptors for guest IO requests 199 * based on the advertised Max Data Transfer (MDTS) and given the number of 200 * default iovec's in a struct blockif_req. 201 * 202 * Note the + 1 allows for the initial descriptor to not be page aligned. 203 */ 204 #define MDTS_PAD_SIZE \ 205 NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 206 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 207 0 208 209 struct pci_nvme_ioreq { 210 struct pci_nvme_softc *sc; 211 STAILQ_ENTRY(pci_nvme_ioreq) link; 212 struct nvme_submission_queue *nvme_sq; 213 uint16_t sqid; 214 215 /* command information */ 216 uint16_t opc; 217 uint16_t cid; 218 uint32_t nsid; 219 220 uint64_t prev_gpaddr; 221 size_t prev_size; 222 size_t bytes; 223 224 struct blockif_req io_req; 225 226 struct iovec iovpadding[MDTS_PAD_SIZE]; 227 }; 228 229 enum nvme_dsm_type { 230 /* Dataset Management bit in ONCS reflects backing storage capability */ 231 NVME_DATASET_MANAGEMENT_AUTO, 232 /* Unconditionally set Dataset Management bit in ONCS */ 233 NVME_DATASET_MANAGEMENT_ENABLE, 234 /* Unconditionally clear Dataset Management bit in ONCS */ 235 NVME_DATASET_MANAGEMENT_DISABLE, 236 }; 237 238 struct pci_nvme_softc; 239 struct nvme_feature_obj; 240 241 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 242 struct nvme_feature_obj *, 243 struct nvme_command *, 244 struct nvme_completion *); 245 246 struct nvme_feature_obj { 247 uint32_t cdw11; 248 nvme_feature_cb set; 249 nvme_feature_cb get; 250 bool namespace_specific; 251 }; 252 253 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 254 255 struct pci_nvme_aer { 256 STAILQ_ENTRY(pci_nvme_aer) link; 257 uint16_t cid; /* Command ID of the submitted AER */ 258 }; 259 260 struct pci_nvme_softc { 261 struct pci_devinst *nsc_pi; 262 263 pthread_mutex_t mtx; 264 265 struct nvme_registers regs; 266 267 struct nvme_namespace_data nsdata; 268 struct nvme_controller_data ctrldata; 269 struct nvme_error_information_entry err_log; 270 struct nvme_health_information_page health_log; 271 struct nvme_firmware_page fw_log; 272 273 struct pci_nvme_blockstore nvstore; 274 275 uint16_t max_qentries; /* max entries per queue */ 276 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 277 uint32_t num_cqueues; 278 uint32_t num_squeues; 279 bool num_q_is_set; /* Has host set Number of Queues */ 280 281 struct pci_nvme_ioreq *ioreqs; 282 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 283 uint32_t pending_ios; 284 uint32_t ioslots; 285 sem_t iosemlock; 286 287 /* 288 * Memory mapped Submission and Completion queues 289 * Each array includes both Admin and IO queues 290 */ 291 struct nvme_completion_queue *compl_queues; 292 struct nvme_submission_queue *submit_queues; 293 294 struct nvme_feature_obj feat[NVME_FID_MAX]; 295 296 enum nvme_dsm_type dataset_management; 297 298 /* Accounting for SMART data */ 299 __uint128_t read_data_units; 300 __uint128_t write_data_units; 301 __uint128_t read_commands; 302 __uint128_t write_commands; 303 uint32_t read_dunits_remainder; 304 uint32_t write_dunits_remainder; 305 306 STAILQ_HEAD(, pci_nvme_aer) aer_list; 307 uint32_t aer_count; 308 }; 309 310 311 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 312 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 313 static void pci_nvme_io_done(struct blockif_req *, int); 314 315 /* Controller Configuration utils */ 316 #define NVME_CC_GET_EN(cc) \ 317 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 318 #define NVME_CC_GET_CSS(cc) \ 319 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 320 #define NVME_CC_GET_SHN(cc) \ 321 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 322 #define NVME_CC_GET_IOSQES(cc) \ 323 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 324 #define NVME_CC_GET_IOCQES(cc) \ 325 ((cc) >> NVME_CC_REG_IOCQES_SHIFT 
	 & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

/* Copy src into dst, padding the remainder of dst with the pad character */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
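 *
 * Usage sketch (values are illustrative only, not taken from the code
 * below): a request for 4 IO Submission Queues and 4 IO Completion Queues,
 *
 *	pci_nvme_init_queues(sc, 4, 4);
 *
 * allocates 5 queue structures in each array (index 0 is reserved for the
 * Admin queue); requests above NVME_QUEUES are clamped.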
394 */ 395 static void 396 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 397 { 398 uint32_t i; 399 400 /* 401 * Allocate and initialize the Submission Queues 402 */ 403 if (nsq > NVME_QUEUES) { 404 WPRINTF("%s: clamping number of SQ from %u to %u", 405 __func__, nsq, NVME_QUEUES); 406 nsq = NVME_QUEUES; 407 } 408 409 sc->num_squeues = nsq; 410 411 sc->submit_queues = calloc(sc->num_squeues + 1, 412 sizeof(struct nvme_submission_queue)); 413 if (sc->submit_queues == NULL) { 414 WPRINTF("%s: SQ allocation failed", __func__); 415 sc->num_squeues = 0; 416 } else { 417 struct nvme_submission_queue *sq = sc->submit_queues; 418 419 for (i = 0; i < sc->num_squeues; i++) 420 pthread_mutex_init(&sq[i].mtx, NULL); 421 } 422 423 /* 424 * Allocate and initialize the Completion Queues 425 */ 426 if (ncq > NVME_QUEUES) { 427 WPRINTF("%s: clamping number of CQ from %u to %u", 428 __func__, ncq, NVME_QUEUES); 429 ncq = NVME_QUEUES; 430 } 431 432 sc->num_cqueues = ncq; 433 434 sc->compl_queues = calloc(sc->num_cqueues + 1, 435 sizeof(struct nvme_completion_queue)); 436 if (sc->compl_queues == NULL) { 437 WPRINTF("%s: CQ allocation failed", __func__); 438 sc->num_cqueues = 0; 439 } else { 440 struct nvme_completion_queue *cq = sc->compl_queues; 441 442 for (i = 0; i < sc->num_cqueues; i++) 443 pthread_mutex_init(&cq[i].mtx, NULL); 444 } 445 } 446 447 static void 448 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 449 { 450 struct nvme_controller_data *cd = &sc->ctrldata; 451 452 cd->vid = 0xFB5D; 453 cd->ssvid = 0x0000; 454 455 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 456 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 457 458 /* Num of submission commands that we can handle at a time (2^rab) */ 459 cd->rab = 4; 460 461 /* FreeBSD OUI */ 462 cd->ieee[0] = 0x58; 463 cd->ieee[1] = 0x9c; 464 cd->ieee[2] = 0xfc; 465 466 cd->mic = 0; 467 468 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 469 470 cd->ver = 0x00010300; 471 472 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 473 cd->acl = 2; 474 cd->aerl = 4; 475 476 /* Advertise 1, Read-only firmware slot */ 477 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 478 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 479 cd->lpa = 0; /* TODO: support some simple things like SMART */ 480 cd->elpe = 0; /* max error log page entries */ 481 cd->npss = 1; /* number of power states support */ 482 483 /* Warning Composite Temperature Threshold */ 484 cd->wctemp = 0x0157; 485 486 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 487 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 488 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 489 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 490 cd->nn = 1; /* number of namespaces */ 491 492 cd->oncs = 0; 493 switch (sc->dataset_management) { 494 case NVME_DATASET_MANAGEMENT_AUTO: 495 if (sc->nvstore.deallocate) 496 cd->oncs |= NVME_ONCS_DSM; 497 break; 498 case NVME_DATASET_MANAGEMENT_ENABLE: 499 cd->oncs |= NVME_ONCS_DSM; 500 break; 501 default: 502 break; 503 } 504 505 cd->fna = 0x03; 506 507 cd->power_state[0].mp = 10; 508 } 509 510 /* 511 * Calculate the CRC-16 of the given buffer 512 * See copyright attribution at top of file 513 */ 514 static uint16_t 515 crc16(uint16_t crc, const void *buffer, unsigned int len) 516 { 517 const unsigned char *cp = buffer; 518 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). 
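	 *
	 * For reference, an equivalent bit-at-a-time form of the update below
	 * (the table is the usual byte-wise optimization, using the reflected
	 * polynomial 0xA001):
	 *
	 *	crc ^= *cp++;
	 *	for (i = 0; i < 8; i++)
	 *		crc = (crc & 1) ? (crc >> 1) ^ 0xA001 : crc >> 1;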
*/ 519 static uint16_t const crc16_table[256] = { 520 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 521 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 522 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 523 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 524 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 525 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 526 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 527 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 528 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 529 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 530 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 531 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 532 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 533 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 534 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 535 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 536 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 537 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 538 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 539 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 540 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 541 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 542 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 543 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 544 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 545 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 546 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 547 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 548 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 549 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 550 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 551 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 552 }; 553 554 while (len--) 555 crc = (((crc >> 8) & 0xffU) ^ 556 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 557 return crc; 558 } 559 560 static void 561 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 562 struct nvme_namespace_data *nd, uint32_t nsid, 563 struct pci_nvme_blockstore *nvstore) 564 { 565 566 /* Get capacity and block size information from backing store */ 567 nd->nsze = nvstore->size / nvstore->sectsz; 568 nd->ncap = nd->nsze; 569 nd->nuse = nd->nsze; 570 571 if (nvstore->type == NVME_STOR_BLOCKIF) 572 nvstore->deallocate = blockif_candelete(nvstore->ctx); 573 574 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ 575 nd->flbas = 0; 576 577 /* Create an EUI-64 if user did not provide one */ 578 if (nvstore->eui64 == 0) { 579 char *data = NULL; 580 uint64_t eui64 = nvstore->eui64; 581 582 asprintf(&data, "%s%u%u%u", get_config_value("name"), 583 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 584 sc->nsc_pi->pi_func); 585 586 if (data != NULL) { 587 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 588 free(data); 589 } 590 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 591 } 592 be64enc(nd->eui64, nvstore->eui64); 593 594 /* LBA data-sz = 2^lbads */ 595 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 596 } 597 598 static void 599 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 600 { 601 602 memset(&sc->err_log, 0, sizeof(sc->err_log)); 603 memset(&sc->health_log, 0, sizeof(sc->health_log)); 604 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 605 606 /* Set read/write remainder to round up according to spec */ 607 sc->read_dunits_remainder = 999; 608 sc->write_dunits_remainder = 999; 609 610 /* Set nominal Health values checked by implementations */ 611 sc->health_log.temperature = 310; 612 sc->health_log.available_spare = 100; 613 sc->health_log.available_spare_threshold = 10; 614 } 615 616 static void 617 pci_nvme_init_features(struct pci_nvme_softc *sc) 618 { 619 620 sc->feat[0].set = nvme_feature_invalid_cb; 621 sc->feat[0].get = nvme_feature_invalid_cb; 622 623 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; 624 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; 625 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; 626 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = 627 nvme_feature_iv_config; 628 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = 629 nvme_feature_invalid_cb; 630 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = 631 nvme_feature_invalid_cb; 632 } 633 634 static void 635 pci_nvme_aer_init(struct pci_nvme_softc *sc) 636 { 637 638 STAILQ_INIT(&sc->aer_list); 639 sc->aer_count = 0; 640 } 641 642 static void 643 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 644 { 645 struct pci_nvme_aer *aer = NULL; 646 647 while (!STAILQ_EMPTY(&sc->aer_list)) { 648 aer = STAILQ_FIRST(&sc->aer_list); 649 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 650 free(aer); 651 } 652 653 pci_nvme_aer_init(sc); 654 } 655 656 static bool 657 pci_nvme_aer_available(struct pci_nvme_softc *sc) 658 { 659 660 return (!STAILQ_EMPTY(&sc->aer_list)); 661 } 662 663 static bool 664 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 665 { 666 struct nvme_controller_data *cd = &sc->ctrldata; 667 668 /* AERL is a zero based value while aer_count is one's based */ 669 return (sc->aer_count == (cd->aerl + 1)); 670 } 671 672 /* 673 * Add an Async Event Request 674 * 675 * Stores an AER to be returned later if the Controller needs to notify the 676 * host of an event. 677 * Note that while the NVMe spec doesn't require Controllers to return AER's 678 * in order, this implementation does preserve the order. 
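 *
 * Rough lifecycle sketch (how these helpers are intended to fit together):
 * nvme_opc_async_event_req() calls pci_nvme_aer_add() to remember the
 * host's Command ID, and an eventual event source would use
 * pci_nvme_aer_get() to pop the oldest entry and post a completion with
 * that saved CID.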
679 */ 680 static int 681 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 682 { 683 struct pci_nvme_aer *aer = NULL; 684 685 if (pci_nvme_aer_limit_reached(sc)) 686 return (-1); 687 688 aer = calloc(1, sizeof(struct pci_nvme_aer)); 689 if (aer == NULL) 690 return (-1); 691 692 sc->aer_count++; 693 694 /* Save the Command ID for use in the completion message */ 695 aer->cid = cid; 696 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 697 698 return (0); 699 } 700 701 /* 702 * Get an Async Event Request structure 703 * 704 * Returns a pointer to an AER previously submitted by the host or NULL if 705 * no AER's exist. Caller is responsible for freeing the returned struct. 706 */ 707 static struct pci_nvme_aer * 708 pci_nvme_aer_get(struct pci_nvme_softc *sc) 709 { 710 struct pci_nvme_aer *aer = NULL; 711 712 aer = STAILQ_FIRST(&sc->aer_list); 713 if (aer != NULL) { 714 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 715 sc->aer_count--; 716 } 717 718 return (aer); 719 } 720 721 static void 722 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 723 { 724 uint32_t i; 725 726 DPRINTF("%s", __func__); 727 728 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 729 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 730 (60 << NVME_CAP_LO_REG_TO_SHIFT); 731 732 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 733 734 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 735 736 sc->regs.cc = 0; 737 sc->regs.csts = 0; 738 739 assert(sc->submit_queues != NULL); 740 741 for (i = 0; i < sc->num_squeues + 1; i++) { 742 sc->submit_queues[i].qbase = NULL; 743 sc->submit_queues[i].size = 0; 744 sc->submit_queues[i].cqid = 0; 745 sc->submit_queues[i].tail = 0; 746 sc->submit_queues[i].head = 0; 747 } 748 749 assert(sc->compl_queues != NULL); 750 751 for (i = 0; i < sc->num_cqueues + 1; i++) { 752 sc->compl_queues[i].qbase = NULL; 753 sc->compl_queues[i].size = 0; 754 sc->compl_queues[i].tail = 0; 755 sc->compl_queues[i].head = 0; 756 } 757 758 sc->num_q_is_set = false; 759 760 pci_nvme_aer_destroy(sc); 761 } 762 763 static void 764 pci_nvme_reset(struct pci_nvme_softc *sc) 765 { 766 pthread_mutex_lock(&sc->mtx); 767 pci_nvme_reset_locked(sc); 768 pthread_mutex_unlock(&sc->mtx); 769 } 770 771 static void 772 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 773 { 774 uint16_t acqs, asqs; 775 776 DPRINTF("%s", __func__); 777 778 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 779 sc->submit_queues[0].size = asqs; 780 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 781 sizeof(struct nvme_command) * asqs); 782 783 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 784 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 785 786 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 787 NVME_AQA_REG_ACQS_MASK) + 1; 788 sc->compl_queues[0].size = acqs; 789 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 790 sizeof(struct nvme_completion) * acqs); 791 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 792 793 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 794 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 795 } 796 797 static int 798 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 799 size_t len, enum nvme_copy_dir dir) 800 { 801 uint8_t *p; 802 size_t bytes; 803 804 if (len > (8 * 1024)) { 805 return (-1); 806 } 807 808 /* Copy from the start of prp1 to the end of the physical page */ 809 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 810 bytes = MIN(bytes, len); 811 812 p = vm_map_gpa(ctx, prp1, bytes); 813 if (p == NULL) { 814 return (-1); 
815 } 816 817 if (dir == NVME_COPY_TO_PRP) 818 memcpy(p, b, bytes); 819 else 820 memcpy(b, p, bytes); 821 822 b += bytes; 823 824 len -= bytes; 825 if (len == 0) { 826 return (0); 827 } 828 829 len = MIN(len, PAGE_SIZE); 830 831 p = vm_map_gpa(ctx, prp2, len); 832 if (p == NULL) { 833 return (-1); 834 } 835 836 if (dir == NVME_COPY_TO_PRP) 837 memcpy(p, b, len); 838 else 839 memcpy(b, p, len); 840 841 return (0); 842 } 843 844 /* 845 * Write a Completion Queue Entry update 846 * 847 * Write the completion and update the doorbell value 848 */ 849 static void 850 pci_nvme_cq_update(struct pci_nvme_softc *sc, 851 struct nvme_completion_queue *cq, 852 uint32_t cdw0, 853 uint16_t cid, 854 uint16_t sqid, 855 uint16_t status) 856 { 857 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 858 struct nvme_completion *cqe; 859 860 assert(cq->qbase != NULL); 861 862 pthread_mutex_lock(&cq->mtx); 863 864 cqe = &cq->qbase[cq->tail]; 865 866 /* Flip the phase bit */ 867 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 868 869 cqe->cdw0 = cdw0; 870 cqe->sqhd = sq->head; 871 cqe->sqid = sqid; 872 cqe->cid = cid; 873 cqe->status = status; 874 875 cq->tail++; 876 if (cq->tail >= cq->size) { 877 cq->tail = 0; 878 } 879 880 pthread_mutex_unlock(&cq->mtx); 881 } 882 883 static int 884 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 885 struct nvme_completion* compl) 886 { 887 uint16_t qid = command->cdw10 & 0xffff; 888 889 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 890 if (qid == 0 || qid > sc->num_squeues || 891 (sc->submit_queues[qid].qbase == NULL)) { 892 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 893 __func__, qid, sc->num_squeues); 894 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 895 NVME_SC_INVALID_QUEUE_IDENTIFIER); 896 return (1); 897 } 898 899 sc->submit_queues[qid].qbase = NULL; 900 sc->submit_queues[qid].cqid = 0; 901 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 902 return (1); 903 } 904 905 static int 906 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 907 struct nvme_completion* compl) 908 { 909 if (command->cdw11 & NVME_CMD_CDW11_PC) { 910 uint16_t qid = command->cdw10 & 0xffff; 911 struct nvme_submission_queue *nsq; 912 913 if ((qid == 0) || (qid > sc->num_squeues) || 914 (sc->submit_queues[qid].qbase != NULL)) { 915 WPRINTF("%s queue index %u > num_squeues %u", 916 __func__, qid, sc->num_squeues); 917 pci_nvme_status_tc(&compl->status, 918 NVME_SCT_COMMAND_SPECIFIC, 919 NVME_SC_INVALID_QUEUE_IDENTIFIER); 920 return (1); 921 } 922 923 nsq = &sc->submit_queues[qid]; 924 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 925 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 926 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 927 /* 928 * Queues must specify at least two entries 929 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 930 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 931 */ 932 pci_nvme_status_tc(&compl->status, 933 NVME_SCT_COMMAND_SPECIFIC, 934 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 935 return (1); 936 } 937 nsq->head = nsq->tail = 0; 938 939 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 940 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 941 pci_nvme_status_tc(&compl->status, 942 NVME_SCT_COMMAND_SPECIFIC, 943 NVME_SC_INVALID_QUEUE_IDENTIFIER); 944 return (1); 945 } 946 947 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 948 pci_nvme_status_tc(&compl->status, 949 NVME_SCT_COMMAND_SPECIFIC, 950 
NVME_SC_COMPLETION_QUEUE_INVALID); 951 return (1); 952 } 953 954 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 955 956 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 957 sizeof(struct nvme_command) * (size_t)nsq->size); 958 959 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 960 qid, nsq->size, nsq->qbase, nsq->cqid); 961 962 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 963 964 DPRINTF("%s completed creating IOSQ qid %u", 965 __func__, qid); 966 } else { 967 /* 968 * Guest sent non-cont submission queue request. 969 * This setting is unsupported by this emulation. 970 */ 971 WPRINTF("%s unsupported non-contig (list-based) " 972 "create i/o submission queue", __func__); 973 974 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 975 } 976 return (1); 977 } 978 979 static int 980 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 981 struct nvme_completion* compl) 982 { 983 uint16_t qid = command->cdw10 & 0xffff; 984 uint16_t sqid; 985 986 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 987 if (qid == 0 || qid > sc->num_cqueues || 988 (sc->compl_queues[qid].qbase == NULL)) { 989 WPRINTF("%s queue index %u / num_cqueues %u", 990 __func__, qid, sc->num_cqueues); 991 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 992 NVME_SC_INVALID_QUEUE_IDENTIFIER); 993 return (1); 994 } 995 996 /* Deleting an Active CQ is an error */ 997 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 998 if (sc->submit_queues[sqid].cqid == qid) { 999 pci_nvme_status_tc(&compl->status, 1000 NVME_SCT_COMMAND_SPECIFIC, 1001 NVME_SC_INVALID_QUEUE_DELETION); 1002 return (1); 1003 } 1004 1005 sc->compl_queues[qid].qbase = NULL; 1006 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1007 return (1); 1008 } 1009 1010 static int 1011 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1012 struct nvme_completion* compl) 1013 { 1014 struct nvme_completion_queue *ncq; 1015 uint16_t qid = command->cdw10 & 0xffff; 1016 1017 /* Only support Physically Contiguous queues */ 1018 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1019 WPRINTF("%s unsupported non-contig (list-based) " 1020 "create i/o completion queue", 1021 __func__); 1022 1023 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1024 return (1); 1025 } 1026 1027 if ((qid == 0) || (qid > sc->num_cqueues) || 1028 (sc->compl_queues[qid].qbase != NULL)) { 1029 WPRINTF("%s queue index %u > num_cqueues %u", 1030 __func__, qid, sc->num_cqueues); 1031 pci_nvme_status_tc(&compl->status, 1032 NVME_SCT_COMMAND_SPECIFIC, 1033 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1034 return (1); 1035 } 1036 1037 ncq = &sc->compl_queues[qid]; 1038 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1039 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1040 if (ncq->intr_vec > (sc->max_queues + 1)) { 1041 pci_nvme_status_tc(&compl->status, 1042 NVME_SCT_COMMAND_SPECIFIC, 1043 NVME_SC_INVALID_INTERRUPT_VECTOR); 1044 return (1); 1045 } 1046 1047 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1048 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1049 /* 1050 * Queues must specify at least two entries 1051 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1052 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1053 */ 1054 pci_nvme_status_tc(&compl->status, 1055 NVME_SCT_COMMAND_SPECIFIC, 1056 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1057 return (1); 1058 } 1059 ncq->head = ncq->tail = 0; 1060 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1061 command->prp1, 1062 
		    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest =
vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1167 sizeof(uint32_t) * 1024); 1168 /* All bytes after the descriptor shall be zero */ 1169 bzero(dest, sizeof(uint32_t) * 1024); 1170 1171 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1172 ((uint8_t *)dest)[0] = 1; 1173 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1174 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1175 break; 1176 default: 1177 DPRINTF("%s unsupported identify command requested 0x%x", 1178 __func__, command->cdw10 & 0xFF); 1179 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1180 break; 1181 } 1182 1183 compl->status = status; 1184 return (1); 1185 } 1186 1187 static const char * 1188 nvme_fid_to_name(uint8_t fid) 1189 { 1190 const char *name; 1191 1192 switch (fid) { 1193 case NVME_FEAT_ARBITRATION: 1194 name = "Arbitration"; 1195 break; 1196 case NVME_FEAT_POWER_MANAGEMENT: 1197 name = "Power Management"; 1198 break; 1199 case NVME_FEAT_LBA_RANGE_TYPE: 1200 name = "LBA Range Type"; 1201 break; 1202 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1203 name = "Temperature Threshold"; 1204 break; 1205 case NVME_FEAT_ERROR_RECOVERY: 1206 name = "Error Recovery"; 1207 break; 1208 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1209 name = "Volatile Write Cache"; 1210 break; 1211 case NVME_FEAT_NUMBER_OF_QUEUES: 1212 name = "Number of Queues"; 1213 break; 1214 case NVME_FEAT_INTERRUPT_COALESCING: 1215 name = "Interrupt Coalescing"; 1216 break; 1217 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1218 name = "Interrupt Vector Configuration"; 1219 break; 1220 case NVME_FEAT_WRITE_ATOMICITY: 1221 name = "Write Atomicity Normal"; 1222 break; 1223 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1224 name = "Asynchronous Event Configuration"; 1225 break; 1226 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1227 name = "Autonomous Power State Transition"; 1228 break; 1229 case NVME_FEAT_HOST_MEMORY_BUFFER: 1230 name = "Host Memory Buffer"; 1231 break; 1232 case NVME_FEAT_TIMESTAMP: 1233 name = "Timestamp"; 1234 break; 1235 case NVME_FEAT_KEEP_ALIVE_TIMER: 1236 name = "Keep Alive Timer"; 1237 break; 1238 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1239 name = "Host Controlled Thermal Management"; 1240 break; 1241 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1242 name = "Non-Operation Power State Config"; 1243 break; 1244 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1245 name = "Read Recovery Level Config"; 1246 break; 1247 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1248 name = "Predictable Latency Mode Config"; 1249 break; 1250 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1251 name = "Predictable Latency Mode Window"; 1252 break; 1253 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1254 name = "LBA Status Information Report Interval"; 1255 break; 1256 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1257 name = "Host Behavior Support"; 1258 break; 1259 case NVME_FEAT_SANITIZE_CONFIG: 1260 name = "Sanitize Config"; 1261 break; 1262 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1263 name = "Endurance Group Event Configuration"; 1264 break; 1265 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1266 name = "Software Progress Marker"; 1267 break; 1268 case NVME_FEAT_HOST_IDENTIFIER: 1269 name = "Host Identifier"; 1270 break; 1271 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1272 name = "Reservation Notification Mask"; 1273 break; 1274 case NVME_FEAT_RESERVATION_PERSISTENCE: 1275 name = "Reservation Persistence"; 1276 break; 1277 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1278 name = "Namespace Write Protection Config"; 1279 break; 1280 default: 
1281 name = "Unknown"; 1282 break; 1283 } 1284 1285 return (name); 1286 } 1287 1288 static void 1289 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1290 struct nvme_feature_obj *feat, 1291 struct nvme_command *command, 1292 struct nvme_completion *compl) 1293 { 1294 1295 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1296 } 1297 1298 static void 1299 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1300 struct nvme_feature_obj *feat, 1301 struct nvme_command *command, 1302 struct nvme_completion *compl) 1303 { 1304 uint32_t i; 1305 uint32_t cdw11 = command->cdw11; 1306 uint16_t iv; 1307 bool cd; 1308 1309 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1310 1311 iv = cdw11 & 0xffff; 1312 cd = cdw11 & (1 << 16); 1313 1314 if (iv > (sc->max_queues + 1)) { 1315 return; 1316 } 1317 1318 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1319 if ((iv == 0) && !cd) 1320 return; 1321 1322 /* Requested Interrupt Vector must be used by a CQ */ 1323 for (i = 0; i < sc->num_cqueues + 1; i++) { 1324 if (sc->compl_queues[i].intr_vec == iv) { 1325 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1326 } 1327 } 1328 1329 } 1330 1331 static void 1332 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1333 struct nvme_feature_obj *feat, 1334 struct nvme_command *command, 1335 struct nvme_completion *compl) 1336 { 1337 uint16_t nqr; /* Number of Queues Requested */ 1338 1339 if (sc->num_q_is_set) { 1340 WPRINTF("%s: Number of Queues already set", __func__); 1341 pci_nvme_status_genc(&compl->status, 1342 NVME_SC_COMMAND_SEQUENCE_ERROR); 1343 return; 1344 } 1345 1346 nqr = command->cdw11 & 0xFFFF; 1347 if (nqr == 0xffff) { 1348 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1349 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1350 return; 1351 } 1352 1353 sc->num_squeues = ONE_BASED(nqr); 1354 if (sc->num_squeues > sc->max_queues) { 1355 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1356 sc->max_queues); 1357 sc->num_squeues = sc->max_queues; 1358 } 1359 1360 nqr = (command->cdw11 >> 16) & 0xFFFF; 1361 if (nqr == 0xffff) { 1362 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1363 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1364 return; 1365 } 1366 1367 sc->num_cqueues = ONE_BASED(nqr); 1368 if (sc->num_cqueues > sc->max_queues) { 1369 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1370 sc->max_queues); 1371 sc->num_cqueues = sc->max_queues; 1372 } 1373 1374 /* Patch the command value which will be saved on callback's return */ 1375 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1376 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1377 1378 sc->num_q_is_set = true; 1379 } 1380 1381 static int 1382 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1383 struct nvme_completion *compl) 1384 { 1385 struct nvme_feature_obj *feat; 1386 uint32_t nsid = command->nsid; 1387 uint8_t fid = command->cdw10 & 0xFF; 1388 1389 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1390 1391 if (fid >= NVME_FID_MAX) { 1392 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1393 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1394 return (1); 1395 } 1396 feat = &sc->feat[fid]; 1397 1398 if (!feat->namespace_specific && 1399 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1400 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1401 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1402 return (1); 1403 } 1404 1405 compl->cdw0 = 0; 1406 
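	/*
	 * Assume Success here; a feature-specific .set callback below may
	 * overwrite compl->status, and cdw11 is only latched on Success.
	 */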
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;

	return (0);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
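	/*
	 * Sketch of the flow: the AER command is not completed now. Its CID
	 * is queued via pci_nvme_aer_add() and the completion entry is only
	 * written when an asynchronous event is actually posted, which is
	 * why the status is set to NVME_NO_STATUS below.
	 */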
1531 1532 /* Don't exceed the Async Event Request Limit (AERL). */ 1533 if (pci_nvme_aer_limit_reached(sc)) { 1534 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1535 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1536 return (1); 1537 } 1538 1539 if (pci_nvme_aer_add(sc, command->cid)) { 1540 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1541 NVME_SC_INTERNAL_DEVICE_ERROR); 1542 return (1); 1543 } 1544 1545 /* 1546 * Raise events when they happen based on the Set Features cmd. 1547 * These events happen async, so only set completion successful if 1548 * there is an event reflective of the request to get event. 1549 */ 1550 compl->status = NVME_NO_STATUS; 1551 1552 return (0); 1553 } 1554 1555 static void 1556 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1557 { 1558 struct nvme_completion compl; 1559 struct nvme_command *cmd; 1560 struct nvme_submission_queue *sq; 1561 struct nvme_completion_queue *cq; 1562 uint16_t sqhead; 1563 1564 DPRINTF("%s index %u", __func__, (uint32_t)value); 1565 1566 sq = &sc->submit_queues[0]; 1567 cq = &sc->compl_queues[0]; 1568 1569 pthread_mutex_lock(&sq->mtx); 1570 1571 sqhead = sq->head; 1572 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 1573 1574 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1575 cmd = &(sq->qbase)[sqhead]; 1576 compl.cdw0 = 0; 1577 compl.status = 0; 1578 1579 switch (cmd->opc) { 1580 case NVME_OPC_DELETE_IO_SQ: 1581 DPRINTF("%s command DELETE_IO_SQ", __func__); 1582 nvme_opc_delete_io_sq(sc, cmd, &compl); 1583 break; 1584 case NVME_OPC_CREATE_IO_SQ: 1585 DPRINTF("%s command CREATE_IO_SQ", __func__); 1586 nvme_opc_create_io_sq(sc, cmd, &compl); 1587 break; 1588 case NVME_OPC_DELETE_IO_CQ: 1589 DPRINTF("%s command DELETE_IO_CQ", __func__); 1590 nvme_opc_delete_io_cq(sc, cmd, &compl); 1591 break; 1592 case NVME_OPC_CREATE_IO_CQ: 1593 DPRINTF("%s command CREATE_IO_CQ", __func__); 1594 nvme_opc_create_io_cq(sc, cmd, &compl); 1595 break; 1596 case NVME_OPC_GET_LOG_PAGE: 1597 DPRINTF("%s command GET_LOG_PAGE", __func__); 1598 nvme_opc_get_log_page(sc, cmd, &compl); 1599 break; 1600 case NVME_OPC_IDENTIFY: 1601 DPRINTF("%s command IDENTIFY", __func__); 1602 nvme_opc_identify(sc, cmd, &compl); 1603 break; 1604 case NVME_OPC_ABORT: 1605 DPRINTF("%s command ABORT", __func__); 1606 nvme_opc_abort(sc, cmd, &compl); 1607 break; 1608 case NVME_OPC_SET_FEATURES: 1609 DPRINTF("%s command SET_FEATURES", __func__); 1610 nvme_opc_set_features(sc, cmd, &compl); 1611 break; 1612 case NVME_OPC_GET_FEATURES: 1613 DPRINTF("%s command GET_FEATURES", __func__); 1614 nvme_opc_get_features(sc, cmd, &compl); 1615 break; 1616 case NVME_OPC_FIRMWARE_ACTIVATE: 1617 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 1618 pci_nvme_status_tc(&compl.status, 1619 NVME_SCT_COMMAND_SPECIFIC, 1620 NVME_SC_INVALID_FIRMWARE_SLOT); 1621 break; 1622 case NVME_OPC_ASYNC_EVENT_REQUEST: 1623 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 1624 nvme_opc_async_event_req(sc, cmd, &compl); 1625 break; 1626 case NVME_OPC_FORMAT_NVM: 1627 DPRINTF("%s command FORMAT_NVM", __func__); 1628 if ((sc->ctrldata.oacs & 1629 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 1630 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1631 } 1632 compl.status = NVME_NO_STATUS; 1633 nvme_opc_format_nvm(sc, cmd, &compl); 1634 break; 1635 default: 1636 DPRINTF("0x%x command is not implemented", 1637 cmd->opc); 1638 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1639 } 1640 sqhead = (sqhead + 1) % sq->size; 1641 1642 if 
(NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}

/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks and is rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}

/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
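 *
 * Worked example (illustrative): with 512 byte sectors, sectsz_bits is 9,
 * so any slba with a bit set in its upper 9 bits (slba >> 55 != 0) would
 * overflow "slba << 9" and is rejected before the shift; the remaining
 * check then compares nlb, converted to bytes, against the space left
 * after offset.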
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
	size_t offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req == NULL)
		return (-1);

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
				     req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
				     gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
		 NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    0,		/* CDW0 */
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

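	/*
	 * pending_ios counts in-flight requests; pci_nvme_release_ioreq()
	 * decrements it and re-asserts CSTS.RDY only once all outstanding IO
	 * has drained after the controller is (re)enabled.
	 */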
sc->pending_ios++; 1842 1843 pthread_mutex_unlock(&sc->mtx); 1844 1845 req->io_req.br_iovcnt = 0; 1846 req->io_req.br_offset = 0; 1847 req->io_req.br_resid = 0; 1848 req->io_req.br_param = req; 1849 req->prev_gpaddr = 0; 1850 req->prev_size = 0; 1851 1852 return req; 1853 } 1854 1855 static void 1856 pci_nvme_io_done(struct blockif_req *br, int err) 1857 { 1858 struct pci_nvme_ioreq *req = br->br_param; 1859 struct nvme_submission_queue *sq = req->nvme_sq; 1860 uint16_t code, status; 1861 1862 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 1863 1864 /* TODO return correct error */ 1865 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1866 pci_nvme_status_genc(&status, code); 1867 1868 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 1869 pci_nvme_stats_write_read_update(req->sc, req->opc, 1870 req->bytes, status); 1871 pci_nvme_release_ioreq(req->sc, req); 1872 } 1873 1874 /* 1875 * Implements the Flush command. The specification states: 1876 * If a volatile write cache is not present, Flush commands complete 1877 * successfully and have no effect 1878 * in the description of the Volatile Write Cache (VWC) field of the Identify 1879 * Controller data. Therefore, set status to Success if the command is 1880 * not supported (i.e. RAM or as indicated by the blockif). 1881 */ 1882 static bool 1883 nvme_opc_flush(struct pci_nvme_softc *sc, 1884 struct nvme_command *cmd, 1885 struct pci_nvme_blockstore *nvstore, 1886 struct pci_nvme_ioreq *req, 1887 uint16_t *status) 1888 { 1889 bool pending = false; 1890 1891 if (nvstore->type == NVME_STOR_RAM) { 1892 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1893 } else { 1894 int err; 1895 1896 req->io_req.br_callback = pci_nvme_io_done; 1897 1898 err = blockif_flush(nvstore->ctx, &req->io_req); 1899 switch (err) { 1900 case 0: 1901 pending = true; 1902 break; 1903 case EOPNOTSUPP: 1904 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1905 break; 1906 default: 1907 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 1908 } 1909 } 1910 1911 return (pending); 1912 } 1913 1914 static uint16_t 1915 nvme_write_read_ram(struct pci_nvme_softc *sc, 1916 struct pci_nvme_blockstore *nvstore, 1917 uint64_t prp1, uint64_t prp2, 1918 size_t offset, uint64_t bytes, 1919 bool is_write) 1920 { 1921 uint8_t *buf = nvstore->ctx; 1922 enum nvme_copy_dir dir; 1923 uint16_t status; 1924 1925 if (is_write) 1926 dir = NVME_COPY_TO_PRP; 1927 else 1928 dir = NVME_COPY_FROM_PRP; 1929 1930 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 1931 buf + offset, bytes, dir)) 1932 pci_nvme_status_genc(&status, 1933 NVME_SC_DATA_TRANSFER_ERROR); 1934 else 1935 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1936 1937 return (status); 1938 } 1939 1940 static uint16_t 1941 nvme_write_read_blockif(struct pci_nvme_softc *sc, 1942 struct pci_nvme_blockstore *nvstore, 1943 struct pci_nvme_ioreq *req, 1944 uint64_t prp1, uint64_t prp2, 1945 size_t offset, uint64_t bytes, 1946 bool is_write) 1947 { 1948 uint64_t size; 1949 int err; 1950 uint16_t status = NVME_NO_STATUS; 1951 1952 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 1953 if (pci_nvme_append_iov_req(sc, req, prp1, 1954 size, is_write, offset)) { 1955 pci_nvme_status_genc(&status, 1956 NVME_SC_DATA_TRANSFER_ERROR); 1957 goto out; 1958 } 1959 1960 offset += size; 1961 bytes -= size; 1962 1963 if (bytes == 0) { 1964 ; 1965 } else if (bytes <= PAGE_SIZE) { 1966 size = bytes; 1967 if (pci_nvme_append_iov_req(sc, req, prp2, 1968 size, is_write, offset)) { 1969 
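			/* Could not add the PRP2 segment to the request (e.g. iovec limit reached) */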
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes -= size;

			prp_list++;
		}
	}
	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}

static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	/* Compute bytes up front so the stats update at "out" sees a
	 * defined value even when the range checks below fail. */
	bytes = nblocks << nvstore->sectsz_bits;

	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}

static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}

static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range = NULL;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
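		 *
		 * For example (illustrative): a command with Number of
		 * Ranges set to 1 (i.e. two ranges) stores both ranges as
		 * br_iov entries, with the byte offset in iov_base and the
		 * byte count in iov_len; pci_nvme_dealloc_sm() then chains
		 * the blockif_delete() calls, one range at a time.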
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}

static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

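	/*
	 * Descriptive note (added): sqhead is only published back to
	 * sq->head here, under sq->mtx, after every fetched entry has either
	 * completed or been handed to blockif as a pending request.
	 */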
	pthread_mutex_unlock(&sq->mtx);
}

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}

static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
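
	/*
	 * Illustrative example of the doorbell decode above: a 4-byte write
	 * at BAR0 offset NVME_DOORBELL_OFFSET + 0x8 has belloffset 8, so
	 * idx = 1 and is_sq = 1, i.e. the SQ1 tail doorbell; belloffset 0xc
	 * gives idx = 1 and is_sq = 0, the CQ1 head doorbell. Non-doorbell
	 * writes fall through to the register switch below, which updates
	 * the shadow copy in sc->regs under sc->mtx.
	 */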

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    "value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
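		/*
		 * Descriptive note (added): the register file is copied as
		 * raw bytes, so 1-, 2-, and 4-byte reads at sub-register
		 * offsets work; the switch below masks the result down to
		 * the access size.
		 */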
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}


static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}

static int
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
{
	char bident[sizeof("XX:X:X")];
	const char *value;
	uint32_t sectsz;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

	value = get_config_value_node(nvl, "maxq");
	if (value != NULL)
		sc->max_queues = atoi(value);
	value = get_config_value_node(nvl, "qsz");
	if (value != NULL) {
		sc->max_qentries = atoi(value);
		if (sc->max_qentries <= 0) {
			EPRINTLN("nvme: Invalid qsz option %d",
			    sc->max_qentries);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "ioslots");
	if (value != NULL) {
		sc->ioslots = atoi(value);
		if (sc->ioslots <= 0) {
			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "sectsz");
	if (value != NULL)
		sectsz = atoi(value);
	value = get_config_value_node(nvl, "ser");
	if (value != NULL) {
		/*
		 * This field indicates the Product Serial Number in
		 * 7-bit ASCII, unused bytes should be space characters.
		 * Ref: NVMe v1.3c.
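		 *
		 * For example (illustrative), ser=FOOBAR123 is stored as
		 * "FOOBAR123" followed by eleven space characters in the
		 * 20-byte sn field.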
		 */
		cpywithpad((char *)sc->ctrldata.sn,
		    sizeof(sc->ctrldata.sn), value, ' ');
	}
	value = get_config_value_node(nvl, "eui64");
	if (value != NULL)
		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
	value = get_config_value_node(nvl, "dsm");
	if (value != NULL) {
		if (strcmp(value, "auto") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
		else if (strcmp(value, "enable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
		else if (strcmp(value, "disable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
	}

	value = get_config_value_node(nvl, "ram");
	if (value != NULL) {
		uint64_t sz = strtoull(value, NULL, 10);

		sc->nvstore.type = NVME_STOR_RAM;
		sc->nvstore.size = sz * 1024 * 1024;
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		sc->nvstore.sectsz = 4096;
		sc->nvstore.sectsz_bits = 12;
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Unable to allocate RAM");
			return (-1);
		}
	} else {
		snprintf(bident, sizeof(bident), "%d:%d",
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
		sc->nvstore.ctx = blockif_open(nvl, bident);
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Could not open backing file: %s",
			    strerror(errno));
			return (-1);
		}
		sc->nvstore.type = NVME_STOR_BLOCKIF;
		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
	}

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	return (0);
}

static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_config(sc, nvl);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.
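	 *
	 * For example, with the default of 16 queue pairs the doorbell area
	 * adds only 2 * sizeof(uint32_t) * 17 = 136 bytes, so the MAX()
	 * below rounds the BAR up to NVME_MMIO_SPACE_MIN (16 KiB).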
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}


struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_legacy_config = blockif_legacy_config,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);
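
/*
 * Example configuration (illustrative) using the options handled by
 * pci_nvme_parse_config() above, e.g. via the bhyve -s option:
 *
 *   -s 3,nvme,/dev/zvol/pool/vol0,maxq=8,qsz=1024,ioslots=16,ser=BHYVE001,dsm=enable
 *   -s 3,nvme,ram=512
 *
 * The device registers itself under the "nvme" emulation name via
 * PCI_EMUL_SET() above; the backing device path is handled through
 * blockif_legacy_config()/blockif_open().
 */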