/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2019 Western Digital Corporation.
 * Copyright 2020 Racktop Systems.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver was written to conform to version 1.2.1 of the NVMe
 * specification. It may work with newer versions, but that is completely
 * untested and disabled by default.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 *
 * Interrupt Usage:
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array in steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an
 * interrupt vector and will post them to a taskq for completion processing.
 *
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding
 * up to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that
 * can hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a submission queue and the shared state is protected by
 * nq_mutex; the completion queue is protected by ncq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array.
 * The array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the
 * queue wraps around in that time a submission may find the next array slot
 * still in use by a long-running command. In this case the array is
 * sequentially searched for the next free slot. The length of the command
 * array is the same as the configured queue length. Queue overrun is
 * prevented by the semaphore, so a command submission may block if the queue
 * is full.
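 *
 * As a condensed sketch of the CID selection described above (this mirrors
 * the slot-search loop in nvme_submit_cmd_common() further down; the DMA sync
 * and the doorbell write are omitted here):
 *
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	qp->nq_cmd[qp->nq_next_cmd] = cmd;
 *	qp->nq_active_cmds++;
 *	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;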
 *
 *
 * Polled I/O Support:
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts
 * are turned off while dumping, the driver will just submit a command in the
 * regular way and then repeatedly attempt a command retrieval until it gets
 * the command back.
 *
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev
 * interface for each namespace found. Namespaces can have various attributes
 * to support protection information. The driver does not support any of these
 * attributes and ignores namespaces that have them.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64, if present, to generate the devid and
 * passes it to blkdev for use in the device node names. As this is currently
 * untested, namespaces with an EUI64 are ignored by default.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces using a
 * 32bit minor node number.
 *
 *
 * Minor nodes:
 *
 * For each NVMe device the driver exposes one minor node for the controller
 * and one minor node for each namespace. The only operations supported by
 * those minor nodes are open(9E), close(9E), and ioctl(9E). This serves as
 * the interface for the nvmeadm(1M) utility.
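 *
 * For illustration, the NVME_MINOR* macros defined further down pack the
 * instance number and the namespace ID into a single minor number roughly
 * like this (with nsid 0 presumably reserved for the controller node):
 *
 *	minor = (inst << NVME_MINOR_INST_SHIFT) | nsid;
 *	inst  = minor >> NVME_MINOR_INST_SHIFT;
 *	nsid  = minor & ((1 << NVME_MINOR_INST_SHIFT) - 1);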
 *
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted
 * to an I/O queue. The queue is selected by taking the CPU id modulo the
 * number of queues. There is currently no timeout handling of I/O commands.
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace EUI64, if present, or
 * composed using the device vendor ID, model number, serial number, and the
 * namespace ID.
 *
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off;
 * all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous
 * event requests are posted to allow quicker reception of error information.
 * When an asynchronous event is posted by the hardware the driver will parse
 * the error status fields and log information or fault the device, depending
 * on the severity of the asynchronous event. The asynchronous event request is
 * then reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. The original command
 * timeout is also applied to the abort command. If the abort times out as
 * well, the driver assumes the device is dead, fences it off, and calls FMA to
 * retire it. In all other cases the aborted command should return immediately
 * with a status indicating it was aborted, and the driver will wait
 * indefinitely for that to happen. No timeout handling of normal I/O commands
 * is presently done.
 *
 * Any command that times out due to the controller dropping dead will be put
 * on the nvme_lost_cmds list if it references DMA memory. This prevents the
 * DMA memory from being reused by the system and later being written to by a
 * "dead" NVMe controller.
 *
 *
 * Locking:
 *
 * Each queue pair has a nq_mutex and an ncq_mutex. The nq_mutex must be held
 * when accessing shared state and submission queue registers; the ncq_mutex
 * is held when accessing completion queue state and registers.
 * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
 * mutexes themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 * More than one nc_mutex may only be held when aborting commands. In this
 * case, the nc_mutex of the command to be aborted must be held across the call
 * to nvme_abort_cmd() to prevent the command from completing while the abort
 * is in progress.
 *
 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be acquired
 * first. More than one nq_mutex is never held by a single thread. The
 * ncq_mutex is only held by nvme_retrieve_cmd() and nvme_process_iocq().
 * nvme_process_iocq() is only called from the interrupt thread and
 * nvme_retrieve_cmd() during polled I/O, so the mutex is non-contentious but
 * is required for implementation completeness and safety.
 *
 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 * and the exclusive-open flag nm_oexcl.
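 *
 * As an example of the nq_mutex/ncq_mutex ordering, nvme_get_completed()
 * below runs with ncq_mutex held by its caller and then briefly takes the
 * qpair's nq_mutex to unqueue the completed command:
 *
 *	ASSERT(mutex_owned(&cq->ncq_mutex));
 *	...
 *	mutex_enter(&qp->nq_mutex);
 *	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
 *	mutex_exit(&qp->nq_mutex);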
 *
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided, which is used to send a shutdown notification to the
 * device.
 *
 *
 * DDI UFM Support:
 *
 * The driver supports the DDI UFM framework for reporting information about
 * the device's firmware image and slot configuration. This data can be
 * queried by userland software via ioctls to the ufm driver. For more
 * information, see ddi_ufm(9E).
 *
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of
 * the driver's operation:
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   major versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not treat any vendor
 *   specific command status as a fatal error leading to the device being
 *   faulted
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 * - max-submission-queues: the maximum number of I/O submission queues
 * - max-completion-queues: the maximum number of I/O completion queues,
 *   can be less than max-submission-queues, in which case the completion
 *   queues are shared
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/ddi_ufm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Assertions to make sure that we've properly captured various aspects of the
 * packed structures and haven't broken them during updates.
288 */ 289 CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000); 290 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256); 291 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512); 292 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768); 293 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792); 294 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048); 295 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072); 296 297 CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000); 298 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32); 299 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104); 300 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128); 301 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384); 302 303 CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000); 304 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32); 305 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64); 306 307 308 /* NVMe spec version supported */ 309 static const int nvme_version_major = 1; 310 311 /* tunable for admin command timeout in seconds, default is 1s */ 312 int nvme_admin_cmd_timeout = 1; 313 314 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */ 315 int nvme_format_cmd_timeout = 600; 316 317 /* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */ 318 int nvme_commit_save_cmd_timeout = 15; 319 320 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t); 321 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t); 322 static int nvme_quiesce(dev_info_t *); 323 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *); 324 static int nvme_setup_interrupts(nvme_t *, int, int); 325 static void nvme_release_interrupts(nvme_t *); 326 static uint_t nvme_intr(caddr_t, caddr_t); 327 328 static void nvme_shutdown(nvme_t *, int, boolean_t); 329 static boolean_t nvme_reset(nvme_t *, boolean_t); 330 static int nvme_init(nvme_t *); 331 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int); 332 static void nvme_free_cmd(nvme_cmd_t *); 333 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t, 334 bd_xfer_t *); 335 static void nvme_admin_cmd(nvme_cmd_t *, int); 336 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *); 337 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *); 338 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *); 339 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int); 340 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *); 341 static void nvme_wait_cmd(nvme_cmd_t *, uint_t); 342 static void nvme_wakeup_cmd(void *); 343 static void nvme_async_event_task(void *); 344 345 static int nvme_check_unknown_cmd_status(nvme_cmd_t *); 346 static int nvme_check_vendor_cmd_status(nvme_cmd_t *); 347 static int nvme_check_integrity_cmd_status(nvme_cmd_t *); 348 static int nvme_check_specific_cmd_status(nvme_cmd_t *); 349 static int nvme_check_generic_cmd_status(nvme_cmd_t *); 350 static inline int nvme_check_cmd_status(nvme_cmd_t *); 351 352 static int nvme_abort_cmd(nvme_cmd_t *, uint_t); 353 static void nvme_async_event(nvme_t *); 354 static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t, 355 uint8_t, boolean_t, uint8_t); 356 static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t, 357 ...); 358 static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **); 359 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t, 360 uint32_t *); 361 static int nvme_get_features(nvme_t *, 
boolean_t, uint32_t, uint8_t, uint32_t *, 362 void **, size_t *); 363 static int nvme_write_cache_set(nvme_t *, boolean_t); 364 static int nvme_set_nqueues(nvme_t *); 365 366 static void nvme_free_dma(nvme_dma_t *); 367 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *, 368 nvme_dma_t **); 369 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t, 370 nvme_dma_t **); 371 static void nvme_free_qpair(nvme_qpair_t *); 372 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t); 373 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t); 374 375 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t); 376 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t); 377 static inline uint64_t nvme_get64(nvme_t *, uintptr_t); 378 static inline uint32_t nvme_get32(nvme_t *, uintptr_t); 379 380 static boolean_t nvme_check_regs_hdl(nvme_t *); 381 static boolean_t nvme_check_dma_hdl(nvme_dma_t *); 382 383 static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *); 384 385 static void nvme_bd_xfer_done(void *); 386 static void nvme_bd_driveinfo(void *, bd_drive_t *); 387 static int nvme_bd_mediainfo(void *, bd_media_t *); 388 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t); 389 static int nvme_bd_read(void *, bd_xfer_t *); 390 static int nvme_bd_write(void *, bd_xfer_t *); 391 static int nvme_bd_sync(void *, bd_xfer_t *); 392 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *); 393 394 static int nvme_prp_dma_constructor(void *, void *, int); 395 static void nvme_prp_dma_destructor(void *, void *); 396 397 static void nvme_prepare_devid(nvme_t *, uint32_t); 398 399 /* DDI UFM callbacks */ 400 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t, 401 ddi_ufm_image_t *); 402 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t, 403 ddi_ufm_slot_t *); 404 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *); 405 406 static int nvme_open(dev_t *, int, int, cred_t *); 407 static int nvme_close(dev_t, int, int, cred_t *); 408 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); 409 410 static ddi_ufm_ops_t nvme_ufm_ops = { 411 NULL, 412 nvme_ufm_fill_image, 413 nvme_ufm_fill_slot, 414 nvme_ufm_getcaps 415 }; 416 417 #define NVME_MINOR_INST_SHIFT 9 418 #define NVME_MINOR(inst, nsid) (((inst) << NVME_MINOR_INST_SHIFT) | (nsid)) 419 #define NVME_MINOR_INST(minor) ((minor) >> NVME_MINOR_INST_SHIFT) 420 #define NVME_MINOR_NSID(minor) ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1)) 421 #define NVME_MINOR_MAX (NVME_MINOR(1, 0) - 2) 422 423 static void *nvme_state; 424 static kmem_cache_t *nvme_cmd_cache; 425 426 /* 427 * DMA attributes for queue DMA memory 428 * 429 * Queue DMA memory must be page aligned. The maximum length of a queue is 430 * 65536 entries, and an entry can be 64 bytes long. 
431 */ 432 static ddi_dma_attr_t nvme_queue_dma_attr = { 433 .dma_attr_version = DMA_ATTR_V0, 434 .dma_attr_addr_lo = 0, 435 .dma_attr_addr_hi = 0xffffffffffffffffULL, 436 .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1, 437 .dma_attr_align = 0x1000, 438 .dma_attr_burstsizes = 0x7ff, 439 .dma_attr_minxfer = 0x1000, 440 .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t), 441 .dma_attr_seg = 0xffffffffffffffffULL, 442 .dma_attr_sgllen = 1, 443 .dma_attr_granular = 1, 444 .dma_attr_flags = 0, 445 }; 446 447 /* 448 * DMA attributes for transfers using Physical Region Page (PRP) entries 449 * 450 * A PRP entry describes one page of DMA memory using the page size specified 451 * in the controller configuration's memory page size register (CC.MPS). It uses 452 * a 64bit base address aligned to this page size. There is no limitation on 453 * chaining PRPs together for arbitrarily large DMA transfers. 454 */ 455 static ddi_dma_attr_t nvme_prp_dma_attr = { 456 .dma_attr_version = DMA_ATTR_V0, 457 .dma_attr_addr_lo = 0, 458 .dma_attr_addr_hi = 0xffffffffffffffffULL, 459 .dma_attr_count_max = 0xfff, 460 .dma_attr_align = 0x1000, 461 .dma_attr_burstsizes = 0x7ff, 462 .dma_attr_minxfer = 0x1000, 463 .dma_attr_maxxfer = 0x1000, 464 .dma_attr_seg = 0xfff, 465 .dma_attr_sgllen = -1, 466 .dma_attr_granular = 1, 467 .dma_attr_flags = 0, 468 }; 469 470 /* 471 * DMA attributes for transfers using scatter/gather lists 472 * 473 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a 474 * 32bit length field. SGL Segment and SGL Last Segment entries require the 475 * length to be a multiple of 16 bytes. 476 */ 477 static ddi_dma_attr_t nvme_sgl_dma_attr = { 478 .dma_attr_version = DMA_ATTR_V0, 479 .dma_attr_addr_lo = 0, 480 .dma_attr_addr_hi = 0xffffffffffffffffULL, 481 .dma_attr_count_max = 0xffffffffUL, 482 .dma_attr_align = 1, 483 .dma_attr_burstsizes = 0x7ff, 484 .dma_attr_minxfer = 0x10, 485 .dma_attr_maxxfer = 0xfffffffffULL, 486 .dma_attr_seg = 0xffffffffffffffffULL, 487 .dma_attr_sgllen = -1, 488 .dma_attr_granular = 0x10, 489 .dma_attr_flags = 0 490 }; 491 492 static ddi_device_acc_attr_t nvme_reg_acc_attr = { 493 .devacc_attr_version = DDI_DEVICE_ATTR_V0, 494 .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, 495 .devacc_attr_dataorder = DDI_STRICTORDER_ACC 496 }; 497 498 static struct cb_ops nvme_cb_ops = { 499 .cb_open = nvme_open, 500 .cb_close = nvme_close, 501 .cb_strategy = nodev, 502 .cb_print = nodev, 503 .cb_dump = nodev, 504 .cb_read = nodev, 505 .cb_write = nodev, 506 .cb_ioctl = nvme_ioctl, 507 .cb_devmap = nodev, 508 .cb_mmap = nodev, 509 .cb_segmap = nodev, 510 .cb_chpoll = nochpoll, 511 .cb_prop_op = ddi_prop_op, 512 .cb_str = 0, 513 .cb_flag = D_NEW | D_MP, 514 .cb_rev = CB_REV, 515 .cb_aread = nodev, 516 .cb_awrite = nodev 517 }; 518 519 static struct dev_ops nvme_dev_ops = { 520 .devo_rev = DEVO_REV, 521 .devo_refcnt = 0, 522 .devo_getinfo = ddi_no_info, 523 .devo_identify = nulldev, 524 .devo_probe = nulldev, 525 .devo_attach = nvme_attach, 526 .devo_detach = nvme_detach, 527 .devo_reset = nodev, 528 .devo_cb_ops = &nvme_cb_ops, 529 .devo_bus_ops = NULL, 530 .devo_power = NULL, 531 .devo_quiesce = nvme_quiesce, 532 }; 533 534 static struct modldrv nvme_modldrv = { 535 .drv_modops = &mod_driverops, 536 .drv_linkinfo = "NVMe v1.1b", 537 .drv_dev_ops = &nvme_dev_ops 538 }; 539 540 static struct modlinkage nvme_modlinkage = { 541 .ml_rev = MODREV_1, 542 .ml_linkage = { &nvme_modldrv, NULL } 543 }; 544 545 static bd_ops_t nvme_bd_ops = { 546 
.o_version = BD_OPS_CURRENT_VERSION, 547 .o_drive_info = nvme_bd_driveinfo, 548 .o_media_info = nvme_bd_mediainfo, 549 .o_devid_init = nvme_bd_devid, 550 .o_sync_cache = nvme_bd_sync, 551 .o_read = nvme_bd_read, 552 .o_write = nvme_bd_write, 553 .o_free_space = NULL, 554 }; 555 556 /* 557 * This list will hold commands that have timed out and couldn't be aborted. 558 * As we don't know what the hardware may still do with the DMA memory we can't 559 * free them, so we'll keep them forever on this list where we can easily look 560 * at them with mdb. 561 */ 562 static struct list nvme_lost_cmds; 563 static kmutex_t nvme_lc_mutex; 564 565 int 566 _init(void) 567 { 568 int error; 569 570 error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1); 571 if (error != DDI_SUCCESS) 572 return (error); 573 574 nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache", 575 sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 576 577 mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL); 578 list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t), 579 offsetof(nvme_cmd_t, nc_list)); 580 581 bd_mod_init(&nvme_dev_ops); 582 583 error = mod_install(&nvme_modlinkage); 584 if (error != DDI_SUCCESS) { 585 ddi_soft_state_fini(&nvme_state); 586 mutex_destroy(&nvme_lc_mutex); 587 list_destroy(&nvme_lost_cmds); 588 bd_mod_fini(&nvme_dev_ops); 589 } 590 591 return (error); 592 } 593 594 int 595 _fini(void) 596 { 597 int error; 598 599 if (!list_is_empty(&nvme_lost_cmds)) 600 return (DDI_FAILURE); 601 602 error = mod_remove(&nvme_modlinkage); 603 if (error == DDI_SUCCESS) { 604 ddi_soft_state_fini(&nvme_state); 605 kmem_cache_destroy(nvme_cmd_cache); 606 mutex_destroy(&nvme_lc_mutex); 607 list_destroy(&nvme_lost_cmds); 608 bd_mod_fini(&nvme_dev_ops); 609 } 610 611 return (error); 612 } 613 614 int 615 _info(struct modinfo *modinfop) 616 { 617 return (mod_info(&nvme_modlinkage, modinfop)); 618 } 619 620 static inline void 621 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val) 622 { 623 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 624 625 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 626 ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val); 627 } 628 629 static inline void 630 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val) 631 { 632 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 633 634 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 635 ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val); 636 } 637 638 static inline uint64_t 639 nvme_get64(nvme_t *nvme, uintptr_t reg) 640 { 641 uint64_t val; 642 643 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 644 645 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 646 val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg)); 647 648 return (val); 649 } 650 651 static inline uint32_t 652 nvme_get32(nvme_t *nvme, uintptr_t reg) 653 { 654 uint32_t val; 655 656 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 657 658 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 659 val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg)); 660 661 return (val); 662 } 663 664 static boolean_t 665 nvme_check_regs_hdl(nvme_t *nvme) 666 { 667 ddi_fm_error_t error; 668 669 ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION); 670 671 if (error.fme_status != DDI_FM_OK) 672 return (B_TRUE); 673 674 return (B_FALSE); 675 } 676 677 static boolean_t 678 nvme_check_dma_hdl(nvme_dma_t *dma) 679 { 680 ddi_fm_error_t error; 681 682 if (dma == NULL) 683 return (B_FALSE); 684 685 ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION); 686 687 if (error.fme_status != DDI_FM_OK) 688 return 
(B_TRUE); 689 690 return (B_FALSE); 691 } 692 693 static void 694 nvme_free_dma_common(nvme_dma_t *dma) 695 { 696 if (dma->nd_dmah != NULL) 697 (void) ddi_dma_unbind_handle(dma->nd_dmah); 698 if (dma->nd_acch != NULL) 699 ddi_dma_mem_free(&dma->nd_acch); 700 if (dma->nd_dmah != NULL) 701 ddi_dma_free_handle(&dma->nd_dmah); 702 } 703 704 static void 705 nvme_free_dma(nvme_dma_t *dma) 706 { 707 nvme_free_dma_common(dma); 708 kmem_free(dma, sizeof (*dma)); 709 } 710 711 /* ARGSUSED */ 712 static void 713 nvme_prp_dma_destructor(void *buf, void *private) 714 { 715 nvme_dma_t *dma = (nvme_dma_t *)buf; 716 717 nvme_free_dma_common(dma); 718 } 719 720 static int 721 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma, 722 size_t len, uint_t flags, ddi_dma_attr_t *dma_attr) 723 { 724 if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL, 725 &dma->nd_dmah) != DDI_SUCCESS) { 726 /* 727 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and 728 * the only other possible error is DDI_DMA_BADATTR which 729 * indicates a driver bug which should cause a panic. 730 */ 731 dev_err(nvme->n_dip, CE_PANIC, 732 "!failed to get DMA handle, check DMA attributes"); 733 return (DDI_FAILURE); 734 } 735 736 /* 737 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified 738 * or the flags are conflicting, which isn't the case here. 739 */ 740 (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr, 741 DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp, 742 &dma->nd_len, &dma->nd_acch); 743 744 if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp, 745 dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, 746 &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) { 747 dev_err(nvme->n_dip, CE_WARN, 748 "!failed to bind DMA memory"); 749 atomic_inc_32(&nvme->n_dma_bind_err); 750 nvme_free_dma_common(dma); 751 return (DDI_FAILURE); 752 } 753 754 return (DDI_SUCCESS); 755 } 756 757 static int 758 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags, 759 ddi_dma_attr_t *dma_attr, nvme_dma_t **ret) 760 { 761 nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP); 762 763 if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) != 764 DDI_SUCCESS) { 765 *ret = NULL; 766 kmem_free(dma, sizeof (nvme_dma_t)); 767 return (DDI_FAILURE); 768 } 769 770 bzero(dma->nd_memp, dma->nd_len); 771 772 *ret = dma; 773 return (DDI_SUCCESS); 774 } 775 776 /* ARGSUSED */ 777 static int 778 nvme_prp_dma_constructor(void *buf, void *private, int flags) 779 { 780 nvme_dma_t *dma = (nvme_dma_t *)buf; 781 nvme_t *nvme = (nvme_t *)private; 782 783 dma->nd_dmah = NULL; 784 dma->nd_acch = NULL; 785 786 if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize, 787 DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) { 788 return (-1); 789 } 790 791 ASSERT(dma->nd_ncookie == 1); 792 793 dma->nd_cached = B_TRUE; 794 795 return (0); 796 } 797 798 static int 799 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len, 800 uint_t flags, nvme_dma_t **dma) 801 { 802 uint32_t len = nentry * qe_len; 803 ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr; 804 805 len = roundup(len, nvme->n_pagesize); 806 807 if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma) 808 != DDI_SUCCESS) { 809 dev_err(nvme->n_dip, CE_WARN, 810 "!failed to get DMA memory for queue"); 811 goto fail; 812 } 813 814 if ((*dma)->nd_ncookie != 1) { 815 dev_err(nvme->n_dip, CE_WARN, 816 "!got too many cookies for queue DMA"); 817 goto fail; 818 } 819 820 return (DDI_SUCCESS); 821 822 fail: 823 if (*dma) { 
824 nvme_free_dma(*dma); 825 *dma = NULL; 826 } 827 828 return (DDI_FAILURE); 829 } 830 831 static void 832 nvme_free_cq(nvme_cq_t *cq) 833 { 834 mutex_destroy(&cq->ncq_mutex); 835 836 if (cq->ncq_cmd_taskq != NULL) 837 taskq_destroy(cq->ncq_cmd_taskq); 838 839 if (cq->ncq_dma != NULL) 840 nvme_free_dma(cq->ncq_dma); 841 842 kmem_free(cq, sizeof (*cq)); 843 } 844 845 static void 846 nvme_free_qpair(nvme_qpair_t *qp) 847 { 848 int i; 849 850 mutex_destroy(&qp->nq_mutex); 851 sema_destroy(&qp->nq_sema); 852 853 if (qp->nq_sqdma != NULL) 854 nvme_free_dma(qp->nq_sqdma); 855 856 if (qp->nq_active_cmds > 0) 857 for (i = 0; i != qp->nq_nentry; i++) 858 if (qp->nq_cmd[i] != NULL) 859 nvme_free_cmd(qp->nq_cmd[i]); 860 861 if (qp->nq_cmd != NULL) 862 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry); 863 864 kmem_free(qp, sizeof (nvme_qpair_t)); 865 } 866 867 /* 868 * Destroy the pre-allocated cq array, but only free individual completion 869 * queues from the given starting index. 870 */ 871 static void 872 nvme_destroy_cq_array(nvme_t *nvme, uint_t start) 873 { 874 uint_t i; 875 876 for (i = start; i < nvme->n_cq_count; i++) 877 if (nvme->n_cq[i] != NULL) 878 nvme_free_cq(nvme->n_cq[i]); 879 880 kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count); 881 } 882 883 static int 884 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx, 885 uint_t nthr) 886 { 887 nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP); 888 char name[64]; /* large enough for the taskq name */ 889 890 mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER, 891 DDI_INTR_PRI(nvme->n_intr_pri)); 892 893 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t), 894 DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS) 895 goto fail; 896 897 cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp; 898 cq->ncq_nentry = nentry; 899 cq->ncq_id = idx; 900 cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx); 901 902 /* 903 * Each completion queue has its own command taskq. 904 */ 905 (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u", 906 ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx); 907 908 cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX, 909 TASKQ_PREPOPULATE); 910 911 if (cq->ncq_cmd_taskq == NULL) { 912 dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd " 913 "taskq for cq %u", idx); 914 goto fail; 915 } 916 917 *cqp = cq; 918 return (DDI_SUCCESS); 919 920 fail: 921 nvme_free_cq(cq); 922 *cqp = NULL; 923 924 return (DDI_FAILURE); 925 } 926 927 /* 928 * Create the n_cq array big enough to hold "ncq" completion queues. 929 * If the array already exists it will be re-sized (but only larger). 930 * The admin queue is included in this array, which boosts the 931 * max number of entries to UINT16_MAX + 1. 
932 */ 933 static int 934 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr) 935 { 936 nvme_cq_t **cq; 937 uint_t i, cq_count; 938 939 ASSERT3U(ncq, >, nvme->n_cq_count); 940 941 cq = nvme->n_cq; 942 cq_count = nvme->n_cq_count; 943 944 nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP); 945 nvme->n_cq_count = ncq; 946 947 for (i = 0; i < cq_count; i++) 948 nvme->n_cq[i] = cq[i]; 949 950 for (; i < nvme->n_cq_count; i++) 951 if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) != 952 DDI_SUCCESS) 953 goto fail; 954 955 if (cq != NULL) 956 kmem_free(cq, sizeof (*cq) * cq_count); 957 958 return (DDI_SUCCESS); 959 960 fail: 961 nvme_destroy_cq_array(nvme, cq_count); 962 /* 963 * Restore the original array 964 */ 965 nvme->n_cq_count = cq_count; 966 nvme->n_cq = cq; 967 968 return (DDI_FAILURE); 969 } 970 971 static int 972 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp, 973 uint_t idx) 974 { 975 nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP); 976 uint_t cq_idx; 977 978 mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER, 979 DDI_INTR_PRI(nvme->n_intr_pri)); 980 981 /* 982 * The NVMe spec defines that a full queue has one empty (unused) slot; 983 * initialize the semaphore accordingly. 984 */ 985 sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL); 986 987 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t), 988 DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS) 989 goto fail; 990 991 /* 992 * idx == 0 is adminq, those above 0 are shared io completion queues. 993 */ 994 cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1); 995 qp->nq_cq = nvme->n_cq[cq_idx]; 996 qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp; 997 qp->nq_nentry = nentry; 998 999 qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx); 1000 1001 qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP); 1002 qp->nq_next_cmd = 0; 1003 1004 *nqp = qp; 1005 return (DDI_SUCCESS); 1006 1007 fail: 1008 nvme_free_qpair(qp); 1009 *nqp = NULL; 1010 1011 return (DDI_FAILURE); 1012 } 1013 1014 static nvme_cmd_t * 1015 nvme_alloc_cmd(nvme_t *nvme, int kmflag) 1016 { 1017 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag); 1018 1019 if (cmd == NULL) 1020 return (cmd); 1021 1022 bzero(cmd, sizeof (nvme_cmd_t)); 1023 1024 cmd->nc_nvme = nvme; 1025 1026 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER, 1027 DDI_INTR_PRI(nvme->n_intr_pri)); 1028 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL); 1029 1030 return (cmd); 1031 } 1032 1033 static void 1034 nvme_free_cmd(nvme_cmd_t *cmd) 1035 { 1036 /* Don't free commands on the lost commands list. 
*/ 1037 if (list_link_active(&cmd->nc_list)) 1038 return; 1039 1040 if (cmd->nc_dma) { 1041 if (cmd->nc_dma->nd_cached) 1042 kmem_cache_free(cmd->nc_nvme->n_prp_cache, 1043 cmd->nc_dma); 1044 else 1045 nvme_free_dma(cmd->nc_dma); 1046 cmd->nc_dma = NULL; 1047 } 1048 1049 cv_destroy(&cmd->nc_cv); 1050 mutex_destroy(&cmd->nc_mutex); 1051 1052 kmem_cache_free(nvme_cmd_cache, cmd); 1053 } 1054 1055 static void 1056 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1057 { 1058 sema_p(&qp->nq_sema); 1059 nvme_submit_cmd_common(qp, cmd); 1060 } 1061 1062 static int 1063 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1064 { 1065 if (sema_tryp(&qp->nq_sema) == 0) 1066 return (EAGAIN); 1067 1068 nvme_submit_cmd_common(qp, cmd); 1069 return (0); 1070 } 1071 1072 static void 1073 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1074 { 1075 nvme_reg_sqtdbl_t tail = { 0 }; 1076 1077 mutex_enter(&qp->nq_mutex); 1078 cmd->nc_completed = B_FALSE; 1079 1080 /* 1081 * Try to insert the cmd into the active cmd array at the nq_next_cmd 1082 * slot. If the slot is already occupied advance to the next slot and 1083 * try again. This can happen for long running commands like async event 1084 * requests. 1085 */ 1086 while (qp->nq_cmd[qp->nq_next_cmd] != NULL) 1087 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1088 qp->nq_cmd[qp->nq_next_cmd] = cmd; 1089 1090 qp->nq_active_cmds++; 1091 1092 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd; 1093 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t)); 1094 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah, 1095 sizeof (nvme_sqe_t) * qp->nq_sqtail, 1096 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV); 1097 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1098 1099 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry; 1100 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r); 1101 1102 mutex_exit(&qp->nq_mutex); 1103 } 1104 1105 static nvme_cmd_t * 1106 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid) 1107 { 1108 nvme_cmd_t *cmd; 1109 1110 ASSERT(mutex_owned(&qp->nq_mutex)); 1111 ASSERT3S(cid, <, qp->nq_nentry); 1112 1113 cmd = qp->nq_cmd[cid]; 1114 qp->nq_cmd[cid] = NULL; 1115 ASSERT3U(qp->nq_active_cmds, >, 0); 1116 qp->nq_active_cmds--; 1117 sema_v(&qp->nq_sema); 1118 1119 ASSERT3P(cmd, !=, NULL); 1120 ASSERT3P(cmd->nc_nvme, ==, nvme); 1121 ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid); 1122 1123 return (cmd); 1124 } 1125 1126 /* 1127 * Get the command tied to the next completed cqe and bump along completion 1128 * queue head counter. 1129 */ 1130 static nvme_cmd_t * 1131 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq) 1132 { 1133 nvme_qpair_t *qp; 1134 nvme_cqe_t *cqe; 1135 nvme_cmd_t *cmd; 1136 1137 ASSERT(mutex_owned(&cq->ncq_mutex)); 1138 1139 cqe = &cq->ncq_cq[cq->ncq_head]; 1140 1141 /* Check phase tag of CQE. Hardware inverts it for new entries. */ 1142 if (cqe->cqe_sf.sf_p == cq->ncq_phase) 1143 return (NULL); 1144 1145 qp = nvme->n_ioq[cqe->cqe_sqid]; 1146 1147 mutex_enter(&qp->nq_mutex); 1148 cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid); 1149 mutex_exit(&qp->nq_mutex); 1150 1151 ASSERT(cmd->nc_sqid == cqe->cqe_sqid); 1152 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t)); 1153 1154 qp->nq_sqhead = cqe->cqe_sqhd; 1155 1156 cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry; 1157 1158 /* Toggle phase on wrap-around. */ 1159 if (cq->ncq_head == 0) 1160 cq->ncq_phase = cq->ncq_phase ? 0 : 1; 1161 1162 return (cmd); 1163 } 1164 1165 /* 1166 * Process all completed commands on the io completion queue. 
1167 */ 1168 static uint_t 1169 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq) 1170 { 1171 nvme_reg_cqhdbl_t head = { 0 }; 1172 nvme_cmd_t *cmd; 1173 uint_t completed = 0; 1174 1175 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1176 DDI_SUCCESS) 1177 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1178 __func__); 1179 1180 mutex_enter(&cq->ncq_mutex); 1181 1182 while ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1183 taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd, 1184 TQ_NOSLEEP, &cmd->nc_tqent); 1185 1186 completed++; 1187 } 1188 1189 if (completed > 0) { 1190 /* 1191 * Update the completion queue head doorbell. 1192 */ 1193 head.b.cqhdbl_cqh = cq->ncq_head; 1194 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1195 } 1196 1197 mutex_exit(&cq->ncq_mutex); 1198 1199 return (completed); 1200 } 1201 1202 static nvme_cmd_t * 1203 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp) 1204 { 1205 nvme_cq_t *cq = qp->nq_cq; 1206 nvme_reg_cqhdbl_t head = { 0 }; 1207 nvme_cmd_t *cmd; 1208 1209 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1210 DDI_SUCCESS) 1211 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1212 __func__); 1213 1214 mutex_enter(&cq->ncq_mutex); 1215 1216 if ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1217 head.b.cqhdbl_cqh = cq->ncq_head; 1218 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1219 } 1220 1221 mutex_exit(&cq->ncq_mutex); 1222 1223 return (cmd); 1224 } 1225 1226 static int 1227 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd) 1228 { 1229 nvme_cqe_t *cqe = &cmd->nc_cqe; 1230 1231 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1232 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1233 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1234 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1235 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1236 1237 if (cmd->nc_xfer != NULL) 1238 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1239 1240 if (cmd->nc_nvme->n_strict_version) { 1241 cmd->nc_nvme->n_dead = B_TRUE; 1242 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1243 } 1244 1245 return (EIO); 1246 } 1247 1248 static int 1249 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd) 1250 { 1251 nvme_cqe_t *cqe = &cmd->nc_cqe; 1252 1253 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1254 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1255 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1256 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1257 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1258 if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) { 1259 cmd->nc_nvme->n_dead = B_TRUE; 1260 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1261 } 1262 1263 return (EIO); 1264 } 1265 1266 static int 1267 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd) 1268 { 1269 nvme_cqe_t *cqe = &cmd->nc_cqe; 1270 1271 switch (cqe->cqe_sf.sf_sc) { 1272 case NVME_CQE_SC_INT_NVM_WRITE: 1273 /* write fail */ 1274 /* TODO: post ereport */ 1275 if (cmd->nc_xfer != NULL) 1276 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1277 return (EIO); 1278 1279 case NVME_CQE_SC_INT_NVM_READ: 1280 /* read fail */ 1281 /* TODO: post ereport */ 1282 if (cmd->nc_xfer != NULL) 1283 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1284 return (EIO); 1285 1286 default: 1287 return (nvme_check_unknown_cmd_status(cmd)); 1288 } 1289 } 1290 1291 static int 1292 nvme_check_generic_cmd_status(nvme_cmd_t *cmd) 1293 { 1294 nvme_cqe_t *cqe = &cmd->nc_cqe; 1295 1296 switch (cqe->cqe_sf.sf_sc) { 1297 
case NVME_CQE_SC_GEN_SUCCESS: 1298 return (0); 1299 1300 /* 1301 * Errors indicating a bug in the driver should cause a panic. 1302 */ 1303 case NVME_CQE_SC_GEN_INV_OPC: 1304 /* Invalid Command Opcode */ 1305 if (!cmd->nc_dontpanic) 1306 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1307 "programming error: invalid opcode in cmd %p", 1308 (void *)cmd); 1309 return (EINVAL); 1310 1311 case NVME_CQE_SC_GEN_INV_FLD: 1312 /* Invalid Field in Command */ 1313 if (!cmd->nc_dontpanic) 1314 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1315 "programming error: invalid field in cmd %p", 1316 (void *)cmd); 1317 return (EIO); 1318 1319 case NVME_CQE_SC_GEN_ID_CNFL: 1320 /* Command ID Conflict */ 1321 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1322 "cmd ID conflict in cmd %p", (void *)cmd); 1323 return (0); 1324 1325 case NVME_CQE_SC_GEN_INV_NS: 1326 /* Invalid Namespace or Format */ 1327 if (!cmd->nc_dontpanic) 1328 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1329 "programming error: invalid NS/format in cmd %p", 1330 (void *)cmd); 1331 return (EINVAL); 1332 1333 case NVME_CQE_SC_GEN_NVM_LBA_RANGE: 1334 /* LBA Out Of Range */ 1335 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1336 "LBA out of range in cmd %p", (void *)cmd); 1337 return (0); 1338 1339 /* 1340 * Non-fatal errors, handle gracefully. 1341 */ 1342 case NVME_CQE_SC_GEN_DATA_XFR_ERR: 1343 /* Data Transfer Error (DMA) */ 1344 /* TODO: post ereport */ 1345 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err); 1346 if (cmd->nc_xfer != NULL) 1347 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1348 return (EIO); 1349 1350 case NVME_CQE_SC_GEN_INTERNAL_ERR: 1351 /* 1352 * Internal Error. The spec (v1.0, section 4.5.1.2) says 1353 * detailed error information is returned as async event, 1354 * so we pretty much ignore the error here and handle it 1355 * in the async event handler. 1356 */ 1357 atomic_inc_32(&cmd->nc_nvme->n_internal_err); 1358 if (cmd->nc_xfer != NULL) 1359 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1360 return (EIO); 1361 1362 case NVME_CQE_SC_GEN_ABORT_REQUEST: 1363 /* 1364 * Command Abort Requested. This normally happens only when a 1365 * command times out. 1366 */ 1367 /* TODO: post ereport or change blkdev to handle this? 
*/ 1368 atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err); 1369 return (ECANCELED); 1370 1371 case NVME_CQE_SC_GEN_ABORT_PWRLOSS: 1372 /* Command Aborted due to Power Loss Notification */ 1373 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1374 cmd->nc_nvme->n_dead = B_TRUE; 1375 return (EIO); 1376 1377 case NVME_CQE_SC_GEN_ABORT_SQ_DEL: 1378 /* Command Aborted due to SQ Deletion */ 1379 atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del); 1380 return (EIO); 1381 1382 case NVME_CQE_SC_GEN_NVM_CAP_EXC: 1383 /* Capacity Exceeded */ 1384 atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); 1385 if (cmd->nc_xfer != NULL) 1386 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1387 return (EIO); 1388 1389 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: 1390 /* Namespace Not Ready */ 1391 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); 1392 if (cmd->nc_xfer != NULL) 1393 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1394 return (EIO); 1395 1396 default: 1397 return (nvme_check_unknown_cmd_status(cmd)); 1398 } 1399 } 1400 1401 static int 1402 nvme_check_specific_cmd_status(nvme_cmd_t *cmd) 1403 { 1404 nvme_cqe_t *cqe = &cmd->nc_cqe; 1405 1406 switch (cqe->cqe_sf.sf_sc) { 1407 case NVME_CQE_SC_SPC_INV_CQ: 1408 /* Completion Queue Invalid */ 1409 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE); 1410 atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err); 1411 return (EINVAL); 1412 1413 case NVME_CQE_SC_SPC_INV_QID: 1414 /* Invalid Queue Identifier */ 1415 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1416 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE || 1417 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE || 1418 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1419 atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err); 1420 return (EINVAL); 1421 1422 case NVME_CQE_SC_SPC_MAX_QSZ_EXC: 1423 /* Max Queue Size Exceeded */ 1424 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1425 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1426 atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc); 1427 return (EINVAL); 1428 1429 case NVME_CQE_SC_SPC_ABRT_CMD_EXC: 1430 /* Abort Command Limit Exceeded */ 1431 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT); 1432 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1433 "abort command limit exceeded in cmd %p", (void *)cmd); 1434 return (0); 1435 1436 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC: 1437 /* Async Event Request Limit Exceeded */ 1438 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT); 1439 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1440 "async event request limit exceeded in cmd %p", 1441 (void *)cmd); 1442 return (0); 1443 1444 case NVME_CQE_SC_SPC_INV_INT_VECT: 1445 /* Invalid Interrupt Vector */ 1446 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1447 atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect); 1448 return (EINVAL); 1449 1450 case NVME_CQE_SC_SPC_INV_LOG_PAGE: 1451 /* Invalid Log Page */ 1452 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); 1453 atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); 1454 return (EINVAL); 1455 1456 case NVME_CQE_SC_SPC_INV_FORMAT: 1457 /* Invalid Format */ 1458 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); 1459 atomic_inc_32(&cmd->nc_nvme->n_inv_format); 1460 if (cmd->nc_xfer != NULL) 1461 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1462 return (EINVAL); 1463 1464 case NVME_CQE_SC_SPC_INV_Q_DEL: 1465 /* Invalid Queue Deletion */ 1466 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1467 atomic_inc_32(&cmd->nc_nvme->n_inv_q_del); 1468 return (EINVAL); 1469 1470 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR: 1471 /* Conflicting Attributes */ 1472 
ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT || 1473 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1474 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1475 atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); 1476 if (cmd->nc_xfer != NULL) 1477 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1478 return (EINVAL); 1479 1480 case NVME_CQE_SC_SPC_NVM_INV_PROT: 1481 /* Invalid Protection Information */ 1482 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE || 1483 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1484 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1485 atomic_inc_32(&cmd->nc_nvme->n_inv_prot); 1486 if (cmd->nc_xfer != NULL) 1487 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1488 return (EINVAL); 1489 1490 case NVME_CQE_SC_SPC_NVM_READONLY: 1491 /* Write to Read Only Range */ 1492 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1493 atomic_inc_32(&cmd->nc_nvme->n_readonly); 1494 if (cmd->nc_xfer != NULL) 1495 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1496 return (EROFS); 1497 1498 case NVME_CQE_SC_SPC_INV_FW_SLOT: 1499 /* Invalid Firmware Slot */ 1500 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1501 return (EINVAL); 1502 1503 case NVME_CQE_SC_SPC_INV_FW_IMG: 1504 /* Invalid Firmware Image */ 1505 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1506 return (EINVAL); 1507 1508 case NVME_CQE_SC_SPC_FW_RESET: 1509 /* Conventional Reset Required */ 1510 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1511 return (0); 1512 1513 case NVME_CQE_SC_SPC_FW_NSSR: 1514 /* NVMe Subsystem Reset Required */ 1515 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1516 return (0); 1517 1518 case NVME_CQE_SC_SPC_FW_NEXT_RESET: 1519 /* Activation Requires Reset */ 1520 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1521 return (0); 1522 1523 case NVME_CQE_SC_SPC_FW_MTFA: 1524 /* Activation Requires Maximum Time Violation */ 1525 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1526 return (EAGAIN); 1527 1528 case NVME_CQE_SC_SPC_FW_PROHIBITED: 1529 /* Activation Prohibited */ 1530 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1531 return (EINVAL); 1532 1533 case NVME_CQE_SC_SPC_FW_OVERLAP: 1534 /* Overlapping Firmware Ranges */ 1535 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD); 1536 return (EINVAL); 1537 1538 default: 1539 return (nvme_check_unknown_cmd_status(cmd)); 1540 } 1541 } 1542 1543 static inline int 1544 nvme_check_cmd_status(nvme_cmd_t *cmd) 1545 { 1546 nvme_cqe_t *cqe = &cmd->nc_cqe; 1547 1548 /* 1549 * Take a shortcut if the controller is dead, or if 1550 * command status indicates no error. 
1551 */ 1552 if (cmd->nc_nvme->n_dead) 1553 return (EIO); 1554 1555 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1556 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS) 1557 return (0); 1558 1559 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) 1560 return (nvme_check_generic_cmd_status(cmd)); 1561 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 1562 return (nvme_check_specific_cmd_status(cmd)); 1563 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) 1564 return (nvme_check_integrity_cmd_status(cmd)); 1565 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) 1566 return (nvme_check_vendor_cmd_status(cmd)); 1567 1568 return (nvme_check_unknown_cmd_status(cmd)); 1569 } 1570 1571 static int 1572 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec) 1573 { 1574 nvme_t *nvme = abort_cmd->nc_nvme; 1575 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1576 nvme_abort_cmd_t ac = { 0 }; 1577 int ret = 0; 1578 1579 sema_p(&nvme->n_abort_sema); 1580 1581 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid; 1582 ac.b.ac_sqid = abort_cmd->nc_sqid; 1583 1584 cmd->nc_sqid = 0; 1585 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT; 1586 cmd->nc_callback = nvme_wakeup_cmd; 1587 cmd->nc_sqe.sqe_cdw10 = ac.r; 1588 1589 /* 1590 * Send the ABORT to the hardware. The ABORT command will return _after_ 1591 * the aborted command has completed (aborted or otherwise), but since 1592 * we still hold the aborted command's mutex its callback hasn't been 1593 * processed yet. 1594 */ 1595 nvme_admin_cmd(cmd, sec); 1596 sema_v(&nvme->n_abort_sema); 1597 1598 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 1599 dev_err(nvme->n_dip, CE_WARN, 1600 "!ABORT failed with sct = %x, sc = %x", 1601 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1602 atomic_inc_32(&nvme->n_abort_failed); 1603 } else { 1604 dev_err(nvme->n_dip, CE_WARN, 1605 "!ABORT of command %d/%d %ssuccessful", 1606 abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid, 1607 cmd->nc_cqe.cqe_dw0 & 1 ? "un" : ""); 1608 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0) 1609 atomic_inc_32(&nvme->n_cmd_aborted); 1610 } 1611 1612 nvme_free_cmd(cmd); 1613 return (ret); 1614 } 1615 1616 /* 1617 * nvme_wait_cmd -- wait for command completion or timeout 1618 * 1619 * In case of a serious error or a timeout of the abort command the hardware 1620 * will be declared dead and FMA will be notified. 1621 */ 1622 static void 1623 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec) 1624 { 1625 clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC); 1626 nvme_t *nvme = cmd->nc_nvme; 1627 nvme_reg_csts_t csts; 1628 nvme_qpair_t *qp; 1629 1630 ASSERT(mutex_owned(&cmd->nc_mutex)); 1631 1632 while (!cmd->nc_completed) { 1633 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) 1634 break; 1635 } 1636 1637 if (cmd->nc_completed) 1638 return; 1639 1640 /* 1641 * The command timed out. 1642 * 1643 * Check controller for fatal status, any errors associated with the 1644 * register or DMA handle, or for a double timeout (abort command timed 1645 * out). If necessary log a warning and call FMA. 
1646 */ 1647 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1648 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, " 1649 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 1650 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs); 1651 atomic_inc_32(&nvme->n_cmd_timeout); 1652 1653 if (csts.b.csts_cfs || 1654 nvme_check_regs_hdl(nvme) || 1655 nvme_check_dma_hdl(cmd->nc_dma) || 1656 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) { 1657 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1658 nvme->n_dead = B_TRUE; 1659 } else if (nvme_abort_cmd(cmd, sec) == 0) { 1660 /* 1661 * If the abort succeeded the command should complete 1662 * immediately with an appropriate status. 1663 */ 1664 while (!cmd->nc_completed) 1665 cv_wait(&cmd->nc_cv, &cmd->nc_mutex); 1666 1667 return; 1668 } 1669 1670 qp = nvme->n_ioq[cmd->nc_sqid]; 1671 1672 mutex_enter(&qp->nq_mutex); 1673 (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 1674 mutex_exit(&qp->nq_mutex); 1675 1676 /* 1677 * As we don't know what the presumed dead hardware might still do with 1678 * the DMA memory, we'll put the command on the lost commands list if it 1679 * has any DMA memory. 1680 */ 1681 if (cmd->nc_dma != NULL) { 1682 mutex_enter(&nvme_lc_mutex); 1683 list_insert_head(&nvme_lost_cmds, cmd); 1684 mutex_exit(&nvme_lc_mutex); 1685 } 1686 } 1687 1688 static void 1689 nvme_wakeup_cmd(void *arg) 1690 { 1691 nvme_cmd_t *cmd = arg; 1692 1693 mutex_enter(&cmd->nc_mutex); 1694 cmd->nc_completed = B_TRUE; 1695 cv_signal(&cmd->nc_cv); 1696 mutex_exit(&cmd->nc_mutex); 1697 } 1698 1699 static void 1700 nvme_async_event_task(void *arg) 1701 { 1702 nvme_cmd_t *cmd = arg; 1703 nvme_t *nvme = cmd->nc_nvme; 1704 nvme_error_log_entry_t *error_log = NULL; 1705 nvme_health_log_t *health_log = NULL; 1706 size_t logsize = 0; 1707 nvme_async_event_t event; 1708 1709 /* 1710 * Check for errors associated with the async request itself. The only 1711 * command-specific error is "async event limit exceeded", which 1712 * indicates a programming error in the driver and causes a panic in 1713 * nvme_check_cmd_status(). 1714 * 1715 * Other possible errors are various scenarios where the async request 1716 * was aborted, or internal errors in the device. Internal errors are 1717 * reported to FMA, the command aborts need no special handling here. 1718 * 1719 * And finally, at least qemu nvme does not support async events, 1720 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we 1721 * will avoid posting async events. 1722 */ 1723 1724 if (nvme_check_cmd_status(cmd) != 0) { 1725 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1726 "!async event request returned failure, sct = %x, " 1727 "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct, 1728 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr, 1729 cmd->nc_cqe.cqe_sf.sf_m); 1730 1731 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1732 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) { 1733 cmd->nc_nvme->n_dead = B_TRUE; 1734 ddi_fm_service_impact(cmd->nc_nvme->n_dip, 1735 DDI_SERVICE_LOST); 1736 } 1737 1738 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1739 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC && 1740 cmd->nc_cqe.cqe_sf.sf_dnr == 1) { 1741 nvme->n_async_event_supported = B_FALSE; 1742 } 1743 1744 nvme_free_cmd(cmd); 1745 return; 1746 } 1747 1748 1749 event.r = cmd->nc_cqe.cqe_dw0; 1750 1751 /* Clear CQE and re-submit the async request. 
*/ 1752 bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t)); 1753 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 1754 1755 switch (event.b.ae_type) { 1756 case NVME_ASYNC_TYPE_ERROR: 1757 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { 1758 (void) nvme_get_logpage(nvme, B_FALSE, 1759 (void **)&error_log, &logsize, event.b.ae_logpage); 1760 } else { 1761 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1762 "async event reply: %d", event.b.ae_logpage); 1763 atomic_inc_32(&nvme->n_wrong_logpage); 1764 } 1765 1766 switch (event.b.ae_info) { 1767 case NVME_ASYNC_ERROR_INV_SQ: 1768 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1769 "invalid submission queue"); 1770 return; 1771 1772 case NVME_ASYNC_ERROR_INV_DBL: 1773 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1774 "invalid doorbell write value"); 1775 return; 1776 1777 case NVME_ASYNC_ERROR_DIAGFAIL: 1778 dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure"); 1779 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1780 nvme->n_dead = B_TRUE; 1781 atomic_inc_32(&nvme->n_diagfail_event); 1782 break; 1783 1784 case NVME_ASYNC_ERROR_PERSISTENT: 1785 dev_err(nvme->n_dip, CE_WARN, "!persistent internal " 1786 "device error"); 1787 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1788 nvme->n_dead = B_TRUE; 1789 atomic_inc_32(&nvme->n_persistent_event); 1790 break; 1791 1792 case NVME_ASYNC_ERROR_TRANSIENT: 1793 dev_err(nvme->n_dip, CE_WARN, "!transient internal " 1794 "device error"); 1795 /* TODO: send ereport */ 1796 atomic_inc_32(&nvme->n_transient_event); 1797 break; 1798 1799 case NVME_ASYNC_ERROR_FW_LOAD: 1800 dev_err(nvme->n_dip, CE_WARN, 1801 "!firmware image load error"); 1802 atomic_inc_32(&nvme->n_fw_load_event); 1803 break; 1804 } 1805 break; 1806 1807 case NVME_ASYNC_TYPE_HEALTH: 1808 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { 1809 (void) nvme_get_logpage(nvme, B_FALSE, 1810 (void **)&health_log, &logsize, event.b.ae_logpage, 1811 -1); 1812 } else { 1813 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1814 "async event reply: %d", event.b.ae_logpage); 1815 atomic_inc_32(&nvme->n_wrong_logpage); 1816 } 1817 1818 switch (event.b.ae_info) { 1819 case NVME_ASYNC_HEALTH_RELIABILITY: 1820 dev_err(nvme->n_dip, CE_WARN, 1821 "!device reliability compromised"); 1822 /* TODO: send ereport */ 1823 atomic_inc_32(&nvme->n_reliability_event); 1824 break; 1825 1826 case NVME_ASYNC_HEALTH_TEMPERATURE: 1827 dev_err(nvme->n_dip, CE_WARN, 1828 "!temperature above threshold"); 1829 /* TODO: send ereport */ 1830 atomic_inc_32(&nvme->n_temperature_event); 1831 break; 1832 1833 case NVME_ASYNC_HEALTH_SPARE: 1834 dev_err(nvme->n_dip, CE_WARN, 1835 "!spare space below threshold"); 1836 /* TODO: send ereport */ 1837 atomic_inc_32(&nvme->n_spare_event); 1838 break; 1839 } 1840 break; 1841 1842 case NVME_ASYNC_TYPE_VENDOR: 1843 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event " 1844 "received, info = %x, logpage = %x", event.b.ae_info, 1845 event.b.ae_logpage); 1846 atomic_inc_32(&nvme->n_vendor_event); 1847 break; 1848 1849 default: 1850 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, " 1851 "type = %x, info = %x, logpage = %x", event.b.ae_type, 1852 event.b.ae_info, event.b.ae_logpage); 1853 atomic_inc_32(&nvme->n_unknown_event); 1854 break; 1855 } 1856 1857 if (error_log) 1858 kmem_free(error_log, logsize); 1859 1860 if (health_log) 1861 kmem_free(health_log, logsize); 1862 } 1863 1864 static void 1865 nvme_admin_cmd(nvme_cmd_t *cmd, int sec) 1866 { 1867 mutex_enter(&cmd->nc_mutex); 1868 
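/*
 * nc_mutex is taken before submission so that the completion callback
 * (nvme_wakeup_cmd) cannot mark the command completed and signal the CV
 * before nvme_wait_cmd() is ready to wait; no wakeup can be lost.
 */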
nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd); 1869 nvme_wait_cmd(cmd, sec); 1870 mutex_exit(&cmd->nc_mutex); 1871 } 1872 1873 static void 1874 nvme_async_event(nvme_t *nvme) 1875 { 1876 nvme_cmd_t *cmd; 1877 1878 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1879 cmd->nc_sqid = 0; 1880 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT; 1881 cmd->nc_callback = nvme_async_event_task; 1882 cmd->nc_dontpanic = B_TRUE; 1883 1884 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 1885 } 1886 1887 static int 1888 nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf, 1889 boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses) 1890 { 1891 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1892 nvme_format_nvm_t format_nvm = { 0 }; 1893 int ret; 1894 1895 format_nvm.b.fm_lbaf = lbaf & 0xf; 1896 format_nvm.b.fm_ms = ms ? 1 : 0; 1897 format_nvm.b.fm_pi = pi & 0x7; 1898 format_nvm.b.fm_pil = pil ? 1 : 0; 1899 format_nvm.b.fm_ses = ses & 0x7; 1900 1901 cmd->nc_sqid = 0; 1902 cmd->nc_callback = nvme_wakeup_cmd; 1903 cmd->nc_sqe.sqe_nsid = nsid; 1904 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; 1905 cmd->nc_sqe.sqe_cdw10 = format_nvm.r; 1906 1907 /* 1908 * Some devices like Samsung SM951 don't allow formatting of all 1909 * namespaces in one command. Handle that gracefully. 1910 */ 1911 if (nsid == (uint32_t)-1) 1912 cmd->nc_dontpanic = B_TRUE; 1913 /* 1914 * If this format request was initiated by the user, then don't allow a 1915 * programmer error to panic the system. 1916 */ 1917 if (user) 1918 cmd->nc_dontpanic = B_TRUE; 1919 1920 nvme_admin_cmd(cmd, nvme_format_cmd_timeout); 1921 1922 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 1923 dev_err(nvme->n_dip, CE_WARN, 1924 "!FORMAT failed with sct = %x, sc = %x", 1925 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1926 } 1927 1928 nvme_free_cmd(cmd); 1929 return (ret); 1930 } 1931 1932 static int 1933 nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize, 1934 uint8_t logpage, ...) 1935 { 1936 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1937 nvme_getlogpage_t getlogpage = { 0 }; 1938 va_list ap; 1939 int ret; 1940 1941 va_start(ap, logpage); 1942 1943 cmd->nc_sqid = 0; 1944 cmd->nc_callback = nvme_wakeup_cmd; 1945 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE; 1946 1947 if (user) 1948 cmd->nc_dontpanic = B_TRUE; 1949 1950 getlogpage.b.lp_lid = logpage; 1951 1952 switch (logpage) { 1953 case NVME_LOGPAGE_ERROR: 1954 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 1955 /* 1956 * The GET LOG PAGE command can use at most 2 pages to return 1957 * data, PRP lists are not supported. 
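 *
 * For example, with the usual 4k page size this caps the error log
 * transfer at 8k (two pages), no matter how many error log entries
 * n_error_log_len says the controller keeps.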
1958 */ 1959 *bufsize = MIN(2 * nvme->n_pagesize, 1960 nvme->n_error_log_len * sizeof (nvme_error_log_entry_t)); 1961 break; 1962 1963 case NVME_LOGPAGE_HEALTH: 1964 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 1965 *bufsize = sizeof (nvme_health_log_t); 1966 break; 1967 1968 case NVME_LOGPAGE_FWSLOT: 1969 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 1970 *bufsize = sizeof (nvme_fwslot_log_t); 1971 break; 1972 1973 default: 1974 dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d", 1975 logpage); 1976 atomic_inc_32(&nvme->n_unknown_logpage); 1977 ret = EINVAL; 1978 goto fail; 1979 } 1980 1981 va_end(ap); 1982 1983 getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1; 1984 1985 cmd->nc_sqe.sqe_cdw10 = getlogpage.r; 1986 1987 if (nvme_zalloc_dma(nvme, *bufsize, 1988 DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 1989 dev_err(nvme->n_dip, CE_WARN, 1990 "!nvme_zalloc_dma failed for GET LOG PAGE"); 1991 ret = ENOMEM; 1992 goto fail; 1993 } 1994 1995 if (cmd->nc_dma->nd_ncookie > 2) { 1996 dev_err(nvme->n_dip, CE_WARN, 1997 "!too many DMA cookies for GET LOG PAGE"); 1998 atomic_inc_32(&nvme->n_too_many_cookies); 1999 ret = ENOMEM; 2000 goto fail; 2001 } 2002 2003 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; 2004 if (cmd->nc_dma->nd_ncookie > 1) { 2005 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2006 &cmd->nc_dma->nd_cookie); 2007 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2008 cmd->nc_dma->nd_cookie.dmac_laddress; 2009 } 2010 2011 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2012 2013 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2014 dev_err(nvme->n_dip, CE_WARN, 2015 "!GET LOG PAGE failed with sct = %x, sc = %x", 2016 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2017 goto fail; 2018 } 2019 2020 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2021 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2022 2023 fail: 2024 nvme_free_cmd(cmd); 2025 2026 return (ret); 2027 } 2028 2029 static int 2030 nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf) 2031 { 2032 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2033 int ret; 2034 2035 if (buf == NULL) 2036 return (EINVAL); 2037 2038 cmd->nc_sqid = 0; 2039 cmd->nc_callback = nvme_wakeup_cmd; 2040 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY; 2041 cmd->nc_sqe.sqe_nsid = nsid; 2042 cmd->nc_sqe.sqe_cdw10 = nsid ? 
NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL; 2043 2044 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ, 2045 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2046 dev_err(nvme->n_dip, CE_WARN, 2047 "!nvme_zalloc_dma failed for IDENTIFY"); 2048 ret = ENOMEM; 2049 goto fail; 2050 } 2051 2052 if (cmd->nc_dma->nd_ncookie > 2) { 2053 dev_err(nvme->n_dip, CE_WARN, 2054 "!too many DMA cookies for IDENTIFY"); 2055 atomic_inc_32(&nvme->n_too_many_cookies); 2056 ret = ENOMEM; 2057 goto fail; 2058 } 2059 2060 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; 2061 if (cmd->nc_dma->nd_ncookie > 1) { 2062 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2063 &cmd->nc_dma->nd_cookie); 2064 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2065 cmd->nc_dma->nd_cookie.dmac_laddress; 2066 } 2067 2068 if (user) 2069 cmd->nc_dontpanic = B_TRUE; 2070 2071 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2072 2073 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2074 dev_err(nvme->n_dip, CE_WARN, 2075 "!IDENTIFY failed with sct = %x, sc = %x", 2076 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2077 goto fail; 2078 } 2079 2080 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP); 2081 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE); 2082 2083 fail: 2084 nvme_free_cmd(cmd); 2085 2086 return (ret); 2087 } 2088 2089 static int 2090 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2091 uint32_t val, uint32_t *res) 2092 { 2093 _NOTE(ARGUNUSED(nsid)); 2094 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2095 int ret = EINVAL; 2096 2097 ASSERT(res != NULL); 2098 2099 cmd->nc_sqid = 0; 2100 cmd->nc_callback = nvme_wakeup_cmd; 2101 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES; 2102 cmd->nc_sqe.sqe_cdw10 = feature; 2103 cmd->nc_sqe.sqe_cdw11 = val; 2104 2105 if (user) 2106 cmd->nc_dontpanic = B_TRUE; 2107 2108 switch (feature) { 2109 case NVME_FEAT_WRITE_CACHE: 2110 if (!nvme->n_write_cache_present) 2111 goto fail; 2112 break; 2113 2114 case NVME_FEAT_NQUEUES: 2115 break; 2116 2117 default: 2118 goto fail; 2119 } 2120 2121 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2122 2123 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2124 dev_err(nvme->n_dip, CE_WARN, 2125 "!SET FEATURES %d failed with sct = %x, sc = %x", 2126 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2127 cmd->nc_cqe.cqe_sf.sf_sc); 2128 goto fail; 2129 } 2130 2131 *res = cmd->nc_cqe.cqe_dw0; 2132 2133 fail: 2134 nvme_free_cmd(cmd); 2135 return (ret); 2136 } 2137 2138 static int 2139 nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2140 uint32_t *res, void **buf, size_t *bufsize) 2141 { 2142 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2143 int ret = EINVAL; 2144 2145 ASSERT(res != NULL); 2146 2147 if (bufsize != NULL) 2148 *bufsize = 0; 2149 2150 cmd->nc_sqid = 0; 2151 cmd->nc_callback = nvme_wakeup_cmd; 2152 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES; 2153 cmd->nc_sqe.sqe_cdw10 = feature; 2154 cmd->nc_sqe.sqe_cdw11 = *res; 2155 2156 /* 2157 * For some of the optional features there doesn't seem to be a method 2158 * of detecting whether it is supported other than using it. This will 2159 * cause "Invalid Field in Command" error, which is normally considered 2160 * a programming error. Set the nc_dontpanic flag to override the panic 2161 * in nvme_check_generic_cmd_status(). 
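 *
 * Roughly, the probe pattern for such features is: set nc_dontpanic,
 * issue GET FEATURES, and if the controller answers with the generic
 * "Invalid Field in Command" status, record the feature as unsupported
 * (see the n_lba_range_supported / n_progress_supported handling below)
 * instead of treating the failure as an error.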
2162 */ 2163 switch (feature) { 2164 case NVME_FEAT_ARBITRATION: 2165 case NVME_FEAT_POWER_MGMT: 2166 case NVME_FEAT_TEMPERATURE: 2167 case NVME_FEAT_ERROR: 2168 case NVME_FEAT_NQUEUES: 2169 case NVME_FEAT_INTR_COAL: 2170 case NVME_FEAT_INTR_VECT: 2171 case NVME_FEAT_WRITE_ATOM: 2172 case NVME_FEAT_ASYNC_EVENT: 2173 break; 2174 2175 case NVME_FEAT_WRITE_CACHE: 2176 if (!nvme->n_write_cache_present) 2177 goto fail; 2178 break; 2179 2180 case NVME_FEAT_LBA_RANGE: 2181 if (!nvme->n_lba_range_supported) 2182 goto fail; 2183 2184 cmd->nc_dontpanic = B_TRUE; 2185 cmd->nc_sqe.sqe_nsid = nsid; 2186 ASSERT(bufsize != NULL); 2187 *bufsize = NVME_LBA_RANGE_BUFSIZE; 2188 break; 2189 2190 case NVME_FEAT_AUTO_PST: 2191 if (!nvme->n_auto_pst_supported) 2192 goto fail; 2193 2194 ASSERT(bufsize != NULL); 2195 *bufsize = NVME_AUTO_PST_BUFSIZE; 2196 break; 2197 2198 case NVME_FEAT_PROGRESS: 2199 if (!nvme->n_progress_supported) 2200 goto fail; 2201 2202 cmd->nc_dontpanic = B_TRUE; 2203 break; 2204 2205 default: 2206 goto fail; 2207 } 2208 2209 if (user) 2210 cmd->nc_dontpanic = B_TRUE; 2211 2212 if (bufsize != NULL && *bufsize != 0) { 2213 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, 2214 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2215 dev_err(nvme->n_dip, CE_WARN, 2216 "!nvme_zalloc_dma failed for GET FEATURES"); 2217 ret = ENOMEM; 2218 goto fail; 2219 } 2220 2221 if (cmd->nc_dma->nd_ncookie > 2) { 2222 dev_err(nvme->n_dip, CE_WARN, 2223 "!too many DMA cookies for GET FEATURES"); 2224 atomic_inc_32(&nvme->n_too_many_cookies); 2225 ret = ENOMEM; 2226 goto fail; 2227 } 2228 2229 cmd->nc_sqe.sqe_dptr.d_prp[0] = 2230 cmd->nc_dma->nd_cookie.dmac_laddress; 2231 if (cmd->nc_dma->nd_ncookie > 1) { 2232 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2233 &cmd->nc_dma->nd_cookie); 2234 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2235 cmd->nc_dma->nd_cookie.dmac_laddress; 2236 } 2237 } 2238 2239 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2240 2241 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2242 boolean_t known = B_TRUE; 2243 2244 /* Check if this is unsupported optional feature */ 2245 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 2246 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) { 2247 switch (feature) { 2248 case NVME_FEAT_LBA_RANGE: 2249 nvme->n_lba_range_supported = B_FALSE; 2250 break; 2251 case NVME_FEAT_PROGRESS: 2252 nvme->n_progress_supported = B_FALSE; 2253 break; 2254 default: 2255 known = B_FALSE; 2256 break; 2257 } 2258 } else { 2259 known = B_FALSE; 2260 } 2261 2262 /* Report the error otherwise */ 2263 if (!known) { 2264 dev_err(nvme->n_dip, CE_WARN, 2265 "!GET FEATURES %d failed with sct = %x, sc = %x", 2266 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2267 cmd->nc_cqe.cqe_sf.sf_sc); 2268 } 2269 2270 goto fail; 2271 } 2272 2273 if (bufsize != NULL && *bufsize != 0) { 2274 ASSERT(buf != NULL); 2275 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2276 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2277 } 2278 2279 *res = cmd->nc_cqe.cqe_dw0; 2280 2281 fail: 2282 nvme_free_cmd(cmd); 2283 return (ret); 2284 } 2285 2286 static int 2287 nvme_write_cache_set(nvme_t *nvme, boolean_t enable) 2288 { 2289 nvme_write_cache_t nwc = { 0 }; 2290 2291 if (enable) 2292 nwc.b.wc_wce = 1; 2293 2294 return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE, 2295 nwc.r, &nwc.r)); 2296 } 2297 2298 static int 2299 nvme_set_nqueues(nvme_t *nvme) 2300 { 2301 nvme_nqueues_t nq = { 0 }; 2302 int ret; 2303 2304 /* 2305 * The default is to allocate one completion queue per vector. 
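 *
 * For example, with 8 interrupt vectors and neither max-submission-queues
 * nor max-completion-queues set, the driver ends up asking the controller
 * for 8 submission and 8 completion queues.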
2306 */ 2307 if (nvme->n_completion_queues == -1) 2308 nvme->n_completion_queues = nvme->n_intr_cnt; 2309 2310 /* 2311 * There is no point in having more compeletion queues than 2312 * interrupt vectors. 2313 */ 2314 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2315 nvme->n_intr_cnt); 2316 2317 /* 2318 * The default is to use one submission queue per completion queue. 2319 */ 2320 if (nvme->n_submission_queues == -1) 2321 nvme->n_submission_queues = nvme->n_completion_queues; 2322 2323 /* 2324 * There is no point in having more compeletion queues than 2325 * submission queues. 2326 */ 2327 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2328 nvme->n_submission_queues); 2329 2330 ASSERT(nvme->n_submission_queues > 0); 2331 ASSERT(nvme->n_completion_queues > 0); 2332 2333 nq.b.nq_nsq = nvme->n_submission_queues - 1; 2334 nq.b.nq_ncq = nvme->n_completion_queues - 1; 2335 2336 ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r, 2337 &nq.r); 2338 2339 if (ret == 0) { 2340 /* 2341 * Never use more than the requested number of queues. 2342 */ 2343 nvme->n_submission_queues = MIN(nvme->n_submission_queues, 2344 nq.b.nq_nsq + 1); 2345 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2346 nq.b.nq_ncq + 1); 2347 } 2348 2349 return (ret); 2350 } 2351 2352 static int 2353 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq) 2354 { 2355 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2356 nvme_create_queue_dw10_t dw10 = { 0 }; 2357 nvme_create_cq_dw11_t c_dw11 = { 0 }; 2358 int ret; 2359 2360 dw10.b.q_qid = cq->ncq_id; 2361 dw10.b.q_qsize = cq->ncq_nentry - 1; 2362 2363 c_dw11.b.cq_pc = 1; 2364 c_dw11.b.cq_ien = 1; 2365 c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt; 2366 2367 cmd->nc_sqid = 0; 2368 cmd->nc_callback = nvme_wakeup_cmd; 2369 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 2370 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2371 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 2372 cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress; 2373 2374 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2375 2376 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2377 dev_err(nvme->n_dip, CE_WARN, 2378 "!CREATE CQUEUE failed with sct = %x, sc = %x", 2379 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2380 } 2381 2382 nvme_free_cmd(cmd); 2383 2384 return (ret); 2385 } 2386 2387 static int 2388 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 2389 { 2390 nvme_cq_t *cq = qp->nq_cq; 2391 nvme_cmd_t *cmd; 2392 nvme_create_queue_dw10_t dw10 = { 0 }; 2393 nvme_create_sq_dw11_t s_dw11 = { 0 }; 2394 int ret; 2395 2396 /* 2397 * It is possible to have more qpairs than completion queues, 2398 * and when the idx > ncq_id, that completion queue is shared 2399 * and has already been created. 
2400 */ 2401 if (idx <= cq->ncq_id && 2402 nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS) 2403 return (DDI_FAILURE); 2404 2405 dw10.b.q_qid = idx; 2406 dw10.b.q_qsize = qp->nq_nentry - 1; 2407 2408 s_dw11.b.sq_pc = 1; 2409 s_dw11.b.sq_cqid = cq->ncq_id; 2410 2411 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2412 cmd->nc_sqid = 0; 2413 cmd->nc_callback = nvme_wakeup_cmd; 2414 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 2415 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2416 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 2417 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 2418 2419 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2420 2421 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2422 dev_err(nvme->n_dip, CE_WARN, 2423 "!CREATE SQUEUE failed with sct = %x, sc = %x", 2424 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2425 } 2426 2427 nvme_free_cmd(cmd); 2428 2429 return (ret); 2430 } 2431 2432 static boolean_t 2433 nvme_reset(nvme_t *nvme, boolean_t quiesce) 2434 { 2435 nvme_reg_csts_t csts; 2436 int i; 2437 2438 nvme_put32(nvme, NVME_REG_CC, 0); 2439 2440 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2441 if (csts.b.csts_rdy == 1) { 2442 nvme_put32(nvme, NVME_REG_CC, 0); 2443 for (i = 0; i != nvme->n_timeout * 10; i++) { 2444 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2445 if (csts.b.csts_rdy == 0) 2446 break; 2447 2448 if (quiesce) 2449 drv_usecwait(50000); 2450 else 2451 delay(drv_usectohz(50000)); 2452 } 2453 } 2454 2455 nvme_put32(nvme, NVME_REG_AQA, 0); 2456 nvme_put32(nvme, NVME_REG_ASQ, 0); 2457 nvme_put32(nvme, NVME_REG_ACQ, 0); 2458 2459 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2460 return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE); 2461 } 2462 2463 static void 2464 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) 2465 { 2466 nvme_reg_cc_t cc; 2467 nvme_reg_csts_t csts; 2468 int i; 2469 2470 ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); 2471 2472 cc.r = nvme_get32(nvme, NVME_REG_CC); 2473 cc.b.cc_shn = mode & 0x3; 2474 nvme_put32(nvme, NVME_REG_CC, cc.r); 2475 2476 for (i = 0; i != 10; i++) { 2477 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2478 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 2479 break; 2480 2481 if (quiesce) 2482 drv_usecwait(100000); 2483 else 2484 delay(drv_usectohz(100000)); 2485 } 2486 } 2487 2488 2489 static void 2490 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 2491 { 2492 /* 2493 * Section 7.7 of the spec describes how to get a unique ID for 2494 * the controller: the vendor ID, the model name and the serial 2495 * number shall be unique when combined. 2496 * 2497 * If a namespace has no EUI64 we use the above and add the hex 2498 * namespace ID to get a unique ID for the namespace. 
2499 */ 2500 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2501 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 2502 2503 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2504 bcopy(nvme->n_idctl->id_serial, serial, 2505 sizeof (nvme->n_idctl->id_serial)); 2506 2507 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2508 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 2509 2510 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X", 2511 nvme->n_idctl->id_vid, model, serial, nsid); 2512 } 2513 2514 static int 2515 nvme_init_ns(nvme_t *nvme, int nsid) 2516 { 2517 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; 2518 nvme_identify_nsid_t *idns; 2519 boolean_t was_ignored; 2520 int last_rp; 2521 2522 ns->ns_nvme = nvme; 2523 2524 if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { 2525 dev_err(nvme->n_dip, CE_WARN, 2526 "!failed to identify namespace %d", nsid); 2527 return (DDI_FAILURE); 2528 } 2529 2530 ns->ns_idns = idns; 2531 ns->ns_id = nsid; 2532 ns->ns_block_count = idns->id_nsize; 2533 ns->ns_block_size = 2534 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2535 ns->ns_best_block_size = ns->ns_block_size; 2536 2537 /* 2538 * Get the EUI64 if present. Use it for devid and device node names. 2539 */ 2540 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2541 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); 2542 2543 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 2544 if (*(uint64_t *)ns->ns_eui64 != 0) { 2545 uint8_t *eui64 = ns->ns_eui64; 2546 2547 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), 2548 "%02x%02x%02x%02x%02x%02x%02x%02x", 2549 eui64[0], eui64[1], eui64[2], eui64[3], 2550 eui64[4], eui64[5], eui64[6], eui64[7]); 2551 } else { 2552 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", 2553 ns->ns_id); 2554 2555 nvme_prepare_devid(nvme, ns->ns_id); 2556 } 2557 2558 /* 2559 * Find the LBA format with no metadata and the best relative 2560 * performance. A value of 3 means "degraded", 0 is best. 2561 */ 2562 last_rp = 3; 2563 for (int j = 0; j <= idns->id_nlbaf; j++) { 2564 if (idns->id_lbaf[j].lbaf_lbads == 0) 2565 break; 2566 if (idns->id_lbaf[j].lbaf_ms != 0) 2567 continue; 2568 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 2569 continue; 2570 last_rp = idns->id_lbaf[j].lbaf_rp; 2571 ns->ns_best_block_size = 2572 1 << idns->id_lbaf[j].lbaf_lbads; 2573 } 2574 2575 if (ns->ns_best_block_size < nvme->n_min_block_size) 2576 ns->ns_best_block_size = nvme->n_min_block_size; 2577 2578 was_ignored = ns->ns_ignore; 2579 2580 /* 2581 * We currently don't support namespaces that use either: 2582 * - protection information 2583 * - illegal block size (< 512) 2584 */ 2585 if (idns->id_dps.dp_pinfo) { 2586 dev_err(nvme->n_dip, CE_WARN, 2587 "!ignoring namespace %d, unsupported feature: " 2588 "pinfo = %d", nsid, idns->id_dps.dp_pinfo); 2589 ns->ns_ignore = B_TRUE; 2590 } else if (ns->ns_block_size < 512) { 2591 dev_err(nvme->n_dip, CE_WARN, 2592 "!ignoring namespace %d, unsupported block size %"PRIu64, 2593 nsid, (uint64_t)ns->ns_block_size); 2594 ns->ns_ignore = B_TRUE; 2595 } else { 2596 ns->ns_ignore = B_FALSE; 2597 } 2598 2599 /* 2600 * Keep a count of namespaces which are attachable. 2601 * See comments in nvme_bd_driveinfo() to understand its effect. 2602 */ 2603 if (was_ignored) { 2604 /* 2605 * Previously ignored, but now not. Count it. 2606 */ 2607 if (!ns->ns_ignore) 2608 nvme->n_namespaces_attachable++; 2609 } else { 2610 /* 2611 * Wasn't ignored previously, but now needs to be. 2612 * Discount it. 
2613 */ 2614 if (ns->ns_ignore) 2615 nvme->n_namespaces_attachable--; 2616 } 2617 2618 return (DDI_SUCCESS); 2619 } 2620 2621 static int 2622 nvme_init(nvme_t *nvme) 2623 { 2624 nvme_reg_cc_t cc = { 0 }; 2625 nvme_reg_aqa_t aqa = { 0 }; 2626 nvme_reg_asq_t asq = { 0 }; 2627 nvme_reg_acq_t acq = { 0 }; 2628 nvme_reg_cap_t cap; 2629 nvme_reg_vs_t vs; 2630 nvme_reg_csts_t csts; 2631 int i = 0; 2632 uint16_t nqueues; 2633 uint_t tq_threads; 2634 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2635 char *vendor, *product; 2636 2637 /* Check controller version */ 2638 vs.r = nvme_get32(nvme, NVME_REG_VS); 2639 nvme->n_version.v_major = vs.b.vs_mjr; 2640 nvme->n_version.v_minor = vs.b.vs_mnr; 2641 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d", 2642 nvme->n_version.v_major, nvme->n_version.v_minor); 2643 2644 if (nvme->n_version.v_major > nvme_version_major) { 2645 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x", 2646 nvme_version_major); 2647 if (nvme->n_strict_version) 2648 goto fail; 2649 } 2650 2651 /* retrieve controller configuration */ 2652 cap.r = nvme_get64(nvme, NVME_REG_CAP); 2653 2654 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 2655 dev_err(nvme->n_dip, CE_WARN, 2656 "!NVM command set not supported by hardware"); 2657 goto fail; 2658 } 2659 2660 nvme->n_nssr_supported = cap.b.cap_nssrs; 2661 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 2662 nvme->n_timeout = cap.b.cap_to; 2663 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 2664 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 2665 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 2666 2667 /* 2668 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 2669 * the base page size of 4k (1<<12), so add 12 here to get the real 2670 * page size value. 2671 */ 2672 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 2673 cap.b.cap_mpsmax + 12); 2674 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 2675 2676 /* 2677 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 2678 */ 2679 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 2680 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2681 2682 /* 2683 * Set up PRP DMA to transfer 1 page-aligned page at a time. 2684 * Maxxfer may be increased after we identified the controller limits. 2685 */ 2686 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 2687 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2688 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 2689 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 2690 2691 /* 2692 * Reset controller if it's still in ready state. 2693 */ 2694 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 2695 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 2696 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2697 nvme->n_dead = B_TRUE; 2698 goto fail; 2699 } 2700 2701 /* 2702 * Create the cq array with one completion queue to be assigned 2703 * to the admin queue pair and a limited number of taskqs (4). 2704 */ 2705 if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) != 2706 DDI_SUCCESS) { 2707 dev_err(nvme->n_dip, CE_WARN, 2708 "!failed to pre-allocate admin completion queue"); 2709 goto fail; 2710 } 2711 /* 2712 * Create the admin queue pair. 
2713 */ 2714 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 2715 != DDI_SUCCESS) { 2716 dev_err(nvme->n_dip, CE_WARN, 2717 "!unable to allocate admin qpair"); 2718 goto fail; 2719 } 2720 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 2721 nvme->n_ioq[0] = nvme->n_adminq; 2722 2723 nvme->n_progress |= NVME_ADMIN_QUEUE; 2724 2725 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2726 "admin-queue-len", nvme->n_admin_queue_len); 2727 2728 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 2729 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 2730 acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress; 2731 2732 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 2733 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 2734 2735 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 2736 nvme_put64(nvme, NVME_REG_ASQ, asq); 2737 nvme_put64(nvme, NVME_REG_ACQ, acq); 2738 2739 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 2740 cc.b.cc_css = 0; /* use NVM command set */ 2741 cc.b.cc_mps = nvme->n_pageshift - 12; 2742 cc.b.cc_shn = 0; /* no shutdown in progress */ 2743 cc.b.cc_en = 1; /* enable controller */ 2744 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 2745 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 2746 2747 nvme_put32(nvme, NVME_REG_CC, cc.r); 2748 2749 /* 2750 * Wait for the controller to become ready. 2751 */ 2752 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2753 if (csts.b.csts_rdy == 0) { 2754 for (i = 0; i != nvme->n_timeout * 10; i++) { 2755 delay(drv_usectohz(50000)); 2756 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2757 2758 if (csts.b.csts_cfs == 1) { 2759 dev_err(nvme->n_dip, CE_WARN, 2760 "!controller fatal status at init"); 2761 ddi_fm_service_impact(nvme->n_dip, 2762 DDI_SERVICE_LOST); 2763 nvme->n_dead = B_TRUE; 2764 goto fail; 2765 } 2766 2767 if (csts.b.csts_rdy == 1) 2768 break; 2769 } 2770 } 2771 2772 if (csts.b.csts_rdy == 0) { 2773 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 2774 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2775 nvme->n_dead = B_TRUE; 2776 goto fail; 2777 } 2778 2779 /* 2780 * Assume an abort command limit of 1. We'll destroy and re-init 2781 * that later when we know the true abort command limit. 2782 */ 2783 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 2784 2785 /* 2786 * Setup initial interrupt for admin queue. 2787 */ 2788 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 2789 != DDI_SUCCESS) && 2790 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 2791 != DDI_SUCCESS) && 2792 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 2793 != DDI_SUCCESS)) { 2794 dev_err(nvme->n_dip, CE_WARN, 2795 "!failed to setup initial interrupt"); 2796 goto fail; 2797 } 2798 2799 /* 2800 * Post an asynchronous event command to catch errors. 2801 * We assume the asynchronous events are supported as required by 2802 * specification (Figure 40 in section 5 of NVMe 1.2). 2803 * However, since at least qemu does not follow the specification, 2804 * we need a mechanism to protect ourselves. 
2805 */ 2806 nvme->n_async_event_supported = B_TRUE; 2807 nvme_async_event(nvme); 2808 2809 /* 2810 * Identify Controller 2811 */ 2812 if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) { 2813 dev_err(nvme->n_dip, CE_WARN, 2814 "!failed to identify controller"); 2815 goto fail; 2816 } 2817 2818 /* 2819 * Get Vendor & Product ID 2820 */ 2821 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2822 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2823 sata_split_model(model, &vendor, &product); 2824 2825 if (vendor == NULL) 2826 nvme->n_vendor = strdup("NVMe"); 2827 else 2828 nvme->n_vendor = strdup(vendor); 2829 2830 nvme->n_product = strdup(product); 2831 2832 /* 2833 * Get controller limits. 2834 */ 2835 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 2836 MIN(nvme->n_admin_queue_len / 10, 2837 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 2838 2839 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2840 "async-event-limit", nvme->n_async_event_limit); 2841 2842 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 2843 2844 /* 2845 * Reinitialize the semaphore with the true abort command limit 2846 * supported by the hardware. It's not necessary to disable interrupts 2847 * as only command aborts use the semaphore, and no commands are 2848 * executed or aborted while we're here. 2849 */ 2850 sema_destroy(&nvme->n_abort_sema); 2851 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 2852 SEMA_DRIVER, NULL); 2853 2854 nvme->n_progress |= NVME_CTRL_LIMITS; 2855 2856 if (nvme->n_idctl->id_mdts == 0) 2857 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 2858 else 2859 nvme->n_max_data_transfer_size = 2860 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 2861 2862 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 2863 2864 /* 2865 * Limit n_max_data_transfer_size to what we can handle in one PRP. 2866 * Chained PRPs are currently unsupported. 2867 * 2868 * This is a no-op on hardware which doesn't support a transfer size 2869 * big enough to require chained PRPs. 2870 */ 2871 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 2872 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 2873 2874 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 2875 2876 /* 2877 * Make sure the minimum/maximum queue entry sizes are not 2878 * larger/smaller than the default. 2879 */ 2880 2881 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 2882 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 2883 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 2884 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 2885 goto fail; 2886 2887 /* 2888 * Check for the presence of a Volatile Write Cache. If present, 2889 * enable or disable based on the value of the property 2890 * volatile-write-cache-enable (default is enabled). 2891 */ 2892 nvme->n_write_cache_present = 2893 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE; 2894 2895 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2896 "volatile-write-cache-present", 2897 nvme->n_write_cache_present ? 1 : 0); 2898 2899 if (!nvme->n_write_cache_present) { 2900 nvme->n_write_cache_enabled = B_FALSE; 2901 } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled) 2902 != 0) { 2903 dev_err(nvme->n_dip, CE_WARN, 2904 "!failed to %sable volatile write cache", 2905 nvme->n_write_cache_enabled ? 
"en" : "dis"); 2906 /* 2907 * Assume the cache is (still) enabled. 2908 */ 2909 nvme->n_write_cache_enabled = B_TRUE; 2910 } 2911 2912 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2913 "volatile-write-cache-enable", 2914 nvme->n_write_cache_enabled ? 1 : 0); 2915 2916 /* 2917 * Assume LBA Range Type feature is supported. If it isn't this 2918 * will be set to B_FALSE by nvme_get_features(). 2919 */ 2920 nvme->n_lba_range_supported = B_TRUE; 2921 2922 /* 2923 * Check support for Autonomous Power State Transition. 2924 */ 2925 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2926 nvme->n_auto_pst_supported = 2927 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE; 2928 2929 /* 2930 * Assume Software Progress Marker feature is supported. If it isn't 2931 * this will be set to B_FALSE by nvme_get_features(). 2932 */ 2933 nvme->n_progress_supported = B_TRUE; 2934 2935 /* 2936 * Identify Namespaces 2937 */ 2938 nvme->n_namespace_count = nvme->n_idctl->id_nn; 2939 2940 if (nvme->n_namespace_count == 0) { 2941 dev_err(nvme->n_dip, CE_WARN, 2942 "!controllers without namespaces are not supported"); 2943 goto fail; 2944 } 2945 2946 if (nvme->n_namespace_count > NVME_MINOR_MAX) { 2947 dev_err(nvme->n_dip, CE_WARN, 2948 "!too many namespaces: %d, limiting to %d\n", 2949 nvme->n_namespace_count, NVME_MINOR_MAX); 2950 nvme->n_namespace_count = NVME_MINOR_MAX; 2951 } 2952 2953 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 2954 nvme->n_namespace_count, KM_SLEEP); 2955 2956 for (i = 0; i != nvme->n_namespace_count; i++) { 2957 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, 2958 NULL); 2959 nvme->n_ns[i].ns_ignore = B_TRUE; 2960 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) 2961 goto fail; 2962 } 2963 2964 /* 2965 * Try to set up MSI/MSI-X interrupts. 2966 */ 2967 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 2968 != 0) { 2969 nvme_release_interrupts(nvme); 2970 2971 nqueues = MIN(UINT16_MAX, ncpus); 2972 2973 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 2974 nqueues) != DDI_SUCCESS) && 2975 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 2976 nqueues) != DDI_SUCCESS)) { 2977 dev_err(nvme->n_dip, CE_WARN, 2978 "!failed to setup MSI/MSI-X interrupts"); 2979 goto fail; 2980 } 2981 } 2982 2983 /* 2984 * Create I/O queue pairs. 2985 */ 2986 2987 if (nvme_set_nqueues(nvme) != 0) { 2988 dev_err(nvme->n_dip, CE_WARN, 2989 "!failed to set number of I/O queues to %d", 2990 nvme->n_intr_cnt); 2991 goto fail; 2992 } 2993 2994 /* 2995 * Reallocate I/O queue array 2996 */ 2997 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 2998 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 2999 (nvme->n_submission_queues + 1), KM_SLEEP); 3000 nvme->n_ioq[0] = nvme->n_adminq; 3001 3002 /* 3003 * There should always be at least as many submission queues 3004 * as completion queues. 3005 */ 3006 ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues); 3007 3008 nvme->n_ioq_count = nvme->n_submission_queues; 3009 3010 nvme->n_io_squeue_len = 3011 MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries); 3012 3013 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len", 3014 nvme->n_io_squeue_len); 3015 3016 /* 3017 * Pre-allocate completion queues. 3018 * When there are the same number of submission and completion 3019 * queues there is no value in having a larger completion 3020 * queue length. 
3021 */ 3022 if (nvme->n_submission_queues == nvme->n_completion_queues) 3023 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 3024 nvme->n_io_squeue_len); 3025 3026 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 3027 nvme->n_max_queue_entries); 3028 3029 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len", 3030 nvme->n_io_cqueue_len); 3031 3032 /* 3033 * Assign an equal number of taskq threads to each completion 3034 * queue, capping the total number of threads to the number 3035 * of CPUs. 3036 */ 3037 tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues; 3038 3039 /* 3040 * In case the calculation above is zero, we need at least one 3041 * thread per completion queue. 3042 */ 3043 tq_threads = MAX(1, tq_threads); 3044 3045 if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1, 3046 nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) { 3047 dev_err(nvme->n_dip, CE_WARN, 3048 "!failed to pre-allocate completion queues"); 3049 goto fail; 3050 } 3051 3052 /* 3053 * If we use fewer completion queues than interrupt vectors, return 3054 * some of the interrupt vectors back to the system. 3055 */ 3056 if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) { 3057 nvme_release_interrupts(nvme); 3058 3059 if (nvme_setup_interrupts(nvme, nvme->n_intr_type, 3060 nvme->n_completion_queues + 1) != DDI_SUCCESS) { 3061 dev_err(nvme->n_dip, CE_WARN, 3062 "!failed to reduce number of interrupts"); 3063 goto fail; 3064 } 3065 } 3066 3067 /* 3068 * Alloc & register I/O queue pairs 3069 */ 3070 3071 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3072 if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len, 3073 &nvme->n_ioq[i], i) != DDI_SUCCESS) { 3074 dev_err(nvme->n_dip, CE_WARN, 3075 "!unable to allocate I/O qpair %d", i); 3076 goto fail; 3077 } 3078 3079 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) { 3080 dev_err(nvme->n_dip, CE_WARN, 3081 "!unable to create I/O qpair %d", i); 3082 goto fail; 3083 } 3084 } 3085 3086 /* 3087 * Post more asynchronous event commands to reduce event reporting 3088 * latency as suggested by the spec. 3089 */ 3090 if (nvme->n_async_event_supported) { 3091 for (i = 1; i != nvme->n_async_event_limit; i++) 3092 nvme_async_event(nvme); 3093 } 3094 3095 return (DDI_SUCCESS); 3096 3097 fail: 3098 (void) nvme_reset(nvme, B_FALSE); 3099 return (DDI_FAILURE); 3100 } 3101
3102 static uint_t 3103 nvme_intr(caddr_t arg1, caddr_t arg2) 3104 { 3105 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 3106 nvme_t *nvme = (nvme_t *)arg1; 3107 int inum = (int)(uintptr_t)arg2; 3108 int ccnt = 0; 3109 int qnum; 3110 3111 if (inum >= nvme->n_intr_cnt) 3112 return (DDI_INTR_UNCLAIMED); 3113 3114 if (nvme->n_dead) 3115 return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ? 3116 DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED); 3117 3118 /* 3119 * The interrupt vector a queue uses is calculated as queue_idx % 3120 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array 3121 * in steps of n_intr_cnt to process all queues using this vector. 3122 */ 3123 for (qnum = inum; 3124 qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL; 3125 qnum += nvme->n_intr_cnt) { 3126 ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]); 3127 } 3128 3129 return (ccnt > 0 ?
DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 3130 } 3131 3132 static void 3133 nvme_release_interrupts(nvme_t *nvme) 3134 { 3135 int i; 3136 3137 for (i = 0; i < nvme->n_intr_cnt; i++) { 3138 if (nvme->n_inth[i] == NULL) 3139 break; 3140 3141 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3142 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 3143 else 3144 (void) ddi_intr_disable(nvme->n_inth[i]); 3145 3146 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 3147 (void) ddi_intr_free(nvme->n_inth[i]); 3148 } 3149 3150 kmem_free(nvme->n_inth, nvme->n_inth_sz); 3151 nvme->n_inth = NULL; 3152 nvme->n_inth_sz = 0; 3153 3154 nvme->n_progress &= ~NVME_INTERRUPTS; 3155 } 3156 3157 static int 3158 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 3159 { 3160 int nintrs, navail, count; 3161 int ret; 3162 int i; 3163 3164 if (nvme->n_intr_types == 0) { 3165 ret = ddi_intr_get_supported_types(nvme->n_dip, 3166 &nvme->n_intr_types); 3167 if (ret != DDI_SUCCESS) { 3168 dev_err(nvme->n_dip, CE_WARN, 3169 "!%s: ddi_intr_get_supported types failed", 3170 __func__); 3171 return (ret); 3172 } 3173 #ifdef __x86 3174 if (get_hwenv() == HW_VMWARE) 3175 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 3176 #endif 3177 } 3178 3179 if ((nvme->n_intr_types & intr_type) == 0) 3180 return (DDI_FAILURE); 3181 3182 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 3183 if (ret != DDI_SUCCESS) { 3184 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 3185 __func__); 3186 return (ret); 3187 } 3188 3189 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 3190 if (ret != DDI_SUCCESS) { 3191 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 3192 __func__); 3193 return (ret); 3194 } 3195 3196 /* We want at most one interrupt per queue pair. 
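navail is therefore clamped to nqpairs below; ddi_intr_alloc() may still return fewer vectors, and whatever count it does return becomes n_intr_cnt.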
*/ 3197 if (navail > nqpairs) 3198 navail = nqpairs; 3199 3200 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 3201 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 3202 3203 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 3204 &count, 0); 3205 if (ret != DDI_SUCCESS) { 3206 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 3207 __func__); 3208 goto fail; 3209 } 3210 3211 nvme->n_intr_cnt = count; 3212 3213 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 3214 if (ret != DDI_SUCCESS) { 3215 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 3216 __func__); 3217 goto fail; 3218 } 3219 3220 for (i = 0; i < count; i++) { 3221 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 3222 (void *)nvme, (void *)(uintptr_t)i); 3223 if (ret != DDI_SUCCESS) { 3224 dev_err(nvme->n_dip, CE_WARN, 3225 "!%s: ddi_intr_add_handler failed", __func__); 3226 goto fail; 3227 } 3228 } 3229 3230 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 3231 3232 for (i = 0; i < count; i++) { 3233 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3234 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 3235 else 3236 ret = ddi_intr_enable(nvme->n_inth[i]); 3237 3238 if (ret != DDI_SUCCESS) { 3239 dev_err(nvme->n_dip, CE_WARN, 3240 "!%s: enabling interrupt %d failed", __func__, i); 3241 goto fail; 3242 } 3243 } 3244 3245 nvme->n_intr_type = intr_type; 3246 3247 nvme->n_progress |= NVME_INTERRUPTS; 3248 3249 return (DDI_SUCCESS); 3250 3251 fail: 3252 nvme_release_interrupts(nvme); 3253 3254 return (ret); 3255 } 3256 3257 static int 3258 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 3259 { 3260 _NOTE(ARGUNUSED(arg)); 3261 3262 pci_ereport_post(dip, fm_error, NULL); 3263 return (fm_error->fme_status); 3264 } 3265 3266 static int 3267 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3268 { 3269 nvme_t *nvme; 3270 int instance; 3271 int nregs; 3272 off_t regsize; 3273 int i; 3274 char name[32]; 3275 3276 if (cmd != DDI_ATTACH) 3277 return (DDI_FAILURE); 3278 3279 instance = ddi_get_instance(dip); 3280 3281 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 3282 return (DDI_FAILURE); 3283 3284 nvme = ddi_get_soft_state(nvme_state, instance); 3285 ddi_set_driver_private(dip, nvme); 3286 nvme->n_dip = dip; 3287 3288 mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); 3289 3290 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3291 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 3292 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 3293 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 3294 B_TRUE : B_FALSE; 3295 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3296 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 3297 nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3298 DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN); 3299 /* 3300 * Double up the default for completion queues in case of 3301 * queue sharing. 3302 */ 3303 nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3304 DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN); 3305 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3306 DDI_PROP_DONTPASS, "async-event-limit", 3307 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 3308 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3309 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 
3310 B_TRUE : B_FALSE; 3311 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3312 DDI_PROP_DONTPASS, "min-phys-block-size", 3313 NVME_DEFAULT_MIN_BLOCK_SIZE); 3314 nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3315 DDI_PROP_DONTPASS, "max-submission-queues", -1); 3316 nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3317 DDI_PROP_DONTPASS, "max-completion-queues", -1); 3318 3319 if (!ISP2(nvme->n_min_block_size) || 3320 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 3321 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 3322 "using default %d", ISP2(nvme->n_min_block_size) ? 3323 "too low" : "not a power of 2", 3324 NVME_DEFAULT_MIN_BLOCK_SIZE); 3325 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 3326 } 3327 3328 if (nvme->n_submission_queues != -1 && 3329 (nvme->n_submission_queues < 1 || 3330 nvme->n_submission_queues > UINT16_MAX)) { 3331 dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not " 3332 "valid. Must be [1..%d]", nvme->n_submission_queues, 3333 UINT16_MAX); 3334 nvme->n_submission_queues = -1; 3335 } 3336 3337 if (nvme->n_completion_queues != -1 && 3338 (nvme->n_completion_queues < 1 || 3339 nvme->n_completion_queues > UINT16_MAX)) { 3340 dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not " 3341 "valid. Must be [1..%d]", nvme->n_completion_queues, 3342 UINT16_MAX); 3343 nvme->n_completion_queues = -1; 3344 } 3345 3346 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 3347 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 3348 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 3349 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 3350 3351 if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN) 3352 nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN; 3353 if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN) 3354 nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN; 3355 3356 if (nvme->n_async_event_limit < 1) 3357 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 3358 3359 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 3360 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 3361 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 3362 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 3363 3364 /* 3365 * Setup FMA support. 3366 */ 3367 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 3368 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 3369 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 3370 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 3371 3372 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 3373 3374 if (nvme->n_fm_cap) { 3375 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 3376 nvme->n_reg_acc_attr.devacc_attr_access = 3377 DDI_FLAGERR_ACC; 3378 3379 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 3380 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3381 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3382 } 3383 3384 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3385 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3386 pci_ereport_setup(dip); 3387 3388 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3389 ddi_fm_handler_register(dip, nvme_fm_errcb, 3390 (void *)nvme); 3391 } 3392 3393 nvme->n_progress |= NVME_FMA_INIT; 3394 3395 /* 3396 * The spec defines several register sets. Only the controller 3397 * registers (set 1) are currently used. 
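 *
 * On PCI, register set 0 describes the configuration space, so set 1 is
 * the first BAR, which is where the NVMe controller registers and
 * doorbells live; hence the nregs < 2 check and the use of index 1 below.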
3398 */ 3399 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 3400 nregs < 2 || 3401 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 3402 goto fail; 3403 3404 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 3405 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 3406 dev_err(dip, CE_WARN, "!failed to map regset 1"); 3407 goto fail; 3408 } 3409 3410 nvme->n_progress |= NVME_REGS_MAPPED; 3411 3412 /* 3413 * Create PRP DMA cache 3414 */ 3415 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 3416 ddi_driver_name(dip), ddi_get_instance(dip)); 3417 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 3418 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 3419 NULL, (void *)nvme, NULL, 0); 3420 3421 if (nvme_init(nvme) != DDI_SUCCESS) 3422 goto fail; 3423 3424 /* 3425 * Initialize the driver with the UFM subsystem 3426 */ 3427 if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops, 3428 &nvme->n_ufmh, nvme) != 0) { 3429 dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem"); 3430 goto fail; 3431 } 3432 mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL); 3433 ddi_ufm_update(nvme->n_ufmh); 3434 nvme->n_progress |= NVME_UFM_INIT; 3435 3436 /* 3437 * Attach the blkdev driver for each namespace. 3438 */ 3439 for (i = 0; i != nvme->n_namespace_count; i++) { 3440 if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name, 3441 S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1), 3442 DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { 3443 dev_err(dip, CE_WARN, 3444 "!failed to create minor node for namespace %d", i); 3445 goto fail; 3446 } 3447 3448 if (nvme->n_ns[i].ns_ignore) 3449 continue; 3450 3451 nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i], 3452 &nvme_bd_ops, &nvme->n_prp_dma_attr, KM_SLEEP); 3453 3454 if (nvme->n_ns[i].ns_bd_hdl == NULL) { 3455 dev_err(dip, CE_WARN, 3456 "!failed to get blkdev handle for namespace %d", i); 3457 goto fail; 3458 } 3459 3460 if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl) 3461 != DDI_SUCCESS) { 3462 dev_err(dip, CE_WARN, 3463 "!failed to attach blkdev handle for namespace %d", 3464 i); 3465 goto fail; 3466 } 3467 } 3468 3469 if (ddi_create_minor_node(dip, "devctl", S_IFCHR, 3470 NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) 3471 != DDI_SUCCESS) { 3472 dev_err(dip, CE_WARN, "nvme_attach: " 3473 "cannot create devctl minor node"); 3474 goto fail; 3475 } 3476 3477 return (DDI_SUCCESS); 3478 3479 fail: 3480 /* attach successful anyway so that FMA can retire the device */ 3481 if (nvme->n_dead) 3482 return (DDI_SUCCESS); 3483 3484 (void) nvme_detach(dip, DDI_DETACH); 3485 3486 return (DDI_FAILURE); 3487 } 3488 3489 static int 3490 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3491 { 3492 int instance, i; 3493 nvme_t *nvme; 3494 3495 if (cmd != DDI_DETACH) 3496 return (DDI_FAILURE); 3497 3498 instance = ddi_get_instance(dip); 3499 3500 nvme = ddi_get_soft_state(nvme_state, instance); 3501 3502 if (nvme == NULL) 3503 return (DDI_FAILURE); 3504 3505 ddi_remove_minor_node(dip, "devctl"); 3506 mutex_destroy(&nvme->n_minor.nm_mutex); 3507 3508 if (nvme->n_ns) { 3509 for (i = 0; i != nvme->n_namespace_count; i++) { 3510 ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name); 3511 mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex); 3512 3513 if (nvme->n_ns[i].ns_bd_hdl) { 3514 (void) bd_detach_handle( 3515 nvme->n_ns[i].ns_bd_hdl); 3516 bd_free_handle(nvme->n_ns[i].ns_bd_hdl); 3517 } 3518 3519 if (nvme->n_ns[i].ns_idns) 3520 kmem_free(nvme->n_ns[i].ns_idns, 3521 sizeof 
(nvme_identify_nsid_t)); 3522 if (nvme->n_ns[i].ns_devid) 3523 strfree(nvme->n_ns[i].ns_devid); 3524 } 3525 3526 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * 3527 nvme->n_namespace_count); 3528 } 3529 if (nvme->n_progress & NVME_UFM_INIT) { 3530 ddi_ufm_fini(nvme->n_ufmh); 3531 mutex_destroy(&nvme->n_fwslot_mutex); 3532 } 3533 3534 if (nvme->n_progress & NVME_INTERRUPTS) 3535 nvme_release_interrupts(nvme); 3536 3537 for (i = 0; i < nvme->n_cq_count; i++) { 3538 if (nvme->n_cq[i]->ncq_cmd_taskq != NULL) 3539 taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq); 3540 } 3541 3542 if (nvme->n_ioq_count > 0) { 3543 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3544 if (nvme->n_ioq[i] != NULL) { 3545 /* TODO: send destroy queue commands */ 3546 nvme_free_qpair(nvme->n_ioq[i]); 3547 } 3548 } 3549 3550 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * 3551 (nvme->n_ioq_count + 1)); 3552 } 3553 3554 if (nvme->n_prp_cache != NULL) { 3555 kmem_cache_destroy(nvme->n_prp_cache); 3556 } 3557 3558 if (nvme->n_progress & NVME_REGS_MAPPED) { 3559 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE); 3560 (void) nvme_reset(nvme, B_FALSE); 3561 } 3562 3563 if (nvme->n_progress & NVME_CTRL_LIMITS) 3564 sema_destroy(&nvme->n_abort_sema); 3565 3566 if (nvme->n_progress & NVME_ADMIN_QUEUE) 3567 nvme_free_qpair(nvme->n_adminq); 3568 3569 if (nvme->n_cq_count > 0) { 3570 nvme_destroy_cq_array(nvme, 0); 3571 nvme->n_cq = NULL; 3572 nvme->n_cq_count = 0; 3573 } 3574 3575 if (nvme->n_idctl) 3576 kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE); 3577 3578 if (nvme->n_progress & NVME_REGS_MAPPED) 3579 ddi_regs_map_free(&nvme->n_regh); 3580 3581 if (nvme->n_progress & NVME_FMA_INIT) { 3582 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3583 ddi_fm_handler_unregister(nvme->n_dip); 3584 3585 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3586 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3587 pci_ereport_teardown(nvme->n_dip); 3588 3589 ddi_fm_fini(nvme->n_dip); 3590 } 3591 3592 if (nvme->n_vendor != NULL) 3593 strfree(nvme->n_vendor); 3594 3595 if (nvme->n_product != NULL) 3596 strfree(nvme->n_product); 3597 3598 ddi_soft_state_free(nvme_state, instance); 3599 3600 return (DDI_SUCCESS); 3601 } 3602 3603 static int 3604 nvme_quiesce(dev_info_t *dip) 3605 { 3606 int instance; 3607 nvme_t *nvme; 3608 3609 instance = ddi_get_instance(dip); 3610 3611 nvme = ddi_get_soft_state(nvme_state, instance); 3612 3613 if (nvme == NULL) 3614 return (DDI_FAILURE); 3615 3616 nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE); 3617 3618 (void) nvme_reset(nvme, B_TRUE); 3619 3620 return (DDI_FAILURE); 3621 } 3622 3623 static int 3624 nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer) 3625 { 3626 nvme_t *nvme = cmd->nc_nvme; 3627 int nprp_page, nprp; 3628 uint64_t *prp; 3629 3630 if (xfer->x_ndmac == 0) 3631 return (DDI_FAILURE); 3632 3633 cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress; 3634 3635 if (xfer->x_ndmac == 1) { 3636 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 3637 return (DDI_SUCCESS); 3638 } else if (xfer->x_ndmac == 2) { 3639 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); 3640 cmd->nc_sqe.sqe_dptr.d_prp[1] = xfer->x_dmac.dmac_laddress; 3641 return (DDI_SUCCESS); 3642 } 3643 3644 xfer->x_ndmac--; 3645 3646 nprp_page = nvme->n_pagesize / sizeof (uint64_t); 3647 ASSERT(nprp_page > 0); 3648 nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page; 3649 3650 /* 3651 * We currently don't support chained PRPs and set up our DMA 3652 * attributes to reflect that. If we still get an I/O request 3653 * that needs a chained PRP something is very wrong. 
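 *
 * For example, with a 4k page size nvme_init() caps the transfer size at
 * (4096 / 8) * 4096 = 2MB, which is exactly what one PRP list page can
 * describe, so nprp can never end up greater than 1 here.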
3654 */ 3655 VERIFY(nprp == 1); 3656 3657 cmd->nc_dma = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP); 3658 bzero(cmd->nc_dma->nd_memp, cmd->nc_dma->nd_len); 3659 3660 cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress; 3661 3662 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 3663 for (prp = (uint64_t *)cmd->nc_dma->nd_memp; 3664 xfer->x_ndmac > 0; 3665 prp++, xfer->x_ndmac--) { 3666 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); 3667 *prp = xfer->x_dmac.dmac_laddress; 3668 } 3669 3670 (void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len, 3671 DDI_DMA_SYNC_FORDEV); 3672 return (DDI_SUCCESS); 3673 } 3674 3675 static nvme_cmd_t * 3676 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) 3677 { 3678 nvme_t *nvme = ns->ns_nvme; 3679 nvme_cmd_t *cmd; 3680 3681 /* 3682 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. 3683 */ 3684 cmd = nvme_alloc_cmd(nvme, (xfer->x_flags & BD_XFER_POLL) ? 3685 KM_NOSLEEP : KM_SLEEP); 3686 3687 if (cmd == NULL) 3688 return (NULL); 3689 3690 cmd->nc_sqe.sqe_opc = opc; 3691 cmd->nc_callback = nvme_bd_xfer_done; 3692 cmd->nc_xfer = xfer; 3693 3694 switch (opc) { 3695 case NVME_OPC_NVM_WRITE: 3696 case NVME_OPC_NVM_READ: 3697 VERIFY(xfer->x_nblks <= 0x10000); 3698 3699 cmd->nc_sqe.sqe_nsid = ns->ns_id; 3700 3701 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; 3702 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); 3703 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); 3704 3705 if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS) 3706 goto fail; 3707 break; 3708 3709 case NVME_OPC_NVM_FLUSH: 3710 cmd->nc_sqe.sqe_nsid = ns->ns_id; 3711 break; 3712 3713 default: 3714 goto fail; 3715 } 3716 3717 return (cmd); 3718 3719 fail: 3720 nvme_free_cmd(cmd); 3721 return (NULL); 3722 } 3723 3724 static void 3725 nvme_bd_xfer_done(void *arg) 3726 { 3727 nvme_cmd_t *cmd = arg; 3728 bd_xfer_t *xfer = cmd->nc_xfer; 3729 int error = 0; 3730 3731 error = nvme_check_cmd_status(cmd); 3732 nvme_free_cmd(cmd); 3733 3734 bd_xfer_done(xfer, error); 3735 } 3736 3737 static void 3738 nvme_bd_driveinfo(void *arg, bd_drive_t *drive) 3739 { 3740 nvme_namespace_t *ns = arg; 3741 nvme_t *nvme = ns->ns_nvme; 3742 uint_t ns_count = MAX(1, nvme->n_namespaces_attachable); 3743 3744 /* 3745 * Set the blkdev qcount to the number of submission queues. 3746 * It will then create one waitq/runq pair for each submission 3747 * queue and spread I/O requests across the queues. 3748 */ 3749 drive->d_qcount = nvme->n_ioq_count; 3750 3751 /* 3752 * I/O activity to individual namespaces is distributed across 3753 * each of the d_qcount blkdev queues (which has been set to 3754 * the number of nvme submission queues). d_qsize is the number 3755 * of submitted and not completed I/Os within each queue that blkdev 3756 * will allow before it starts holding them in the waitq. 3757 * 3758 * Each namespace will create a child blkdev instance, for each one 3759 * we try and set the d_qsize so that each namespace gets an 3760 * equal portion of the submission queue. 3761 * 3762 * If post instantiation of the nvme drive, n_namespaces_attachable 3763 * changes and a namespace is attached it could calculate a 3764 * different d_qsize. It may even be that the sum of the d_qsizes is 3765 * now beyond the submission queue size. Should that be the case 3766 * and the I/O rate is such that blkdev attempts to submit more 3767 * I/Os than the size of the submission queue, the excess I/Os 3768 * will be held behind the semaphore nq_sema. 
3769 */ 3770 drive->d_qsize = nvme->n_io_squeue_len / ns_count; 3771 3772 /* 3773 * Don't let the queue size drop below the minimum, though. 3774 */ 3775 drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN); 3776 3777 /* 3778 * d_maxxfer is not set, which means the value is taken from the DMA 3779 * attributes specified to bd_alloc_handle. 3780 */ 3781 3782 drive->d_removable = B_FALSE; 3783 drive->d_hotpluggable = B_FALSE; 3784 3785 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64)); 3786 drive->d_target = ns->ns_id; 3787 drive->d_lun = 0; 3788 3789 drive->d_model = nvme->n_idctl->id_model; 3790 drive->d_model_len = sizeof (nvme->n_idctl->id_model); 3791 drive->d_vendor = nvme->n_vendor; 3792 drive->d_vendor_len = strlen(nvme->n_vendor); 3793 drive->d_product = nvme->n_product; 3794 drive->d_product_len = strlen(nvme->n_product); 3795 drive->d_serial = nvme->n_idctl->id_serial; 3796 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial); 3797 drive->d_revision = nvme->n_idctl->id_fwrev; 3798 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev); 3799 } 3800 3801 static int 3802 nvme_bd_mediainfo(void *arg, bd_media_t *media) 3803 { 3804 nvme_namespace_t *ns = arg; 3805 3806 media->m_nblks = ns->ns_block_count; 3807 media->m_blksize = ns->ns_block_size; 3808 media->m_readonly = B_FALSE; 3809 media->m_solidstate = B_TRUE; 3810 3811 media->m_pblksize = ns->ns_best_block_size; 3812 3813 return (0); 3814 } 3815 3816 static int 3817 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) 3818 { 3819 nvme_t *nvme = ns->ns_nvme; 3820 nvme_cmd_t *cmd; 3821 nvme_qpair_t *ioq; 3822 boolean_t poll; 3823 int ret; 3824 3825 if (nvme->n_dead) 3826 return (EIO); 3827 3828 cmd = nvme_create_nvm_cmd(ns, opc, xfer); 3829 if (cmd == NULL) 3830 return (ENOMEM); 3831 3832 cmd->nc_sqid = xfer->x_qnum + 1; 3833 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 3834 ioq = nvme->n_ioq[cmd->nc_sqid]; 3835 3836 /* 3837 * Get the polling flag before submitting the command. The command may 3838 * complete immediately after it was submitted, which means we must 3839 * treat both cmd and xfer as if they have been freed already. 3840 */ 3841 poll = (xfer->x_flags & BD_XFER_POLL) != 0; 3842 3843 ret = nvme_submit_io_cmd(ioq, cmd); 3844 3845 if (ret != 0) 3846 return (ret); 3847 3848 if (!poll) 3849 return (0); 3850 3851 do { 3852 cmd = nvme_retrieve_cmd(nvme, ioq); 3853 if (cmd != NULL) 3854 cmd->nc_callback(cmd); 3855 else 3856 drv_usecwait(10); 3857 } while (ioq->nq_active_cmds != 0); 3858 3859 return (0); 3860 } 3861 3862 static int 3863 nvme_bd_read(void *arg, bd_xfer_t *xfer) 3864 { 3865 nvme_namespace_t *ns = arg; 3866 3867 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ)); 3868 } 3869 3870 static int 3871 nvme_bd_write(void *arg, bd_xfer_t *xfer) 3872 { 3873 nvme_namespace_t *ns = arg; 3874 3875 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE)); 3876 } 3877 3878 static int 3879 nvme_bd_sync(void *arg, bd_xfer_t *xfer) 3880 { 3881 nvme_namespace_t *ns = arg; 3882 3883 if (ns->ns_nvme->n_dead) 3884 return (EIO); 3885 3886 /* 3887 * If the volatile write cache is not present or not enabled the FLUSH 3888 * command is a no-op, so we can take a shortcut here. 
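 *
 * The two cases are reported differently below: a missing write cache
 * completes the transfer with ENOTSUP so callers can tell that cache
 * synchronization is not supported, while a present but disabled cache
 * completes it successfully, as there is nothing to flush.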
3889 */ 3890 if (!ns->ns_nvme->n_write_cache_present) { 3891 bd_xfer_done(xfer, ENOTSUP); 3892 return (0); 3893 } 3894 3895 if (!ns->ns_nvme->n_write_cache_enabled) { 3896 bd_xfer_done(xfer, 0); 3897 return (0); 3898 } 3899 3900 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH)); 3901 } 3902 3903 static int 3904 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) 3905 { 3906 nvme_namespace_t *ns = arg; 3907 3908 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 3909 if (*(uint64_t *)ns->ns_eui64 != 0) { 3910 return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN, 3911 sizeof (ns->ns_eui64), ns->ns_eui64, devid)); 3912 } else { 3913 return (ddi_devid_init(devinfo, DEVID_ENCAP, 3914 strlen(ns->ns_devid), ns->ns_devid, devid)); 3915 } 3916 } 3917 3918 static int 3919 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 3920 { 3921 #ifndef __lock_lint 3922 _NOTE(ARGUNUSED(cred_p)); 3923 #endif 3924 minor_t minor = getminor(*devp); 3925 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 3926 int nsid = NVME_MINOR_NSID(minor); 3927 nvme_minor_state_t *nm; 3928 int rv = 0; 3929 3930 if (otyp != OTYP_CHR) 3931 return (EINVAL); 3932 3933 if (nvme == NULL) 3934 return (ENXIO); 3935 3936 if (nsid > nvme->n_namespace_count) 3937 return (ENXIO); 3938 3939 if (nvme->n_dead) 3940 return (EIO); 3941 3942 nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 3943 3944 mutex_enter(&nm->nm_mutex); 3945 if (nm->nm_oexcl) { 3946 rv = EBUSY; 3947 goto out; 3948 } 3949 3950 if (flag & FEXCL) { 3951 if (nm->nm_ocnt != 0) { 3952 rv = EBUSY; 3953 goto out; 3954 } 3955 nm->nm_oexcl = B_TRUE; 3956 } 3957 3958 nm->nm_ocnt++; 3959 3960 out: 3961 mutex_exit(&nm->nm_mutex); 3962 return (rv); 3963 3964 } 3965 3966 static int 3967 nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) 3968 { 3969 #ifndef __lock_lint 3970 _NOTE(ARGUNUSED(cred_p)); 3971 _NOTE(ARGUNUSED(flag)); 3972 #endif 3973 minor_t minor = getminor(dev); 3974 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 3975 int nsid = NVME_MINOR_NSID(minor); 3976 nvme_minor_state_t *nm; 3977 3978 if (otyp != OTYP_CHR) 3979 return (ENXIO); 3980 3981 if (nvme == NULL) 3982 return (ENXIO); 3983 3984 if (nsid > nvme->n_namespace_count) 3985 return (ENXIO); 3986 3987 nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 3988 3989 mutex_enter(&nm->nm_mutex); 3990 if (nm->nm_oexcl) 3991 nm->nm_oexcl = B_FALSE; 3992 3993 ASSERT(nm->nm_ocnt > 0); 3994 nm->nm_ocnt--; 3995 mutex_exit(&nm->nm_mutex); 3996 3997 return (0); 3998 } 3999 4000 static int 4001 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4002 cred_t *cred_p) 4003 { 4004 _NOTE(ARGUNUSED(cred_p)); 4005 int rv = 0; 4006 void *idctl; 4007 4008 if ((mode & FREAD) == 0) 4009 return (EPERM); 4010 4011 if (nioc->n_len < NVME_IDENTIFY_BUFSIZE) 4012 return (EINVAL); 4013 4014 if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0) 4015 return (rv); 4016 4017 if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode) 4018 != 0) 4019 rv = EFAULT; 4020 4021 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); 4022 4023 return (rv); 4024 } 4025 4026 /* 4027 * Execute commands on behalf of the various ioctls. 
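 *
 * As an illustrative sketch of a typical admin invocation (the real
 * callers are the firmware ioctls further below):
 *
 *	nvme_sqe_t sqe = { .sqe_opc = NVME_OPC_FW_ACTIVATE };
 *	nvme_cqe_t cqe = { 0 };
 *
 *	rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe,
 *	    nvme_admin_cmd_timeout);
 *
 * Data, if any, is passed via a caller buffer together with FREAD
 * and/or FWRITE in the rwk argument and is copied in/out around command
 * execution.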
4028 */ 4029 static int 4030 nvme_ioc_cmd(nvme_t *nvme, nvme_sqe_t *sqe, boolean_t is_admin, void *data_addr, 4031 uint32_t data_len, int rwk, nvme_cqe_t *cqe, uint_t timeout) 4032 { 4033 nvme_cmd_t *cmd; 4034 nvme_qpair_t *ioq; 4035 int rv = 0; 4036 4037 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 4038 if (is_admin) { 4039 cmd->nc_sqid = 0; 4040 ioq = nvme->n_adminq; 4041 } else { 4042 cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1; 4043 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 4044 ioq = nvme->n_ioq[cmd->nc_sqid]; 4045 } 4046 4047 cmd->nc_callback = nvme_wakeup_cmd; 4048 cmd->nc_sqe = *sqe; 4049 4050 if ((rwk & (FREAD | FWRITE)) != 0) { 4051 if (data_addr == NULL) { 4052 rv = EINVAL; 4053 goto free_cmd; 4054 } 4055 4056 /* 4057 * Because we use PRPs and haven't implemented PRP 4058 * lists here, the maximum data size is restricted to 4059 * 2 pages. 4060 */ 4061 if (data_len > 2 * nvme->n_pagesize) { 4062 dev_err(nvme->n_dip, CE_WARN, "!Data size %u is too " 4063 "large for nvme_ioc_cmd(). Limit is 2 pages " 4064 "(%u bytes)", data_len, 2 * nvme->n_pagesize); 4065 4066 rv = EINVAL; 4067 goto free_cmd; 4068 } 4069 4070 if (nvme_zalloc_dma(nvme, data_len, DDI_DMA_READ, 4071 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 4072 dev_err(nvme->n_dip, CE_WARN, 4073 "!nvme_zalloc_dma failed for nvme_ioc_cmd()"); 4074 4075 rv = ENOMEM; 4076 goto free_cmd; 4077 } 4078 4079 if (cmd->nc_dma->nd_ncookie > 2) { 4080 dev_err(nvme->n_dip, CE_WARN, 4081 "!too many DMA cookies for nvme_ioc_cmd()"); 4082 atomic_inc_32(&nvme->n_too_many_cookies); 4083 4084 rv = E2BIG; 4085 goto free_cmd; 4086 } 4087 4088 cmd->nc_sqe.sqe_dptr.d_prp[0] = 4089 cmd->nc_dma->nd_cookie.dmac_laddress; 4090 4091 if (cmd->nc_dma->nd_ncookie > 1) { 4092 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 4093 &cmd->nc_dma->nd_cookie); 4094 cmd->nc_sqe.sqe_dptr.d_prp[1] = 4095 cmd->nc_dma->nd_cookie.dmac_laddress; 4096 } 4097 4098 if ((rwk & FWRITE) != 0) { 4099 if (ddi_copyin(data_addr, cmd->nc_dma->nd_memp, 4100 data_len, rwk & FKIOCTL) != 0) { 4101 rv = EFAULT; 4102 goto free_cmd; 4103 } 4104 } 4105 } 4106 4107 if (is_admin) { 4108 nvme_admin_cmd(cmd, timeout); 4109 } else { 4110 mutex_enter(&cmd->nc_mutex); 4111 4112 rv = nvme_submit_io_cmd(ioq, cmd); 4113 4114 if (rv == EAGAIN) { 4115 mutex_exit(&cmd->nc_mutex); 4116 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 4117 "!nvme_ioc_cmd() failed, I/O Q full"); 4118 goto free_cmd; 4119 } 4120 4121 nvme_wait_cmd(cmd, timeout); 4122 4123 mutex_exit(&cmd->nc_mutex); 4124 } 4125 4126 if (cqe != NULL) 4127 *cqe = cmd->nc_cqe; 4128 4129 if ((rv = nvme_check_cmd_status(cmd)) != 0) { 4130 dev_err(nvme->n_dip, CE_WARN, 4131 "!nvme_ioc_cmd() failed with sct = %x, sc = %x", 4132 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 4133 4134 goto free_cmd; 4135 } 4136 4137 if ((rwk & FREAD) != 0) { 4138 if (ddi_copyout(cmd->nc_dma->nd_memp, 4139 data_addr, data_len, rwk & FKIOCTL) != 0) 4140 rv = EFAULT; 4141 } 4142 4143 free_cmd: 4144 nvme_free_cmd(cmd); 4145 4146 return (rv); 4147 } 4148 4149 static int 4150 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4151 int mode, cred_t *cred_p) 4152 { 4153 _NOTE(ARGUNUSED(nsid, cred_p)); 4154 int rv = 0; 4155 nvme_reg_cap_t cap = { 0 }; 4156 nvme_capabilities_t nc; 4157 4158 if ((mode & FREAD) == 0) 4159 return (EPERM); 4160 4161 if (nioc->n_len < sizeof (nc)) 4162 return (EINVAL); 4163 4164 cap.r = nvme_get64(nvme, NVME_REG_CAP); 4165 4166 /* 4167 * The MPSMIN and MPSMAX fields in the CAP register use 0 to 4168 * specify the base page size of 
4k (1<<12), so add 12 here to 4169 * get the real page size value. 4170 */ 4171 nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax); 4172 nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin); 4173 4174 if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0) 4175 rv = EFAULT; 4176 4177 return (rv); 4178 } 4179 4180 static int 4181 nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4182 int mode, cred_t *cred_p) 4183 { 4184 _NOTE(ARGUNUSED(cred_p)); 4185 void *log = NULL; 4186 size_t bufsize = 0; 4187 int rv = 0; 4188 4189 if ((mode & FREAD) == 0) 4190 return (EPERM); 4191 4192 switch (nioc->n_arg) { 4193 case NVME_LOGPAGE_ERROR: 4194 if (nsid != 0) 4195 return (EINVAL); 4196 break; 4197 case NVME_LOGPAGE_HEALTH: 4198 if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0) 4199 return (EINVAL); 4200 4201 if (nsid == 0) 4202 nsid = (uint32_t)-1; 4203 4204 break; 4205 case NVME_LOGPAGE_FWSLOT: 4206 if (nsid != 0) 4207 return (EINVAL); 4208 break; 4209 default: 4210 return (EINVAL); 4211 } 4212 4213 if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid) 4214 != DDI_SUCCESS) 4215 return (EIO); 4216 4217 if (nioc->n_len < bufsize) { 4218 kmem_free(log, bufsize); 4219 return (EINVAL); 4220 } 4221 4222 if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0) 4223 rv = EFAULT; 4224 4225 nioc->n_len = bufsize; 4226 kmem_free(log, bufsize); 4227 4228 return (rv); 4229 } 4230 4231 static int 4232 nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4233 int mode, cred_t *cred_p) 4234 { 4235 _NOTE(ARGUNUSED(cred_p)); 4236 void *buf = NULL; 4237 size_t bufsize = 0; 4238 uint32_t res = 0; 4239 uint8_t feature; 4240 int rv = 0; 4241 4242 if ((mode & FREAD) == 0) 4243 return (EPERM); 4244 4245 if ((nioc->n_arg >> 32) > 0xff) 4246 return (EINVAL); 4247 4248 feature = (uint8_t)(nioc->n_arg >> 32); 4249 4250 switch (feature) { 4251 case NVME_FEAT_ARBITRATION: 4252 case NVME_FEAT_POWER_MGMT: 4253 case NVME_FEAT_ERROR: 4254 case NVME_FEAT_NQUEUES: 4255 case NVME_FEAT_INTR_COAL: 4256 case NVME_FEAT_WRITE_ATOM: 4257 case NVME_FEAT_ASYNC_EVENT: 4258 case NVME_FEAT_PROGRESS: 4259 if (nsid != 0) 4260 return (EINVAL); 4261 break; 4262 4263 case NVME_FEAT_TEMPERATURE: 4264 if (nsid != 0) 4265 return (EINVAL); 4266 res = nioc->n_arg & 0xffffffffUL; 4267 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) { 4268 nvme_temp_threshold_t tt; 4269 4270 tt.r = res; 4271 if (tt.b.tt_thsel != NVME_TEMP_THRESH_OVER && 4272 tt.b.tt_thsel != NVME_TEMP_THRESH_UNDER) { 4273 return (EINVAL); 4274 } 4275 4276 if (tt.b.tt_tmpsel > NVME_TEMP_THRESH_MAX_SENSOR) { 4277 return (EINVAL); 4278 } 4279 } else if (res != 0) { 4280 return (EINVAL); 4281 } 4282 break; 4283 4284 case NVME_FEAT_INTR_VECT: 4285 if (nsid != 0) 4286 return (EINVAL); 4287 4288 res = nioc->n_arg & 0xffffffffUL; 4289 if (res >= nvme->n_intr_cnt) 4290 return (EINVAL); 4291 break; 4292 4293 case NVME_FEAT_LBA_RANGE: 4294 if (nvme->n_lba_range_supported == B_FALSE) 4295 return (EINVAL); 4296 4297 if (nsid == 0 || 4298 nsid > nvme->n_namespace_count) 4299 return (EINVAL); 4300 4301 break; 4302 4303 case NVME_FEAT_WRITE_CACHE: 4304 if (nsid != 0) 4305 return (EINVAL); 4306 4307 if (!nvme->n_write_cache_present) 4308 return (EINVAL); 4309 4310 break; 4311 4312 case NVME_FEAT_AUTO_PST: 4313 if (nsid != 0) 4314 return (EINVAL); 4315 4316 if (!nvme->n_auto_pst_supported) 4317 return (EINVAL); 4318 4319 break; 4320 4321 default: 4322 return (EINVAL); 4323 } 4324 4325 rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf, 4326 &bufsize); 
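	/*
	 * On success, res holds the value the controller reported for the
	 * feature (completion queue entry dword 0) and buf/bufsize describe
	 * any additional data buffer; they are handed back to the caller
	 * below via n_arg, n_buf, and n_len.
	 */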
4327 if (rv != 0) 4328 return (rv); 4329 4330 if (nioc->n_len < bufsize) { 4331 kmem_free(buf, bufsize); 4332 return (EINVAL); 4333 } 4334 4335 if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0) 4336 rv = EFAULT; 4337 4338 kmem_free(buf, bufsize); 4339 nioc->n_arg = res; 4340 nioc->n_len = bufsize; 4341 4342 return (rv); 4343 } 4344 4345 static int 4346 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4347 cred_t *cred_p) 4348 { 4349 _NOTE(ARGUNUSED(nsid, mode, cred_p)); 4350 4351 if ((mode & FREAD) == 0) 4352 return (EPERM); 4353 4354 nioc->n_arg = nvme->n_intr_cnt; 4355 return (0); 4356 } 4357 4358 static int 4359 nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4360 cred_t *cred_p) 4361 { 4362 _NOTE(ARGUNUSED(nsid, cred_p)); 4363 int rv = 0; 4364 4365 if ((mode & FREAD) == 0) 4366 return (EPERM); 4367 4368 if (nioc->n_len < sizeof (nvme->n_version)) 4369 return (ENOMEM); 4370 4371 if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf, 4372 sizeof (nvme->n_version), mode) != 0) 4373 rv = EFAULT; 4374 4375 return (rv); 4376 } 4377 4378 static int 4379 nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4380 cred_t *cred_p) 4381 { 4382 _NOTE(ARGUNUSED(mode)); 4383 nvme_format_nvm_t frmt = { 0 }; 4384 int c_nsid = nsid != 0 ? nsid - 1 : 0; 4385 4386 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4387 return (EPERM); 4388 4389 frmt.r = nioc->n_arg & 0xffffffff; 4390 4391 /* 4392 * Check whether the FORMAT NVM command is supported. 4393 */ 4394 if (nvme->n_idctl->id_oacs.oa_format == 0) 4395 return (EINVAL); 4396 4397 /* 4398 * Don't allow format or secure erase of individual namespace if that 4399 * would cause a format or secure erase of all namespaces. 4400 */ 4401 if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0) 4402 return (EINVAL); 4403 4404 if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE && 4405 nvme->n_idctl->id_fna.fn_sec_erase != 0) 4406 return (EINVAL); 4407 4408 /* 4409 * Don't allow formatting with Protection Information. 4410 */ 4411 if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0) 4412 return (EINVAL); 4413 4414 /* 4415 * Don't allow formatting using an illegal LBA format, or any LBA format 4416 * that uses metadata. 4417 */ 4418 if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf || 4419 nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0) 4420 return (EINVAL); 4421 4422 /* 4423 * Don't allow formatting using an illegal Secure Erase setting. 
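 *
 * Concretely, the check below rejects any value above NVME_FRMT_MAX_SES
 * and allows cryptographic erase only if the controller advertises
 * support for it in the FNA field of the identify data.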
4424 */ 4425 if (frmt.b.fm_ses > NVME_FRMT_MAX_SES || 4426 (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO && 4427 nvme->n_idctl->id_fna.fn_crypt_erase == 0)) 4428 return (EINVAL); 4429 4430 if (nsid == 0) 4431 nsid = (uint32_t)-1; 4432 4433 return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0, 4434 B_FALSE, frmt.b.fm_ses)); 4435 } 4436 4437 static int 4438 nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4439 cred_t *cred_p) 4440 { 4441 _NOTE(ARGUNUSED(nioc, mode)); 4442 int rv = 0; 4443 4444 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4445 return (EPERM); 4446 4447 if (nsid == 0) 4448 return (EINVAL); 4449 4450 rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl); 4451 if (rv != DDI_SUCCESS) 4452 rv = EBUSY; 4453 4454 return (rv); 4455 } 4456 4457 static int 4458 nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4459 cred_t *cred_p) 4460 { 4461 _NOTE(ARGUNUSED(nioc, mode)); 4462 nvme_identify_nsid_t *idns; 4463 int rv = 0; 4464 4465 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4466 return (EPERM); 4467 4468 if (nsid == 0) 4469 return (EINVAL); 4470 4471 /* 4472 * Identify namespace again, free old identify data. 4473 */ 4474 idns = nvme->n_ns[nsid - 1].ns_idns; 4475 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) 4476 return (EIO); 4477 4478 kmem_free(idns, sizeof (nvme_identify_nsid_t)); 4479 4480 rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl); 4481 if (rv != DDI_SUCCESS) 4482 rv = EBUSY; 4483 4484 return (rv); 4485 } 4486 4487 static void 4488 nvme_ufm_update(nvme_t *nvme) 4489 { 4490 mutex_enter(&nvme->n_fwslot_mutex); 4491 ddi_ufm_update(nvme->n_ufmh); 4492 if (nvme->n_fwslot != NULL) { 4493 kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t)); 4494 nvme->n_fwslot = NULL; 4495 } 4496 mutex_exit(&nvme->n_fwslot_mutex); 4497 } 4498 4499 static int 4500 nvme_ioctl_firmware_download(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4501 int mode, cred_t *cred_p) 4502 { 4503 int rv = 0; 4504 size_t len, copylen; 4505 offset_t offset; 4506 uintptr_t buf; 4507 nvme_sqe_t sqe = { 4508 .sqe_opc = NVME_OPC_FW_IMAGE_LOAD 4509 }; 4510 4511 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4512 return (EPERM); 4513 4514 if (nsid != 0) 4515 return (EINVAL); 4516 4517 /* 4518 * The offset (in n_len) is restricted to the number of DWORDs in 4519 * 32 bits. 4520 */ 4521 if (nioc->n_len > NVME_FW_OFFSETB_MAX) 4522 return (EINVAL); 4523 4524 /* Confirm that both offset and length are a multiple of DWORD bytes */ 4525 if ((nioc->n_len & NVME_DWORD_MASK) != 0 || 4526 (nioc->n_arg & NVME_DWORD_MASK) != 0) 4527 return (EINVAL); 4528 4529 len = nioc->n_len; 4530 offset = nioc->n_arg; 4531 buf = (uintptr_t)nioc->n_buf; 4532 while (len > 0 && rv == 0) { 4533 /* 4534 * nvme_ioc_cmd() does not use SGLs or PRP lists. 4535 * It is limited to 2 PRPs per NVM command, so limit 4536 * the size of the data to 2 pages. 4537 */ 4538 copylen = MIN(2 * nvme->n_pagesize, len); 4539 4540 sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1; 4541 sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT); 4542 4543 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)buf, copylen, 4544 FWRITE, NULL, nvme_admin_cmd_timeout); 4545 4546 buf += copylen; 4547 offset += copylen; 4548 len -= copylen; 4549 } 4550 4551 /* 4552 * Let the DDI UFM subsystem know that the firmware information for 4553 * this device has changed. 
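 *
 * nvme_ufm_update() invalidates the cached firmware slot log and calls
 * ddi_ufm_update(), so the next nvme_ufm_fill_slot() callback will
 * fetch fresh slot information from the device.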
4554 */ 4555 nvme_ufm_update(nvme); 4556 4557 return (rv); 4558 } 4559 4560 static int 4561 nvme_ioctl_firmware_commit(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4562 int mode, cred_t *cred_p) 4563 { 4564 nvme_firmware_commit_dw10_t fc_dw10 = { 0 }; 4565 uint32_t slot = nioc->n_arg & 0xffffffff; 4566 uint32_t action = nioc->n_arg >> 32; 4567 nvme_cqe_t cqe = { 0 }; 4568 nvme_sqe_t sqe = { 4569 .sqe_opc = NVME_OPC_FW_ACTIVATE 4570 }; 4571 int timeout; 4572 int rv; 4573 4574 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4575 return (EPERM); 4576 4577 if (nsid != 0) 4578 return (EINVAL); 4579 4580 /* Validate slot is in range. */ 4581 if (slot < NVME_FW_SLOT_MIN || slot > NVME_FW_SLOT_MAX) 4582 return (EINVAL); 4583 4584 switch (action) { 4585 case NVME_FWC_SAVE: 4586 case NVME_FWC_SAVE_ACTIVATE: 4587 timeout = nvme_commit_save_cmd_timeout; 4588 break; 4589 case NVME_FWC_ACTIVATE: 4590 case NVME_FWC_ACTIVATE_IMMED: 4591 timeout = nvme_admin_cmd_timeout; 4592 break; 4593 default: 4594 return (EINVAL); 4595 } 4596 4597 fc_dw10.b.fc_slot = slot; 4598 fc_dw10.b.fc_action = action; 4599 sqe.sqe_cdw10 = fc_dw10.r; 4600 4601 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe, timeout); 4602 4603 nioc->n_arg = ((uint64_t)cqe.cqe_sf.sf_sct << 16) | cqe.cqe_sf.sf_sc; 4604 4605 /* 4606 * Let the DDI UFM subsystem know that the firmware information for 4607 * this device has changed. 4608 */ 4609 nvme_ufm_update(nvme); 4610 4611 return (rv); 4612 } 4613 4614 static int 4615 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, 4616 int *rval_p) 4617 { 4618 #ifndef __lock_lint 4619 _NOTE(ARGUNUSED(rval_p)); 4620 #endif 4621 minor_t minor = getminor(dev); 4622 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 4623 int nsid = NVME_MINOR_NSID(minor); 4624 int rv = 0; 4625 nvme_ioctl_t nioc; 4626 4627 int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = { 4628 NULL, 4629 nvme_ioctl_identify, 4630 nvme_ioctl_identify, 4631 nvme_ioctl_capabilities, 4632 nvme_ioctl_get_logpage, 4633 nvme_ioctl_get_features, 4634 nvme_ioctl_intr_cnt, 4635 nvme_ioctl_version, 4636 nvme_ioctl_format, 4637 nvme_ioctl_detach, 4638 nvme_ioctl_attach, 4639 nvme_ioctl_firmware_download, 4640 nvme_ioctl_firmware_commit 4641 }; 4642 4643 if (nvme == NULL) 4644 return (ENXIO); 4645 4646 if (nsid > nvme->n_namespace_count) 4647 return (ENXIO); 4648 4649 if (IS_DEVCTL(cmd)) 4650 return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); 4651 4652 #ifdef _MULTI_DATAMODEL 4653 switch (ddi_model_convert_from(mode & FMODELS)) { 4654 case DDI_MODEL_ILP32: { 4655 nvme_ioctl32_t nioc32; 4656 if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t), 4657 mode) != 0) 4658 return (EFAULT); 4659 nioc.n_len = nioc32.n_len; 4660 nioc.n_buf = nioc32.n_buf; 4661 nioc.n_arg = nioc32.n_arg; 4662 break; 4663 } 4664 case DDI_MODEL_NONE: 4665 #endif 4666 if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode) 4667 != 0) 4668 return (EFAULT); 4669 #ifdef _MULTI_DATAMODEL 4670 break; 4671 } 4672 #endif 4673 4674 if (nvme->n_dead && cmd != NVME_IOC_DETACH) 4675 return (EIO); 4676 4677 4678 if (cmd == NVME_IOC_IDENTIFY_CTRL) { 4679 /* 4680 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and 4681 * attachment point nodes. 4682 */ 4683 nsid = 0; 4684 } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) { 4685 /* 4686 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it 4687 * will always return identify data for namespace 1. 
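 *
 * The controller (devctl) minor encodes namespace ID 0, while NVMe
 * namespace IDs themselves are 1-based, hence the remapping to
 * namespace 1 here.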
4688 */ 4689 nsid = 1; 4690 } 4691 4692 if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL) 4693 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode, 4694 cred_p); 4695 else 4696 rv = EINVAL; 4697 4698 #ifdef _MULTI_DATAMODEL 4699 switch (ddi_model_convert_from(mode & FMODELS)) { 4700 case DDI_MODEL_ILP32: { 4701 nvme_ioctl32_t nioc32; 4702 4703 nioc32.n_len = (size32_t)nioc.n_len; 4704 nioc32.n_buf = (uintptr32_t)nioc.n_buf; 4705 nioc32.n_arg = nioc.n_arg; 4706 4707 if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t), 4708 mode) != 0) 4709 return (EFAULT); 4710 break; 4711 } 4712 case DDI_MODEL_NONE: 4713 #endif 4714 if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode) 4715 != 0) 4716 return (EFAULT); 4717 #ifdef _MULTI_DATAMODEL 4718 break; 4719 } 4720 #endif 4721 4722 return (rv); 4723 } 4724 4725 /* 4726 * DDI UFM Callbacks 4727 */ 4728 static int 4729 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 4730 ddi_ufm_image_t *img) 4731 { 4732 nvme_t *nvme = arg; 4733 4734 if (imgno != 0) 4735 return (EINVAL); 4736 4737 ddi_ufm_image_set_desc(img, "Firmware"); 4738 ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot); 4739 4740 return (0); 4741 } 4742 4743 /* 4744 * Fill out firmware slot information for the requested slot. The firmware 4745 * slot information is gathered by requesting the Firmware Slot Information log 4746 * page. The format of the page is described in section 5.10.1.3. 4747 * 4748 * We lazily cache the log page on the first call and then invalidate the cache 4749 * data after a successful firmware download or firmware commit command. 4750 * The cached data is protected by a mutex as the state can change 4751 * asynchronous to this callback. 4752 */ 4753 static int 4754 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 4755 uint_t slotno, ddi_ufm_slot_t *slot) 4756 { 4757 nvme_t *nvme = arg; 4758 void *log = NULL; 4759 size_t bufsize; 4760 ddi_ufm_attr_t attr = 0; 4761 char fw_ver[NVME_FWVER_SZ + 1]; 4762 int ret; 4763 4764 if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1)) 4765 return (EINVAL); 4766 4767 mutex_enter(&nvme->n_fwslot_mutex); 4768 if (nvme->n_fwslot == NULL) { 4769 ret = nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, 4770 NVME_LOGPAGE_FWSLOT, 0); 4771 if (ret != DDI_SUCCESS || 4772 bufsize != sizeof (nvme_fwslot_log_t)) { 4773 if (log != NULL) 4774 kmem_free(log, bufsize); 4775 mutex_exit(&nvme->n_fwslot_mutex); 4776 return (EIO); 4777 } 4778 nvme->n_fwslot = (nvme_fwslot_log_t *)log; 4779 } 4780 4781 /* 4782 * NVMe numbers firmware slots starting at 1 4783 */ 4784 if (slotno == (nvme->n_fwslot->fw_afi - 1)) 4785 attr |= DDI_UFM_ATTR_ACTIVE; 4786 4787 if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0) 4788 attr |= DDI_UFM_ATTR_WRITEABLE; 4789 4790 if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') { 4791 attr |= DDI_UFM_ATTR_EMPTY; 4792 } else { 4793 (void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno], 4794 NVME_FWVER_SZ); 4795 fw_ver[NVME_FWVER_SZ] = '\0'; 4796 ddi_ufm_slot_set_version(slot, fw_ver); 4797 } 4798 mutex_exit(&nvme->n_fwslot_mutex); 4799 4800 ddi_ufm_slot_set_attrs(slot, attr); 4801 4802 return (0); 4803 } 4804 4805 static int 4806 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps) 4807 { 4808 *caps = DDI_UFM_CAP_REPORT; 4809 return (0); 4810 } 4811
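/*
 * Taken together, the UFM callbacks above report a single firmware image
 * whose slots mirror the controller's Firmware Slot Information log: the
 * slot matching the active firmware indicator (fw_afi) is marked
 * DDI_UFM_ATTR_ACTIVE, slot 1 is writable only if fw_readonly is clear,
 * and slots without a firmware revision string are marked
 * DDI_UFM_ATTR_EMPTY.
 */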