/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 Racktop Systems.
 * Copyright 2022 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver targets and is designed to support all NVMe 1.x devices.
 * Features are added to the driver as we encounter devices that require them
 * and as our needs dictate, so some commands or log pages may not take
 * advantage of newer features that devices support at this time. When you
 * encounter such a case, it is generally fine to add that support to the
 * driver as long as you take care to ensure that the requisite device version
 * is met before using it.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 *
 * Interrupt Usage:
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an
 * interrupt vector and will post them to a taskq for completion processing.
 *
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding
 * up to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that
 * can hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a submission queue and the shared state is protected by
 * nq_mutex; the completion queue is protected by ncq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array.
 * The array index is used as command identifier (CID) in the submission queue
 * entry.
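 *
 * As a simplified sketch of that scheme (illustration only; the actual code
 * is in nvme_submit_cmd_common() further down in this file):
 *
 *	cid = qp->nq_next_cmd;		next free slot in the command array
 *	qp->nq_cmd[cid] = cmd;		remember the command
 *	cmd->nc_sqe.sqe_cid = cid;	the array index doubles as the CID
 *	qp->nq_active_cmds++;
 *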
 * Some commands may take a very long time to complete, and if the queue wraps
 * around in that time a submission may find the next array slot to still be
 * used by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the
 * same as the configured queue length. Queue overrun is prevented by the
 * semaphore, so a command submission may block if the queue is full.
 *
 *
 * Polled I/O Support:
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts
 * are turned off while dumping, the driver will just submit a command in the
 * regular way and then repeatedly attempt a command retrieval until it gets
 * the command back.
 *
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev
 * interface for each namespace found. Namespaces can have various attributes
 * to support protection information. This driver does not support any of this
 * and ignores namespaces that have these attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64, if present, to generate the devid and
 * passes it to blkdev to use it in the device node names. As this is
 * currently untested, namespaces with EUI64 are ignored by default.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces using a
 * 32bit minor node number.
 *
 *
 * Minor nodes:
 *
 * For each NVMe device the driver exposes one minor node for the controller
 * and one minor node for each namespace. The only operations supported by
 * those minor nodes are open(9E), close(9E), and ioctl(9E). This serves as
 * the interface for the nvmeadm(8) utility.
 *
 * Exclusive opens are required for certain ioctl(9E) operations that alter
 * controller and/or namespace state. While different namespaces may be opened
 * exclusively in parallel, an exclusive open of the controller minor node
 * requires that no namespaces are currently open (exclusive or otherwise).
 * Opening any namespace minor node (exclusive or otherwise) will fail while
 * the controller minor node is opened exclusively by any other thread. Thus
 * it is possible for one thread at a time to open the controller minor node
 * exclusively, and keep it open while opening any namespace minor node of the
 * same controller, exclusively or otherwise.
 *
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with
 * presenting a disk device to the system. As a result, the processing of I/O
 * requests is relatively simple as blkdev takes care of partitioning,
 * boundary checks, DMA setup, and splitting of transfers into manageable
 * chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted
 * to an I/O queue. The queue is selected by taking the CPU id modulo the
 * number of queues. There is currently no timeout handling of I/O commands.
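 *
 * Roughly, the request path described above maps onto these functions (all
 * defined further down in this file); this is a sketch of the usual case, not
 * an exhaustive call graph:
 *
 *	blkdev read/write  -> nvme_bd_read()/nvme_bd_write()
 *	                   -> nvme_bd_cmd()          select queue, build command
 *	                   -> nvme_submit_io_cmd()   post to the submission queue
 *	completion intr    -> nvme_process_iocq()    reap the completion queue
 *	                   -> taskq: nvme_bd_xfer_done()
 *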
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the
 * namespace format back to blkdev as physical block size to support partition
 * and block alignment. The devid is either based on the namespace EUI64, if
 * present, or composed using the device vendor ID, model number, serial
 * number, and the namespace ID.
 *
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests.
 * Before this limit is known the driver assumes it is at least 1 and posts a
 * single asynchronous request. Later, when the limit is known, more
 * asynchronous event requests are posted to allow quicker reception of error
 * information. When an asynchronous event is posted by the hardware the
 * driver will parse the error status fields and log information or fault the
 * device, depending on the severity of the asynchronous event. The
 * asynchronous event request is then reused and posted to the admin queue
 * again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware
 * appears to be healthy the driver attempts to abort the command. The
 * original command timeout is also applied to the abort command. If the abort
 * times out too, the driver assumes the device to be dead, fences it off, and
 * calls FMA to retire it. In all other cases the aborted command should
 * return immediately with a status indicating it was aborted, and the driver
 * will wait indefinitely for that to happen. No timeout handling of normal
 * I/O commands is presently done.
 *
 * Any command that times out due to the controller dropping dead will be put
 * on the nvme_lost_cmds list if it references DMA memory. This prevents the
 * DMA memory from being reused by the system and later being written to by a
 * "dead" NVMe controller.
 *
 *
 * Locking:
 *
 * Each queue pair has an nq_mutex and an ncq_mutex. The nq_mutex must be held
 * when accessing shared state and submission queue registers; the ncq_mutex
 * is held when accessing completion queue state and registers. Callers of
 * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
 * mutexes themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired
 * first. More than one nc_mutex may only be held when aborting commands.
 * In this case, the nc_mutex of the command to be aborted must be held across
 * the call to nvme_abort_cmd() to prevent the command from completing while
 * the abort is in progress.
 *
 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be acquired
 * first. More than one nq_mutex is never held by a single thread. The
 * ncq_mutex is only held by nvme_retrieve_cmd() and nvme_process_iocq().
 * nvme_process_iocq() is only called from the interrupt thread and
 * nvme_retrieve_cmd() during polled I/O, so the mutex is non-contentious but
 * is required for implementation completeness and safety.
 *
 * There is one mutex n_minor_mutex which protects all open flags nm_open and
 * exclusive-open thread pointers nm_oexcl of each minor node associated with
 * a controller and its namespaces.
 *
 * In addition, there is one mutex n_mgmt_mutex which must be held whenever
 * the driver state for any namespace is changed, especially across calls to
 * nvme_init_ns(), nvme_attach_ns() and nvme_detach_ns(). Except when
 * detaching nvme, it should also be held across calls that modify the blkdev
 * handle of a namespace. Command and queue mutexes may be acquired and
 * released while n_mgmt_mutex is held; n_minor_mutex should not be.
 *
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry
 * point is still provided which is used to send a shutdown notification to
 * the device.
 *
 *
 * NVMe Hotplug:
 *
 * The driver supports hot removal. The driver uses the NDI event framework
 * to register a callback, nvme_remove_callback, to clean up when a disk is
 * removed. In particular, the driver will unqueue outstanding I/O commands
 * and set n_dead on the softstate to true so that other operations, such as
 * ioctls and command submissions, fail as well.
 *
 * While the callback registration relies on the NDI event framework, the
 * removal event itself is kicked off in the PCIe hotplug framework, when the
 * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
 * device was removed from the slot.
 *
 * The NVMe driver instance itself will remain until the final close of the
 * device.
 *
 *
 * DDI UFM Support:
 *
 * The driver supports the DDI UFM framework for reporting information about
 * the device's firmware image and slot configuration. This data can be
 * queried by userland software via ioctls to the ufm driver. For more
 * information, see ddi_ufm(9E).
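 *
 * The UFM entry points are provided via the nvme_ufm_ops vector defined
 * further down in this file. Registration is expected to happen at attach
 * time, roughly along the lines of the following sketch (the handle variable
 * ufmh is illustrative only; see ddi_ufm_init(9F) for the real contract):
 *
 *	(void) ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops,
 *	    &ufmh, nvme);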
 *
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of
 * the driver's operation:
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   major versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to
 *   be posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile
 *   write cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 * - max-submission-queues: the maximum number of I/O submission queues
 * - max-completion-queues: the maximum number of I/O completion queues,
 *   can be less than max-submission-queues, in which case the completion
 *   queues are shared
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/ddi_ufm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/dkio.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Assertions to make sure that we've properly captured various aspects of the
 * packed structures and haven't broken them during updates.
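 * For example, if a change to the nvme_identify_ctrl_t definition in nvme.h
 * accidentally altered its padding, the first CTASSERT below (checking that
 * the structure is exactly 0x1000 bytes) would fail the build instead of
 * silently breaking the layout of the identify data.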
 */
CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);

CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);

CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);


/* NVMe spec version supported */
static const int nvme_version_major = 1;

/* tunable for admin command timeout in seconds, default is 1s */
int nvme_admin_cmd_timeout = 1;

/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
int nvme_format_cmd_timeout = 600;

/* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
int nvme_commit_save_cmd_timeout = 15;

/*
 * tunable for the size of arbitrary vendor specific admin commands,
 * default is 16MiB.
 */
uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;

/*
 * tunable for the max timeout of arbitrary vendor specific admin commands,
 * default is 60s.
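 *
 * Like the other tunables above, this can be overridden at boot time via
 * /etc/system, e.g.:
 *
 *	set nvme:nvme_vendor_specific_admin_cmd_max_timeout = 120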
372 */ 373 uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60; 374 375 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t); 376 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t); 377 static int nvme_quiesce(dev_info_t *); 378 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *); 379 static int nvme_setup_interrupts(nvme_t *, int, int); 380 static void nvme_release_interrupts(nvme_t *); 381 static uint_t nvme_intr(caddr_t, caddr_t); 382 383 static void nvme_shutdown(nvme_t *, int, boolean_t); 384 static boolean_t nvme_reset(nvme_t *, boolean_t); 385 static int nvme_init(nvme_t *); 386 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int); 387 static void nvme_free_cmd(nvme_cmd_t *); 388 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t, 389 bd_xfer_t *); 390 static void nvme_admin_cmd(nvme_cmd_t *, int); 391 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *); 392 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *); 393 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *); 394 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int); 395 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *); 396 static void nvme_wait_cmd(nvme_cmd_t *, uint_t); 397 static void nvme_wakeup_cmd(void *); 398 static void nvme_async_event_task(void *); 399 400 static int nvme_check_unknown_cmd_status(nvme_cmd_t *); 401 static int nvme_check_vendor_cmd_status(nvme_cmd_t *); 402 static int nvme_check_integrity_cmd_status(nvme_cmd_t *); 403 static int nvme_check_specific_cmd_status(nvme_cmd_t *); 404 static int nvme_check_generic_cmd_status(nvme_cmd_t *); 405 static inline int nvme_check_cmd_status(nvme_cmd_t *); 406 407 static int nvme_abort_cmd(nvme_cmd_t *, uint_t); 408 static void nvme_async_event(nvme_t *); 409 static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t, 410 uint8_t, boolean_t, uint8_t); 411 static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t, 412 ...); 413 static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **); 414 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t, 415 uint32_t *); 416 static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *, 417 void **, size_t *); 418 static int nvme_write_cache_set(nvme_t *, boolean_t); 419 static int nvme_set_nqueues(nvme_t *); 420 421 static void nvme_free_dma(nvme_dma_t *); 422 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *, 423 nvme_dma_t **); 424 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t, 425 nvme_dma_t **); 426 static void nvme_free_qpair(nvme_qpair_t *); 427 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t); 428 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t); 429 430 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t); 431 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t); 432 static inline uint64_t nvme_get64(nvme_t *, uintptr_t); 433 static inline uint32_t nvme_get32(nvme_t *, uintptr_t); 434 435 static boolean_t nvme_check_regs_hdl(nvme_t *); 436 static boolean_t nvme_check_dma_hdl(nvme_dma_t *); 437 438 static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t); 439 440 static void nvme_bd_xfer_done(void *); 441 static void nvme_bd_driveinfo(void *, bd_drive_t *); 442 static int nvme_bd_mediainfo(void *, bd_media_t *); 443 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t); 444 static int nvme_bd_read(void *, bd_xfer_t *); 445 
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
static int nvme_bd_free_space(void *, bd_xfer_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

/* DDI UFM callbacks */
static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
    ddi_ufm_image_t *);
static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
    ddi_ufm_slot_t *);
static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static int nvme_init_ns(nvme_t *, int);
static int nvme_attach_ns(nvme_t *, int);
static int nvme_detach_ns(nvme_t *, int);

#define	NVME_NSID2NS(nvme, nsid)	(&((nvme)->n_ns[(nsid) - 1]))

static ddi_ufm_ops_t nvme_ufm_ops = {
	NULL,
	nvme_ufm_fill_image,
	nvme_ufm_fill_slot,
	nvme_ufm_getcaps
};

#define	NVME_MINOR_INST_SHIFT	9
#define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
#define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
#define	NVME_IS_VENDOR_SPECIFIC_CMD(x)	(((x) >= 0xC0) && ((x) <= 0xFF))
#define	NVME_VENDOR_SPECIFIC_LOGPAGE_MIN	0xC0
#define	NVME_VENDOR_SPECIFIC_LOGPAGE_MAX	0xFF
#define	NVME_IS_VENDOR_SPECIFIC_LOGPAGE(x)	\
	(((x) >= NVME_VENDOR_SPECIFIC_LOGPAGE_MIN) && \
	((x) <= NVME_VENDOR_SPECIFIC_LOGPAGE_MAX))

/*
 * NVMe versions 1.3 and later actually support log pages up to UINT32_MAX
 * DWords in size. However, revision 1.3 also modified the layout of the Get
 * Log Page command significantly relative to version 1.2, including changing
 * reserved bits, adding new bitfields, and requiring the use of command DWord
 * 11 to fully specify the size of the log page (the lower and upper 16 bits
 * of the number of DWords in the page are split between DWord 10 and DWord
 * 11, respectively).
 *
 * All of these impose significantly different layout requirements on the
 * `nvme_getlogpage_t` type. This could be solved with two different types, or
 * a complicated/nested union with the two versions as the overlying members.
 * Both of these are reasonable, if a bit convoluted. However, there is no
 * current need for such large pages, nor a way to test them, as most log
 * pages actually fit within the current size limit. So for simplicity, we
 * retain the size cap from version 1.2.
 *
 * Note that the number of DWords is zero-based, so we add 1. It is subtracted
 * to form a zero-based value in `nvme_get_logpage`.
 */
#define	NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE	\
	(((1 << 12) + 1) * sizeof (uint32_t))

static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;

/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
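 * For example, a full-sized submission queue with 65536 entries of 64 bytes
 * each needs 65536 * 64 bytes = 4 MiB of physically contiguous DMA memory
 * (dma_attr_sgllen is 1), which is where the (UINT16_MAX + 1) *
 * sizeof (nvme_sqe_t) terms in dma_attr_count_max and dma_attr_maxxfer below
 * come from.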
522 */ 523 static ddi_dma_attr_t nvme_queue_dma_attr = { 524 .dma_attr_version = DMA_ATTR_V0, 525 .dma_attr_addr_lo = 0, 526 .dma_attr_addr_hi = 0xffffffffffffffffULL, 527 .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1, 528 .dma_attr_align = 0x1000, 529 .dma_attr_burstsizes = 0x7ff, 530 .dma_attr_minxfer = 0x1000, 531 .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t), 532 .dma_attr_seg = 0xffffffffffffffffULL, 533 .dma_attr_sgllen = 1, 534 .dma_attr_granular = 1, 535 .dma_attr_flags = 0, 536 }; 537 538 /* 539 * DMA attributes for transfers using Physical Region Page (PRP) entries 540 * 541 * A PRP entry describes one page of DMA memory using the page size specified 542 * in the controller configuration's memory page size register (CC.MPS). It uses 543 * a 64bit base address aligned to this page size. There is no limitation on 544 * chaining PRPs together for arbitrarily large DMA transfers. 545 */ 546 static ddi_dma_attr_t nvme_prp_dma_attr = { 547 .dma_attr_version = DMA_ATTR_V0, 548 .dma_attr_addr_lo = 0, 549 .dma_attr_addr_hi = 0xffffffffffffffffULL, 550 .dma_attr_count_max = 0xfff, 551 .dma_attr_align = 0x1000, 552 .dma_attr_burstsizes = 0x7ff, 553 .dma_attr_minxfer = 0x1000, 554 .dma_attr_maxxfer = 0x1000, 555 .dma_attr_seg = 0xfff, 556 .dma_attr_sgllen = -1, 557 .dma_attr_granular = 1, 558 .dma_attr_flags = 0, 559 }; 560 561 /* 562 * DMA attributes for transfers using scatter/gather lists 563 * 564 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a 565 * 32bit length field. SGL Segment and SGL Last Segment entries require the 566 * length to be a multiple of 16 bytes. 567 */ 568 static ddi_dma_attr_t nvme_sgl_dma_attr = { 569 .dma_attr_version = DMA_ATTR_V0, 570 .dma_attr_addr_lo = 0, 571 .dma_attr_addr_hi = 0xffffffffffffffffULL, 572 .dma_attr_count_max = 0xffffffffUL, 573 .dma_attr_align = 1, 574 .dma_attr_burstsizes = 0x7ff, 575 .dma_attr_minxfer = 0x10, 576 .dma_attr_maxxfer = 0xfffffffffULL, 577 .dma_attr_seg = 0xffffffffffffffffULL, 578 .dma_attr_sgllen = -1, 579 .dma_attr_granular = 0x10, 580 .dma_attr_flags = 0 581 }; 582 583 static ddi_device_acc_attr_t nvme_reg_acc_attr = { 584 .devacc_attr_version = DDI_DEVICE_ATTR_V0, 585 .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, 586 .devacc_attr_dataorder = DDI_STRICTORDER_ACC 587 }; 588 589 static struct cb_ops nvme_cb_ops = { 590 .cb_open = nvme_open, 591 .cb_close = nvme_close, 592 .cb_strategy = nodev, 593 .cb_print = nodev, 594 .cb_dump = nodev, 595 .cb_read = nodev, 596 .cb_write = nodev, 597 .cb_ioctl = nvme_ioctl, 598 .cb_devmap = nodev, 599 .cb_mmap = nodev, 600 .cb_segmap = nodev, 601 .cb_chpoll = nochpoll, 602 .cb_prop_op = ddi_prop_op, 603 .cb_str = 0, 604 .cb_flag = D_NEW | D_MP, 605 .cb_rev = CB_REV, 606 .cb_aread = nodev, 607 .cb_awrite = nodev 608 }; 609 610 static struct dev_ops nvme_dev_ops = { 611 .devo_rev = DEVO_REV, 612 .devo_refcnt = 0, 613 .devo_getinfo = ddi_no_info, 614 .devo_identify = nulldev, 615 .devo_probe = nulldev, 616 .devo_attach = nvme_attach, 617 .devo_detach = nvme_detach, 618 .devo_reset = nodev, 619 .devo_cb_ops = &nvme_cb_ops, 620 .devo_bus_ops = NULL, 621 .devo_power = NULL, 622 .devo_quiesce = nvme_quiesce, 623 }; 624 625 static struct modldrv nvme_modldrv = { 626 .drv_modops = &mod_driverops, 627 .drv_linkinfo = "NVMe v1.1b", 628 .drv_dev_ops = &nvme_dev_ops 629 }; 630 631 static struct modlinkage nvme_modlinkage = { 632 .ml_rev = MODREV_1, 633 .ml_linkage = { &nvme_modldrv, NULL } 634 }; 635 636 static bd_ops_t nvme_bd_ops = { 637 
.o_version = BD_OPS_CURRENT_VERSION, 638 .o_drive_info = nvme_bd_driveinfo, 639 .o_media_info = nvme_bd_mediainfo, 640 .o_devid_init = nvme_bd_devid, 641 .o_sync_cache = nvme_bd_sync, 642 .o_read = nvme_bd_read, 643 .o_write = nvme_bd_write, 644 .o_free_space = nvme_bd_free_space, 645 }; 646 647 /* 648 * This list will hold commands that have timed out and couldn't be aborted. 649 * As we don't know what the hardware may still do with the DMA memory we can't 650 * free them, so we'll keep them forever on this list where we can easily look 651 * at them with mdb. 652 */ 653 static struct list nvme_lost_cmds; 654 static kmutex_t nvme_lc_mutex; 655 656 int 657 _init(void) 658 { 659 int error; 660 661 error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1); 662 if (error != DDI_SUCCESS) 663 return (error); 664 665 nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache", 666 sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 667 668 mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL); 669 list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t), 670 offsetof(nvme_cmd_t, nc_list)); 671 672 bd_mod_init(&nvme_dev_ops); 673 674 error = mod_install(&nvme_modlinkage); 675 if (error != DDI_SUCCESS) { 676 ddi_soft_state_fini(&nvme_state); 677 mutex_destroy(&nvme_lc_mutex); 678 list_destroy(&nvme_lost_cmds); 679 bd_mod_fini(&nvme_dev_ops); 680 } 681 682 return (error); 683 } 684 685 int 686 _fini(void) 687 { 688 int error; 689 690 if (!list_is_empty(&nvme_lost_cmds)) 691 return (DDI_FAILURE); 692 693 error = mod_remove(&nvme_modlinkage); 694 if (error == DDI_SUCCESS) { 695 ddi_soft_state_fini(&nvme_state); 696 kmem_cache_destroy(nvme_cmd_cache); 697 mutex_destroy(&nvme_lc_mutex); 698 list_destroy(&nvme_lost_cmds); 699 bd_mod_fini(&nvme_dev_ops); 700 } 701 702 return (error); 703 } 704 705 int 706 _info(struct modinfo *modinfop) 707 { 708 return (mod_info(&nvme_modlinkage, modinfop)); 709 } 710 711 static inline void 712 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val) 713 { 714 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 715 716 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 717 ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val); 718 } 719 720 static inline void 721 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val) 722 { 723 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 724 725 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 726 ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val); 727 } 728 729 static inline uint64_t 730 nvme_get64(nvme_t *nvme, uintptr_t reg) 731 { 732 uint64_t val; 733 734 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 735 736 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 737 val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg)); 738 739 return (val); 740 } 741 742 static inline uint32_t 743 nvme_get32(nvme_t *nvme, uintptr_t reg) 744 { 745 uint32_t val; 746 747 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 748 749 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 750 val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg)); 751 752 return (val); 753 } 754 755 static boolean_t 756 nvme_check_regs_hdl(nvme_t *nvme) 757 { 758 ddi_fm_error_t error; 759 760 ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION); 761 762 if (error.fme_status != DDI_FM_OK) 763 return (B_TRUE); 764 765 return (B_FALSE); 766 } 767 768 static boolean_t 769 nvme_check_dma_hdl(nvme_dma_t *dma) 770 { 771 ddi_fm_error_t error; 772 773 if (dma == NULL) 774 return (B_FALSE); 775 776 ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION); 777 778 if (error.fme_status != DDI_FM_OK) 779 
return (B_TRUE); 780 781 return (B_FALSE); 782 } 783 784 static void 785 nvme_free_dma_common(nvme_dma_t *dma) 786 { 787 if (dma->nd_dmah != NULL) 788 (void) ddi_dma_unbind_handle(dma->nd_dmah); 789 if (dma->nd_acch != NULL) 790 ddi_dma_mem_free(&dma->nd_acch); 791 if (dma->nd_dmah != NULL) 792 ddi_dma_free_handle(&dma->nd_dmah); 793 } 794 795 static void 796 nvme_free_dma(nvme_dma_t *dma) 797 { 798 nvme_free_dma_common(dma); 799 kmem_free(dma, sizeof (*dma)); 800 } 801 802 /* ARGSUSED */ 803 static void 804 nvme_prp_dma_destructor(void *buf, void *private) 805 { 806 nvme_dma_t *dma = (nvme_dma_t *)buf; 807 808 nvme_free_dma_common(dma); 809 } 810 811 static int 812 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma, 813 size_t len, uint_t flags, ddi_dma_attr_t *dma_attr) 814 { 815 if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL, 816 &dma->nd_dmah) != DDI_SUCCESS) { 817 /* 818 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and 819 * the only other possible error is DDI_DMA_BADATTR which 820 * indicates a driver bug which should cause a panic. 821 */ 822 dev_err(nvme->n_dip, CE_PANIC, 823 "!failed to get DMA handle, check DMA attributes"); 824 return (DDI_FAILURE); 825 } 826 827 /* 828 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified 829 * or the flags are conflicting, which isn't the case here. 830 */ 831 (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr, 832 DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp, 833 &dma->nd_len, &dma->nd_acch); 834 835 if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp, 836 dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, 837 &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) { 838 dev_err(nvme->n_dip, CE_WARN, 839 "!failed to bind DMA memory"); 840 atomic_inc_32(&nvme->n_dma_bind_err); 841 nvme_free_dma_common(dma); 842 return (DDI_FAILURE); 843 } 844 845 return (DDI_SUCCESS); 846 } 847 848 static int 849 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags, 850 ddi_dma_attr_t *dma_attr, nvme_dma_t **ret) 851 { 852 nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP); 853 854 if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) != 855 DDI_SUCCESS) { 856 *ret = NULL; 857 kmem_free(dma, sizeof (nvme_dma_t)); 858 return (DDI_FAILURE); 859 } 860 861 bzero(dma->nd_memp, dma->nd_len); 862 863 *ret = dma; 864 return (DDI_SUCCESS); 865 } 866 867 /* ARGSUSED */ 868 static int 869 nvme_prp_dma_constructor(void *buf, void *private, int flags) 870 { 871 nvme_dma_t *dma = (nvme_dma_t *)buf; 872 nvme_t *nvme = (nvme_t *)private; 873 874 dma->nd_dmah = NULL; 875 dma->nd_acch = NULL; 876 877 if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize, 878 DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) { 879 return (-1); 880 } 881 882 ASSERT(dma->nd_ncookie == 1); 883 884 dma->nd_cached = B_TRUE; 885 886 return (0); 887 } 888 889 static int 890 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len, 891 uint_t flags, nvme_dma_t **dma) 892 { 893 uint32_t len = nentry * qe_len; 894 ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr; 895 896 len = roundup(len, nvme->n_pagesize); 897 898 if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma) 899 != DDI_SUCCESS) { 900 dev_err(nvme->n_dip, CE_WARN, 901 "!failed to get DMA memory for queue"); 902 goto fail; 903 } 904 905 if ((*dma)->nd_ncookie != 1) { 906 dev_err(nvme->n_dip, CE_WARN, 907 "!got too many cookies for queue DMA"); 908 goto fail; 909 } 910 911 return (DDI_SUCCESS); 912 913 fail: 914 if 
(*dma) { 915 nvme_free_dma(*dma); 916 *dma = NULL; 917 } 918 919 return (DDI_FAILURE); 920 } 921 922 static void 923 nvme_free_cq(nvme_cq_t *cq) 924 { 925 mutex_destroy(&cq->ncq_mutex); 926 927 if (cq->ncq_cmd_taskq != NULL) 928 taskq_destroy(cq->ncq_cmd_taskq); 929 930 if (cq->ncq_dma != NULL) 931 nvme_free_dma(cq->ncq_dma); 932 933 kmem_free(cq, sizeof (*cq)); 934 } 935 936 static void 937 nvme_free_qpair(nvme_qpair_t *qp) 938 { 939 int i; 940 941 mutex_destroy(&qp->nq_mutex); 942 sema_destroy(&qp->nq_sema); 943 944 if (qp->nq_sqdma != NULL) 945 nvme_free_dma(qp->nq_sqdma); 946 947 if (qp->nq_active_cmds > 0) 948 for (i = 0; i != qp->nq_nentry; i++) 949 if (qp->nq_cmd[i] != NULL) 950 nvme_free_cmd(qp->nq_cmd[i]); 951 952 if (qp->nq_cmd != NULL) 953 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry); 954 955 kmem_free(qp, sizeof (nvme_qpair_t)); 956 } 957 958 /* 959 * Destroy the pre-allocated cq array, but only free individual completion 960 * queues from the given starting index. 961 */ 962 static void 963 nvme_destroy_cq_array(nvme_t *nvme, uint_t start) 964 { 965 uint_t i; 966 967 for (i = start; i < nvme->n_cq_count; i++) 968 if (nvme->n_cq[i] != NULL) 969 nvme_free_cq(nvme->n_cq[i]); 970 971 kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count); 972 } 973 974 static int 975 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx, 976 uint_t nthr) 977 { 978 nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP); 979 char name[64]; /* large enough for the taskq name */ 980 981 mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER, 982 DDI_INTR_PRI(nvme->n_intr_pri)); 983 984 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t), 985 DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS) 986 goto fail; 987 988 cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp; 989 cq->ncq_nentry = nentry; 990 cq->ncq_id = idx; 991 cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx); 992 993 /* 994 * Each completion queue has its own command taskq. 995 */ 996 (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u", 997 ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx); 998 999 cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX, 1000 TASKQ_PREPOPULATE); 1001 1002 if (cq->ncq_cmd_taskq == NULL) { 1003 dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd " 1004 "taskq for cq %u", idx); 1005 goto fail; 1006 } 1007 1008 *cqp = cq; 1009 return (DDI_SUCCESS); 1010 1011 fail: 1012 nvme_free_cq(cq); 1013 *cqp = NULL; 1014 1015 return (DDI_FAILURE); 1016 } 1017 1018 /* 1019 * Create the n_cq array big enough to hold "ncq" completion queues. 1020 * If the array already exists it will be re-sized (but only larger). 1021 * The admin queue is included in this array, which boosts the 1022 * max number of entries to UINT16_MAX + 1. 
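 * For example, growing from 1 (just the admin completion queue) to 9
 * completion queues keeps cq[0] untouched and only allocates cq[1] through
 * cq[8]; if any allocation fails, the original smaller array is restored.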
1023 */ 1024 static int 1025 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr) 1026 { 1027 nvme_cq_t **cq; 1028 uint_t i, cq_count; 1029 1030 ASSERT3U(ncq, >, nvme->n_cq_count); 1031 1032 cq = nvme->n_cq; 1033 cq_count = nvme->n_cq_count; 1034 1035 nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP); 1036 nvme->n_cq_count = ncq; 1037 1038 for (i = 0; i < cq_count; i++) 1039 nvme->n_cq[i] = cq[i]; 1040 1041 for (; i < nvme->n_cq_count; i++) 1042 if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) != 1043 DDI_SUCCESS) 1044 goto fail; 1045 1046 if (cq != NULL) 1047 kmem_free(cq, sizeof (*cq) * cq_count); 1048 1049 return (DDI_SUCCESS); 1050 1051 fail: 1052 nvme_destroy_cq_array(nvme, cq_count); 1053 /* 1054 * Restore the original array 1055 */ 1056 nvme->n_cq_count = cq_count; 1057 nvme->n_cq = cq; 1058 1059 return (DDI_FAILURE); 1060 } 1061 1062 static int 1063 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp, 1064 uint_t idx) 1065 { 1066 nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP); 1067 uint_t cq_idx; 1068 1069 mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER, 1070 DDI_INTR_PRI(nvme->n_intr_pri)); 1071 1072 /* 1073 * The NVMe spec defines that a full queue has one empty (unused) slot; 1074 * initialize the semaphore accordingly. 1075 */ 1076 sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL); 1077 1078 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t), 1079 DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS) 1080 goto fail; 1081 1082 /* 1083 * idx == 0 is adminq, those above 0 are shared io completion queues. 1084 */ 1085 cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1); 1086 qp->nq_cq = nvme->n_cq[cq_idx]; 1087 qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp; 1088 qp->nq_nentry = nentry; 1089 1090 qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx); 1091 1092 qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP); 1093 qp->nq_next_cmd = 0; 1094 1095 *nqp = qp; 1096 return (DDI_SUCCESS); 1097 1098 fail: 1099 nvme_free_qpair(qp); 1100 *nqp = NULL; 1101 1102 return (DDI_FAILURE); 1103 } 1104 1105 static nvme_cmd_t * 1106 nvme_alloc_cmd(nvme_t *nvme, int kmflag) 1107 { 1108 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag); 1109 1110 if (cmd == NULL) 1111 return (cmd); 1112 1113 bzero(cmd, sizeof (nvme_cmd_t)); 1114 1115 cmd->nc_nvme = nvme; 1116 1117 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER, 1118 DDI_INTR_PRI(nvme->n_intr_pri)); 1119 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL); 1120 1121 return (cmd); 1122 } 1123 1124 static void 1125 nvme_free_cmd(nvme_cmd_t *cmd) 1126 { 1127 /* Don't free commands on the lost commands list. 
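 * They were put there by nvme_wait_cmd() because the presumed-dead controller
 * may still write to their DMA memory; see the comment above nvme_lost_cmds.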
*/ 1128 if (list_link_active(&cmd->nc_list)) 1129 return; 1130 1131 if (cmd->nc_dma) { 1132 nvme_free_dma(cmd->nc_dma); 1133 cmd->nc_dma = NULL; 1134 } 1135 1136 if (cmd->nc_prp) { 1137 kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp); 1138 cmd->nc_prp = NULL; 1139 } 1140 1141 cv_destroy(&cmd->nc_cv); 1142 mutex_destroy(&cmd->nc_mutex); 1143 1144 kmem_cache_free(nvme_cmd_cache, cmd); 1145 } 1146 1147 static void 1148 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1149 { 1150 sema_p(&qp->nq_sema); 1151 nvme_submit_cmd_common(qp, cmd); 1152 } 1153 1154 static int 1155 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1156 { 1157 if (cmd->nc_nvme->n_dead) { 1158 return (EIO); 1159 } 1160 1161 if (sema_tryp(&qp->nq_sema) == 0) 1162 return (EAGAIN); 1163 1164 nvme_submit_cmd_common(qp, cmd); 1165 return (0); 1166 } 1167 1168 static void 1169 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1170 { 1171 nvme_reg_sqtdbl_t tail = { 0 }; 1172 1173 mutex_enter(&qp->nq_mutex); 1174 cmd->nc_completed = B_FALSE; 1175 1176 /* 1177 * Now that we hold the queue pair lock, we must check whether or not 1178 * the controller has been listed as dead (e.g. was removed due to 1179 * hotplug). This is necessary as otherwise we could race with 1180 * nvme_remove_callback(). Because this has not been enqueued, we don't 1181 * call nvme_unqueue_cmd(), which is why we must manually decrement the 1182 * semaphore. 1183 */ 1184 if (cmd->nc_nvme->n_dead) { 1185 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback, 1186 cmd, TQ_NOSLEEP, &cmd->nc_tqent); 1187 sema_v(&qp->nq_sema); 1188 mutex_exit(&qp->nq_mutex); 1189 return; 1190 } 1191 1192 /* 1193 * Try to insert the cmd into the active cmd array at the nq_next_cmd 1194 * slot. If the slot is already occupied advance to the next slot and 1195 * try again. This can happen for long running commands like async event 1196 * requests. 1197 */ 1198 while (qp->nq_cmd[qp->nq_next_cmd] != NULL) 1199 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1200 qp->nq_cmd[qp->nq_next_cmd] = cmd; 1201 1202 qp->nq_active_cmds++; 1203 1204 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd; 1205 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t)); 1206 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah, 1207 sizeof (nvme_sqe_t) * qp->nq_sqtail, 1208 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV); 1209 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1210 1211 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry; 1212 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r); 1213 1214 mutex_exit(&qp->nq_mutex); 1215 } 1216 1217 static nvme_cmd_t * 1218 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid) 1219 { 1220 nvme_cmd_t *cmd; 1221 1222 ASSERT(mutex_owned(&qp->nq_mutex)); 1223 ASSERT3S(cid, <, qp->nq_nentry); 1224 1225 cmd = qp->nq_cmd[cid]; 1226 qp->nq_cmd[cid] = NULL; 1227 ASSERT3U(qp->nq_active_cmds, >, 0); 1228 qp->nq_active_cmds--; 1229 sema_v(&qp->nq_sema); 1230 1231 ASSERT3P(cmd, !=, NULL); 1232 ASSERT3P(cmd->nc_nvme, ==, nvme); 1233 ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid); 1234 1235 return (cmd); 1236 } 1237 1238 /* 1239 * Get the command tied to the next completed cqe and bump along completion 1240 * queue head counter. 1241 */ 1242 static nvme_cmd_t * 1243 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq) 1244 { 1245 nvme_qpair_t *qp; 1246 nvme_cqe_t *cqe; 1247 nvme_cmd_t *cmd; 1248 1249 ASSERT(mutex_owned(&cq->ncq_mutex)); 1250 1251 cqe = &cq->ncq_cq[cq->ncq_head]; 1252 1253 /* Check phase tag of CQE. 
Hardware inverts it for new entries. */ 1254 if (cqe->cqe_sf.sf_p == cq->ncq_phase) 1255 return (NULL); 1256 1257 qp = nvme->n_ioq[cqe->cqe_sqid]; 1258 1259 mutex_enter(&qp->nq_mutex); 1260 cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid); 1261 mutex_exit(&qp->nq_mutex); 1262 1263 ASSERT(cmd->nc_sqid == cqe->cqe_sqid); 1264 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t)); 1265 1266 qp->nq_sqhead = cqe->cqe_sqhd; 1267 1268 cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry; 1269 1270 /* Toggle phase on wrap-around. */ 1271 if (cq->ncq_head == 0) 1272 cq->ncq_phase = cq->ncq_phase ? 0 : 1; 1273 1274 return (cmd); 1275 } 1276 1277 /* 1278 * Process all completed commands on the io completion queue. 1279 */ 1280 static uint_t 1281 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq) 1282 { 1283 nvme_reg_cqhdbl_t head = { 0 }; 1284 nvme_cmd_t *cmd; 1285 uint_t completed = 0; 1286 1287 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1288 DDI_SUCCESS) 1289 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1290 __func__); 1291 1292 mutex_enter(&cq->ncq_mutex); 1293 1294 while ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1295 taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd, 1296 TQ_NOSLEEP, &cmd->nc_tqent); 1297 1298 completed++; 1299 } 1300 1301 if (completed > 0) { 1302 /* 1303 * Update the completion queue head doorbell. 1304 */ 1305 head.b.cqhdbl_cqh = cq->ncq_head; 1306 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1307 } 1308 1309 mutex_exit(&cq->ncq_mutex); 1310 1311 return (completed); 1312 } 1313 1314 static nvme_cmd_t * 1315 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp) 1316 { 1317 nvme_cq_t *cq = qp->nq_cq; 1318 nvme_reg_cqhdbl_t head = { 0 }; 1319 nvme_cmd_t *cmd; 1320 1321 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1322 DDI_SUCCESS) 1323 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1324 __func__); 1325 1326 mutex_enter(&cq->ncq_mutex); 1327 1328 if ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1329 head.b.cqhdbl_cqh = cq->ncq_head; 1330 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1331 } 1332 1333 mutex_exit(&cq->ncq_mutex); 1334 1335 return (cmd); 1336 } 1337 1338 static int 1339 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd) 1340 { 1341 nvme_cqe_t *cqe = &cmd->nc_cqe; 1342 1343 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1344 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1345 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1346 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1347 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1348 1349 if (cmd->nc_xfer != NULL) 1350 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1351 1352 if (cmd->nc_nvme->n_strict_version) { 1353 cmd->nc_nvme->n_dead = B_TRUE; 1354 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1355 } 1356 1357 return (EIO); 1358 } 1359 1360 static int 1361 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd) 1362 { 1363 nvme_cqe_t *cqe = &cmd->nc_cqe; 1364 1365 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1366 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1367 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1368 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1369 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1370 if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) { 1371 cmd->nc_nvme->n_dead = B_TRUE; 1372 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1373 } 1374 1375 return (EIO); 1376 } 1377 1378 static int 1379 nvme_check_integrity_cmd_status(nvme_cmd_t 
*cmd) 1380 { 1381 nvme_cqe_t *cqe = &cmd->nc_cqe; 1382 1383 switch (cqe->cqe_sf.sf_sc) { 1384 case NVME_CQE_SC_INT_NVM_WRITE: 1385 /* write fail */ 1386 /* TODO: post ereport */ 1387 if (cmd->nc_xfer != NULL) 1388 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1389 return (EIO); 1390 1391 case NVME_CQE_SC_INT_NVM_READ: 1392 /* read fail */ 1393 /* TODO: post ereport */ 1394 if (cmd->nc_xfer != NULL) 1395 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1396 return (EIO); 1397 1398 default: 1399 return (nvme_check_unknown_cmd_status(cmd)); 1400 } 1401 } 1402 1403 static int 1404 nvme_check_generic_cmd_status(nvme_cmd_t *cmd) 1405 { 1406 nvme_cqe_t *cqe = &cmd->nc_cqe; 1407 1408 switch (cqe->cqe_sf.sf_sc) { 1409 case NVME_CQE_SC_GEN_SUCCESS: 1410 return (0); 1411 1412 /* 1413 * Errors indicating a bug in the driver should cause a panic. 1414 */ 1415 case NVME_CQE_SC_GEN_INV_OPC: 1416 /* Invalid Command Opcode */ 1417 if (!cmd->nc_dontpanic) 1418 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1419 "programming error: invalid opcode in cmd %p", 1420 (void *)cmd); 1421 return (EINVAL); 1422 1423 case NVME_CQE_SC_GEN_INV_FLD: 1424 /* Invalid Field in Command */ 1425 if (!cmd->nc_dontpanic) 1426 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1427 "programming error: invalid field in cmd %p", 1428 (void *)cmd); 1429 return (EIO); 1430 1431 case NVME_CQE_SC_GEN_ID_CNFL: 1432 /* Command ID Conflict */ 1433 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1434 "cmd ID conflict in cmd %p", (void *)cmd); 1435 return (0); 1436 1437 case NVME_CQE_SC_GEN_INV_NS: 1438 /* Invalid Namespace or Format */ 1439 if (!cmd->nc_dontpanic) 1440 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1441 "programming error: invalid NS/format in cmd %p", 1442 (void *)cmd); 1443 return (EINVAL); 1444 1445 case NVME_CQE_SC_GEN_NVM_LBA_RANGE: 1446 /* LBA Out Of Range */ 1447 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1448 "LBA out of range in cmd %p", (void *)cmd); 1449 return (0); 1450 1451 /* 1452 * Non-fatal errors, handle gracefully. 1453 */ 1454 case NVME_CQE_SC_GEN_DATA_XFR_ERR: 1455 /* Data Transfer Error (DMA) */ 1456 /* TODO: post ereport */ 1457 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err); 1458 if (cmd->nc_xfer != NULL) 1459 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1460 return (EIO); 1461 1462 case NVME_CQE_SC_GEN_INTERNAL_ERR: 1463 /* 1464 * Internal Error. The spec (v1.0, section 4.5.1.2) says 1465 * detailed error information is returned as async event, 1466 * so we pretty much ignore the error here and handle it 1467 * in the async event handler. 1468 */ 1469 atomic_inc_32(&cmd->nc_nvme->n_internal_err); 1470 if (cmd->nc_xfer != NULL) 1471 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1472 return (EIO); 1473 1474 case NVME_CQE_SC_GEN_ABORT_REQUEST: 1475 /* 1476 * Command Abort Requested. This normally happens only when a 1477 * command times out. 1478 */ 1479 /* TODO: post ereport or change blkdev to handle this? 
*/ 1480 atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err); 1481 return (ECANCELED); 1482 1483 case NVME_CQE_SC_GEN_ABORT_PWRLOSS: 1484 /* Command Aborted due to Power Loss Notification */ 1485 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1486 cmd->nc_nvme->n_dead = B_TRUE; 1487 return (EIO); 1488 1489 case NVME_CQE_SC_GEN_ABORT_SQ_DEL: 1490 /* Command Aborted due to SQ Deletion */ 1491 atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del); 1492 return (EIO); 1493 1494 case NVME_CQE_SC_GEN_NVM_CAP_EXC: 1495 /* Capacity Exceeded */ 1496 atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); 1497 if (cmd->nc_xfer != NULL) 1498 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1499 return (EIO); 1500 1501 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: 1502 /* Namespace Not Ready */ 1503 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); 1504 if (cmd->nc_xfer != NULL) 1505 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1506 return (EIO); 1507 1508 default: 1509 return (nvme_check_unknown_cmd_status(cmd)); 1510 } 1511 } 1512 1513 static int 1514 nvme_check_specific_cmd_status(nvme_cmd_t *cmd) 1515 { 1516 nvme_cqe_t *cqe = &cmd->nc_cqe; 1517 1518 switch (cqe->cqe_sf.sf_sc) { 1519 case NVME_CQE_SC_SPC_INV_CQ: 1520 /* Completion Queue Invalid */ 1521 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE); 1522 atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err); 1523 return (EINVAL); 1524 1525 case NVME_CQE_SC_SPC_INV_QID: 1526 /* Invalid Queue Identifier */ 1527 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1528 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE || 1529 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE || 1530 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1531 atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err); 1532 return (EINVAL); 1533 1534 case NVME_CQE_SC_SPC_MAX_QSZ_EXC: 1535 /* Max Queue Size Exceeded */ 1536 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1537 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1538 atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc); 1539 return (EINVAL); 1540 1541 case NVME_CQE_SC_SPC_ABRT_CMD_EXC: 1542 /* Abort Command Limit Exceeded */ 1543 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT); 1544 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1545 "abort command limit exceeded in cmd %p", (void *)cmd); 1546 return (0); 1547 1548 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC: 1549 /* Async Event Request Limit Exceeded */ 1550 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT); 1551 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1552 "async event request limit exceeded in cmd %p", 1553 (void *)cmd); 1554 return (0); 1555 1556 case NVME_CQE_SC_SPC_INV_INT_VECT: 1557 /* Invalid Interrupt Vector */ 1558 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1559 atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect); 1560 return (EINVAL); 1561 1562 case NVME_CQE_SC_SPC_INV_LOG_PAGE: 1563 /* Invalid Log Page */ 1564 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); 1565 atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); 1566 return (EINVAL); 1567 1568 case NVME_CQE_SC_SPC_INV_FORMAT: 1569 /* Invalid Format */ 1570 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); 1571 atomic_inc_32(&cmd->nc_nvme->n_inv_format); 1572 if (cmd->nc_xfer != NULL) 1573 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1574 return (EINVAL); 1575 1576 case NVME_CQE_SC_SPC_INV_Q_DEL: 1577 /* Invalid Queue Deletion */ 1578 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1579 atomic_inc_32(&cmd->nc_nvme->n_inv_q_del); 1580 return (EINVAL); 1581 1582 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR: 1583 /* Conflicting Attributes */ 1584 
ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT || 1585 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1586 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1587 atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); 1588 if (cmd->nc_xfer != NULL) 1589 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1590 return (EINVAL); 1591 1592 case NVME_CQE_SC_SPC_NVM_INV_PROT: 1593 /* Invalid Protection Information */ 1594 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE || 1595 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1596 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1597 atomic_inc_32(&cmd->nc_nvme->n_inv_prot); 1598 if (cmd->nc_xfer != NULL) 1599 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1600 return (EINVAL); 1601 1602 case NVME_CQE_SC_SPC_NVM_READONLY: 1603 /* Write to Read Only Range */ 1604 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1605 atomic_inc_32(&cmd->nc_nvme->n_readonly); 1606 if (cmd->nc_xfer != NULL) 1607 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1608 return (EROFS); 1609 1610 case NVME_CQE_SC_SPC_INV_FW_SLOT: 1611 /* Invalid Firmware Slot */ 1612 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1613 return (EINVAL); 1614 1615 case NVME_CQE_SC_SPC_INV_FW_IMG: 1616 /* Invalid Firmware Image */ 1617 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1618 return (EINVAL); 1619 1620 case NVME_CQE_SC_SPC_FW_RESET: 1621 /* Conventional Reset Required */ 1622 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1623 return (0); 1624 1625 case NVME_CQE_SC_SPC_FW_NSSR: 1626 /* NVMe Subsystem Reset Required */ 1627 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1628 return (0); 1629 1630 case NVME_CQE_SC_SPC_FW_NEXT_RESET: 1631 /* Activation Requires Reset */ 1632 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1633 return (0); 1634 1635 case NVME_CQE_SC_SPC_FW_MTFA: 1636 /* Activation Requires Maximum Time Violation */ 1637 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1638 return (EAGAIN); 1639 1640 case NVME_CQE_SC_SPC_FW_PROHIBITED: 1641 /* Activation Prohibited */ 1642 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1643 return (EINVAL); 1644 1645 case NVME_CQE_SC_SPC_FW_OVERLAP: 1646 /* Overlapping Firmware Ranges */ 1647 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD); 1648 return (EINVAL); 1649 1650 default: 1651 return (nvme_check_unknown_cmd_status(cmd)); 1652 } 1653 } 1654 1655 static inline int 1656 nvme_check_cmd_status(nvme_cmd_t *cmd) 1657 { 1658 nvme_cqe_t *cqe = &cmd->nc_cqe; 1659 1660 /* 1661 * Take a shortcut if the controller is dead, or if 1662 * command status indicates no error. 
1663 */ 1664 if (cmd->nc_nvme->n_dead) 1665 return (EIO); 1666 1667 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1668 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS) 1669 return (0); 1670 1671 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) 1672 return (nvme_check_generic_cmd_status(cmd)); 1673 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 1674 return (nvme_check_specific_cmd_status(cmd)); 1675 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) 1676 return (nvme_check_integrity_cmd_status(cmd)); 1677 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) 1678 return (nvme_check_vendor_cmd_status(cmd)); 1679 1680 return (nvme_check_unknown_cmd_status(cmd)); 1681 } 1682 1683 static int 1684 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec) 1685 { 1686 nvme_t *nvme = abort_cmd->nc_nvme; 1687 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1688 nvme_abort_cmd_t ac = { 0 }; 1689 int ret = 0; 1690 1691 sema_p(&nvme->n_abort_sema); 1692 1693 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid; 1694 ac.b.ac_sqid = abort_cmd->nc_sqid; 1695 1696 cmd->nc_sqid = 0; 1697 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT; 1698 cmd->nc_callback = nvme_wakeup_cmd; 1699 cmd->nc_sqe.sqe_cdw10 = ac.r; 1700 1701 /* 1702 * Send the ABORT to the hardware. The ABORT command will return _after_ 1703 * the aborted command has completed (aborted or otherwise), but since 1704 * we still hold the aborted command's mutex its callback hasn't been 1705 * processed yet. 1706 */ 1707 nvme_admin_cmd(cmd, sec); 1708 sema_v(&nvme->n_abort_sema); 1709 1710 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 1711 dev_err(nvme->n_dip, CE_WARN, 1712 "!ABORT failed with sct = %x, sc = %x", 1713 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1714 atomic_inc_32(&nvme->n_abort_failed); 1715 } else { 1716 dev_err(nvme->n_dip, CE_WARN, 1717 "!ABORT of command %d/%d %ssuccessful", 1718 abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid, 1719 cmd->nc_cqe.cqe_dw0 & 1 ? "un" : ""); 1720 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0) 1721 atomic_inc_32(&nvme->n_cmd_aborted); 1722 } 1723 1724 nvme_free_cmd(cmd); 1725 return (ret); 1726 } 1727 1728 /* 1729 * nvme_wait_cmd -- wait for command completion or timeout 1730 * 1731 * In case of a serious error or a timeout of the abort command the hardware 1732 * will be declared dead and FMA will be notified. 1733 */ 1734 static void 1735 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec) 1736 { 1737 clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC); 1738 nvme_t *nvme = cmd->nc_nvme; 1739 nvme_reg_csts_t csts; 1740 nvme_qpair_t *qp; 1741 1742 ASSERT(mutex_owned(&cmd->nc_mutex)); 1743 1744 while (!cmd->nc_completed) { 1745 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) 1746 break; 1747 } 1748 1749 if (cmd->nc_completed) 1750 return; 1751 1752 /* 1753 * The command timed out. 1754 * 1755 * Check controller for fatal status, any errors associated with the 1756 * register or DMA handle, or for a double timeout (abort command timed 1757 * out). If necessary log a warning and call FMA. 
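 * In outline (mirroring the code below): if the controller reports a
 * fatal status, if the register or DMA handles have errors, or if the
 * timed-out command was itself an ABORT, the controller is declared
 * dead. Otherwise an ABORT is issued; if it succeeds we simply wait for
 * the aborted command to complete. A command that could not be aborted
 * is unqueued and, if it owns DMA memory, parked on the lost commands
 * list so that memory is never reused.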
1758 */ 1759 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1760 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, " 1761 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 1762 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs); 1763 atomic_inc_32(&nvme->n_cmd_timeout); 1764 1765 if (csts.b.csts_cfs || 1766 nvme_check_regs_hdl(nvme) || 1767 nvme_check_dma_hdl(cmd->nc_dma) || 1768 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) { 1769 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1770 nvme->n_dead = B_TRUE; 1771 } else if (nvme_abort_cmd(cmd, sec) == 0) { 1772 /* 1773 * If the abort succeeded the command should complete 1774 * immediately with an appropriate status. 1775 */ 1776 while (!cmd->nc_completed) 1777 cv_wait(&cmd->nc_cv, &cmd->nc_mutex); 1778 1779 return; 1780 } 1781 1782 qp = nvme->n_ioq[cmd->nc_sqid]; 1783 1784 mutex_enter(&qp->nq_mutex); 1785 (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 1786 mutex_exit(&qp->nq_mutex); 1787 1788 /* 1789 * As we don't know what the presumed dead hardware might still do with 1790 * the DMA memory, we'll put the command on the lost commands list if it 1791 * has any DMA memory. 1792 */ 1793 if (cmd->nc_dma != NULL) { 1794 mutex_enter(&nvme_lc_mutex); 1795 list_insert_head(&nvme_lost_cmds, cmd); 1796 mutex_exit(&nvme_lc_mutex); 1797 } 1798 } 1799 1800 static void 1801 nvme_wakeup_cmd(void *arg) 1802 { 1803 nvme_cmd_t *cmd = arg; 1804 1805 mutex_enter(&cmd->nc_mutex); 1806 cmd->nc_completed = B_TRUE; 1807 cv_signal(&cmd->nc_cv); 1808 mutex_exit(&cmd->nc_mutex); 1809 } 1810 1811 static void 1812 nvme_async_event_task(void *arg) 1813 { 1814 nvme_cmd_t *cmd = arg; 1815 nvme_t *nvme = cmd->nc_nvme; 1816 nvme_error_log_entry_t *error_log = NULL; 1817 nvme_health_log_t *health_log = NULL; 1818 nvme_nschange_list_t *nslist = NULL; 1819 size_t logsize = 0; 1820 nvme_async_event_t event; 1821 1822 /* 1823 * Check for errors associated with the async request itself. The only 1824 * command-specific error is "async event limit exceeded", which 1825 * indicates a programming error in the driver and causes a panic in 1826 * nvme_check_cmd_status(). 1827 * 1828 * Other possible errors are various scenarios where the async request 1829 * was aborted, or internal errors in the device. Internal errors are 1830 * reported to FMA, the command aborts need no special handling here. 1831 * 1832 * And finally, at least qemu nvme does not support async events, 1833 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we 1834 * will avoid posting async events. 1835 */ 1836 1837 if (nvme_check_cmd_status(cmd) != 0) { 1838 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1839 "!async event request returned failure, sct = %x, " 1840 "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct, 1841 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr, 1842 cmd->nc_cqe.cqe_sf.sf_m); 1843 1844 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1845 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) { 1846 cmd->nc_nvme->n_dead = B_TRUE; 1847 ddi_fm_service_impact(cmd->nc_nvme->n_dip, 1848 DDI_SERVICE_LOST); 1849 } 1850 1851 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1852 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC && 1853 cmd->nc_cqe.cqe_sf.sf_dnr == 1) { 1854 nvme->n_async_event_supported = B_FALSE; 1855 } 1856 1857 nvme_free_cmd(cmd); 1858 return; 1859 } 1860 1861 event.r = cmd->nc_cqe.cqe_dw0; 1862 1863 /* Clear CQE and re-submit the async request. 
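 * The event payload was already captured from cqe_dw0 above, so the
 * same command can be re-armed as the next outstanding async event
 * request before the potentially slow log page handling below is done.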
*/ 1864 bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t)); 1865 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 1866 1867 switch (event.b.ae_type) { 1868 case NVME_ASYNC_TYPE_ERROR: 1869 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { 1870 (void) nvme_get_logpage(nvme, B_FALSE, 1871 (void **)&error_log, &logsize, event.b.ae_logpage); 1872 } else { 1873 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1874 "async event reply: %d", event.b.ae_logpage); 1875 atomic_inc_32(&nvme->n_wrong_logpage); 1876 } 1877 1878 switch (event.b.ae_info) { 1879 case NVME_ASYNC_ERROR_INV_SQ: 1880 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1881 "invalid submission queue"); 1882 return; 1883 1884 case NVME_ASYNC_ERROR_INV_DBL: 1885 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1886 "invalid doorbell write value"); 1887 return; 1888 1889 case NVME_ASYNC_ERROR_DIAGFAIL: 1890 dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure"); 1891 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1892 nvme->n_dead = B_TRUE; 1893 atomic_inc_32(&nvme->n_diagfail_event); 1894 break; 1895 1896 case NVME_ASYNC_ERROR_PERSISTENT: 1897 dev_err(nvme->n_dip, CE_WARN, "!persistent internal " 1898 "device error"); 1899 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1900 nvme->n_dead = B_TRUE; 1901 atomic_inc_32(&nvme->n_persistent_event); 1902 break; 1903 1904 case NVME_ASYNC_ERROR_TRANSIENT: 1905 dev_err(nvme->n_dip, CE_WARN, "!transient internal " 1906 "device error"); 1907 /* TODO: send ereport */ 1908 atomic_inc_32(&nvme->n_transient_event); 1909 break; 1910 1911 case NVME_ASYNC_ERROR_FW_LOAD: 1912 dev_err(nvme->n_dip, CE_WARN, 1913 "!firmware image load error"); 1914 atomic_inc_32(&nvme->n_fw_load_event); 1915 break; 1916 } 1917 break; 1918 1919 case NVME_ASYNC_TYPE_HEALTH: 1920 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { 1921 (void) nvme_get_logpage(nvme, B_FALSE, 1922 (void **)&health_log, &logsize, event.b.ae_logpage, 1923 -1); 1924 } else { 1925 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1926 "async event reply: %d", event.b.ae_logpage); 1927 atomic_inc_32(&nvme->n_wrong_logpage); 1928 } 1929 1930 switch (event.b.ae_info) { 1931 case NVME_ASYNC_HEALTH_RELIABILITY: 1932 dev_err(nvme->n_dip, CE_WARN, 1933 "!device reliability compromised"); 1934 /* TODO: send ereport */ 1935 atomic_inc_32(&nvme->n_reliability_event); 1936 break; 1937 1938 case NVME_ASYNC_HEALTH_TEMPERATURE: 1939 dev_err(nvme->n_dip, CE_WARN, 1940 "!temperature above threshold"); 1941 /* TODO: send ereport */ 1942 atomic_inc_32(&nvme->n_temperature_event); 1943 break; 1944 1945 case NVME_ASYNC_HEALTH_SPARE: 1946 dev_err(nvme->n_dip, CE_WARN, 1947 "!spare space below threshold"); 1948 /* TODO: send ereport */ 1949 atomic_inc_32(&nvme->n_spare_event); 1950 break; 1951 } 1952 break; 1953 1954 case NVME_ASYNC_TYPE_NOTICE: 1955 switch (event.b.ae_info) { 1956 case NVME_ASYNC_NOTICE_NS_CHANGE: 1957 dev_err(nvme->n_dip, CE_NOTE, 1958 "namespace attribute change event, " 1959 "logpage = %x", event.b.ae_logpage); 1960 atomic_inc_32(&nvme->n_notice_event); 1961 1962 if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) 1963 break; 1964 1965 if (nvme_get_logpage(nvme, B_FALSE, (void **)&nslist, 1966 &logsize, event.b.ae_logpage, -1) != 0) { 1967 break; 1968 } 1969 1970 if (nslist->nscl_ns[0] == UINT32_MAX) { 1971 dev_err(nvme->n_dip, CE_CONT, 1972 "more than %u namespaces have changed.\n", 1973 NVME_NSCHANGE_LIST_SIZE); 1974 break; 1975 } 1976 1977 mutex_enter(&nvme->n_mgmt_mutex); 1978 for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) { 1979 uint32_t 
nsid = nslist->nscl_ns[i]; 1980 1981 if (nsid == 0) /* end of list */ 1982 break; 1983 1984 dev_err(nvme->n_dip, CE_NOTE, 1985 "!namespace %u (%s) has changed.", nsid, 1986 NVME_NSID2NS(nvme, nsid)->ns_name); 1987 1988 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) 1989 continue; 1990 1991 bd_state_change( 1992 NVME_NSID2NS(nvme, nsid)->ns_bd_hdl); 1993 } 1994 mutex_exit(&nvme->n_mgmt_mutex); 1995 1996 break; 1997 1998 case NVME_ASYNC_NOTICE_FW_ACTIVATE: 1999 dev_err(nvme->n_dip, CE_NOTE, 2000 "firmware activation starting, " 2001 "logpage = %x", event.b.ae_logpage); 2002 atomic_inc_32(&nvme->n_notice_event); 2003 break; 2004 2005 case NVME_ASYNC_NOTICE_TELEMETRY: 2006 dev_err(nvme->n_dip, CE_NOTE, 2007 "telemetry log changed, " 2008 "logpage = %x", event.b.ae_logpage); 2009 atomic_inc_32(&nvme->n_notice_event); 2010 break; 2011 2012 case NVME_ASYNC_NOTICE_NS_ASYMM: 2013 dev_err(nvme->n_dip, CE_NOTE, 2014 "asymmetric namespace access change, " 2015 "logpage = %x", event.b.ae_logpage); 2016 atomic_inc_32(&nvme->n_notice_event); 2017 break; 2018 2019 case NVME_ASYNC_NOTICE_LATENCYLOG: 2020 dev_err(nvme->n_dip, CE_NOTE, 2021 "predictable latency event aggregate log change, " 2022 "logpage = %x", event.b.ae_logpage); 2023 atomic_inc_32(&nvme->n_notice_event); 2024 break; 2025 2026 case NVME_ASYNC_NOTICE_LBASTATUS: 2027 dev_err(nvme->n_dip, CE_NOTE, 2028 "LBA status information alert, " 2029 "logpage = %x", event.b.ae_logpage); 2030 atomic_inc_32(&nvme->n_notice_event); 2031 break; 2032 2033 case NVME_ASYNC_NOTICE_ENDURANCELOG: 2034 dev_err(nvme->n_dip, CE_NOTE, 2035 "endurance group event aggregate log page change, " 2036 "logpage = %x", event.b.ae_logpage); 2037 atomic_inc_32(&nvme->n_notice_event); 2038 break; 2039 2040 default: 2041 dev_err(nvme->n_dip, CE_WARN, 2042 "!unknown notice async event received, " 2043 "info = %x, logpage = %x", event.b.ae_info, 2044 event.b.ae_logpage); 2045 atomic_inc_32(&nvme->n_unknown_event); 2046 break; 2047 } 2048 break; 2049 2050 case NVME_ASYNC_TYPE_VENDOR: 2051 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event " 2052 "received, info = %x, logpage = %x", event.b.ae_info, 2053 event.b.ae_logpage); 2054 atomic_inc_32(&nvme->n_vendor_event); 2055 break; 2056 2057 default: 2058 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, " 2059 "type = %x, info = %x, logpage = %x", event.b.ae_type, 2060 event.b.ae_info, event.b.ae_logpage); 2061 atomic_inc_32(&nvme->n_unknown_event); 2062 break; 2063 } 2064 2065 if (error_log != NULL) 2066 kmem_free(error_log, logsize); 2067 2068 if (health_log != NULL) 2069 kmem_free(health_log, logsize); 2070 2071 if (nslist != NULL) 2072 kmem_free(nslist, logsize); 2073 } 2074 2075 static void 2076 nvme_admin_cmd(nvme_cmd_t *cmd, int sec) 2077 { 2078 mutex_enter(&cmd->nc_mutex); 2079 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd); 2080 nvme_wait_cmd(cmd, sec); 2081 mutex_exit(&cmd->nc_mutex); 2082 } 2083 2084 static void 2085 nvme_async_event(nvme_t *nvme) 2086 { 2087 nvme_cmd_t *cmd; 2088 2089 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2090 cmd->nc_sqid = 0; 2091 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT; 2092 cmd->nc_callback = nvme_async_event_task; 2093 cmd->nc_dontpanic = B_TRUE; 2094 2095 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 2096 } 2097 2098 static int 2099 nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf, 2100 boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses) 2101 { 2102 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2103 nvme_format_nvm_t format_nvm = { 0 }; 2104 int 
ret; 2105 2106 format_nvm.b.fm_lbaf = lbaf & 0xf; 2107 format_nvm.b.fm_ms = ms ? 1 : 0; 2108 format_nvm.b.fm_pi = pi & 0x7; 2109 format_nvm.b.fm_pil = pil ? 1 : 0; 2110 format_nvm.b.fm_ses = ses & 0x7; 2111 2112 cmd->nc_sqid = 0; 2113 cmd->nc_callback = nvme_wakeup_cmd; 2114 cmd->nc_sqe.sqe_nsid = nsid; 2115 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; 2116 cmd->nc_sqe.sqe_cdw10 = format_nvm.r; 2117 2118 /* 2119 * Some devices like Samsung SM951 don't allow formatting of all 2120 * namespaces in one command. Handle that gracefully. 2121 */ 2122 if (nsid == (uint32_t)-1) 2123 cmd->nc_dontpanic = B_TRUE; 2124 /* 2125 * If this format request was initiated by the user, then don't allow a 2126 * programmer error to panic the system. 2127 */ 2128 if (user) 2129 cmd->nc_dontpanic = B_TRUE; 2130 2131 nvme_admin_cmd(cmd, nvme_format_cmd_timeout); 2132 2133 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2134 dev_err(nvme->n_dip, CE_WARN, 2135 "!FORMAT failed with sct = %x, sc = %x", 2136 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2137 } 2138 2139 nvme_free_cmd(cmd); 2140 return (ret); 2141 } 2142 2143 /* 2144 * The `bufsize` parameter is usually an output parameter, set by this routine 2145 * when filling in the supported types of logpages from the device. However, for 2146 * vendor-specific pages, it is an input parameter, and must be set 2147 * appropriately by callers. 2148 */ 2149 static int 2150 nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize, 2151 uint8_t logpage, ...) 2152 { 2153 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2154 nvme_getlogpage_t getlogpage = { 0 }; 2155 va_list ap; 2156 int ret; 2157 2158 va_start(ap, logpage); 2159 2160 cmd->nc_sqid = 0; 2161 cmd->nc_callback = nvme_wakeup_cmd; 2162 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE; 2163 2164 if (user) 2165 cmd->nc_dontpanic = B_TRUE; 2166 2167 getlogpage.b.lp_lid = logpage; 2168 2169 switch (logpage) { 2170 case NVME_LOGPAGE_ERROR: 2171 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2172 *bufsize = MIN(NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE, 2173 nvme->n_error_log_len * sizeof (nvme_error_log_entry_t)); 2174 break; 2175 2176 case NVME_LOGPAGE_HEALTH: 2177 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2178 *bufsize = sizeof (nvme_health_log_t); 2179 break; 2180 2181 case NVME_LOGPAGE_FWSLOT: 2182 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2183 *bufsize = sizeof (nvme_fwslot_log_t); 2184 break; 2185 2186 case NVME_LOGPAGE_NSCHANGE: 2187 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2188 *bufsize = sizeof (nvme_nschange_list_t); 2189 break; 2190 2191 default: 2192 /* 2193 * This intentionally only checks against the minimum valid 2194 * log page ID. `logpage` is a uint8_t, and `0xFF` is a valid 2195 * page ID, so this one-sided check avoids a compiler error 2196 * about a check that's always true. 
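 * For vendor-specific log pages the namespace ID is taken from the
 * variadic argument and *bufsize must have been set by the caller, as
 * described in the block comment above this function. A hypothetical
 * call for such a page might look like
 * nvme_get_logpage(nvme, B_TRUE, &buf, &bufsize, 0xCA, nsid), with
 * bufsize primed to the size the vendor documents for page 0xCA.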
2197 */ 2198 if (logpage < NVME_VENDOR_SPECIFIC_LOGPAGE_MIN) { 2199 dev_err(nvme->n_dip, CE_WARN, 2200 "!unknown log page requested: %d", logpage); 2201 atomic_inc_32(&nvme->n_unknown_logpage); 2202 ret = EINVAL; 2203 goto fail; 2204 } 2205 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2206 } 2207 2208 va_end(ap); 2209 2210 getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1; 2211 2212 cmd->nc_sqe.sqe_cdw10 = getlogpage.r; 2213 2214 if (nvme_zalloc_dma(nvme, *bufsize, 2215 DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2216 dev_err(nvme->n_dip, CE_WARN, 2217 "!nvme_zalloc_dma failed for GET LOG PAGE"); 2218 ret = ENOMEM; 2219 goto fail; 2220 } 2221 2222 if ((ret = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0) 2223 goto fail; 2224 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2225 2226 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2227 dev_err(nvme->n_dip, CE_WARN, 2228 "!GET LOG PAGE failed with sct = %x, sc = %x", 2229 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2230 goto fail; 2231 } 2232 2233 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2234 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2235 2236 fail: 2237 nvme_free_cmd(cmd); 2238 2239 return (ret); 2240 } 2241 2242 static int 2243 nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf) 2244 { 2245 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2246 int ret; 2247 2248 if (buf == NULL) 2249 return (EINVAL); 2250 2251 cmd->nc_sqid = 0; 2252 cmd->nc_callback = nvme_wakeup_cmd; 2253 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY; 2254 cmd->nc_sqe.sqe_nsid = nsid; 2255 cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL; 2256 2257 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ, 2258 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2259 dev_err(nvme->n_dip, CE_WARN, 2260 "!nvme_zalloc_dma failed for IDENTIFY"); 2261 ret = ENOMEM; 2262 goto fail; 2263 } 2264 2265 if (cmd->nc_dma->nd_ncookie > 2) { 2266 dev_err(nvme->n_dip, CE_WARN, 2267 "!too many DMA cookies for IDENTIFY"); 2268 atomic_inc_32(&nvme->n_too_many_cookies); 2269 ret = ENOMEM; 2270 goto fail; 2271 } 2272 2273 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; 2274 if (cmd->nc_dma->nd_ncookie > 1) { 2275 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2276 &cmd->nc_dma->nd_cookie); 2277 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2278 cmd->nc_dma->nd_cookie.dmac_laddress; 2279 } 2280 2281 if (user) 2282 cmd->nc_dontpanic = B_TRUE; 2283 2284 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2285 2286 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2287 dev_err(nvme->n_dip, CE_WARN, 2288 "!IDENTIFY failed with sct = %x, sc = %x", 2289 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2290 goto fail; 2291 } 2292 2293 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP); 2294 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE); 2295 2296 fail: 2297 nvme_free_cmd(cmd); 2298 2299 return (ret); 2300 } 2301 2302 static int 2303 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2304 uint32_t val, uint32_t *res) 2305 { 2306 _NOTE(ARGUNUSED(nsid)); 2307 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2308 int ret = EINVAL; 2309 2310 ASSERT(res != NULL); 2311 2312 cmd->nc_sqid = 0; 2313 cmd->nc_callback = nvme_wakeup_cmd; 2314 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES; 2315 cmd->nc_sqe.sqe_cdw10 = feature; 2316 cmd->nc_sqe.sqe_cdw11 = val; 2317 2318 if (user) 2319 cmd->nc_dontpanic = B_TRUE; 2320 2321 switch (feature) { 2322 case NVME_FEAT_WRITE_CACHE: 2323 if 
(!nvme->n_write_cache_present) 2324 goto fail; 2325 break; 2326 2327 case NVME_FEAT_NQUEUES: 2328 break; 2329 2330 default: 2331 goto fail; 2332 } 2333 2334 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2335 2336 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2337 dev_err(nvme->n_dip, CE_WARN, 2338 "!SET FEATURES %d failed with sct = %x, sc = %x", 2339 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2340 cmd->nc_cqe.cqe_sf.sf_sc); 2341 goto fail; 2342 } 2343 2344 *res = cmd->nc_cqe.cqe_dw0; 2345 2346 fail: 2347 nvme_free_cmd(cmd); 2348 return (ret); 2349 } 2350 2351 static int 2352 nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2353 uint32_t *res, void **buf, size_t *bufsize) 2354 { 2355 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2356 int ret = EINVAL; 2357 2358 ASSERT(res != NULL); 2359 2360 if (bufsize != NULL) 2361 *bufsize = 0; 2362 2363 cmd->nc_sqid = 0; 2364 cmd->nc_callback = nvme_wakeup_cmd; 2365 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES; 2366 cmd->nc_sqe.sqe_cdw10 = feature; 2367 cmd->nc_sqe.sqe_cdw11 = *res; 2368 2369 /* 2370 * For some of the optional features there doesn't seem to be a method 2371 * of detecting whether it is supported other than using it. This will 2372 * cause "Invalid Field in Command" error, which is normally considered 2373 * a programming error. Set the nc_dontpanic flag to override the panic 2374 * in nvme_check_generic_cmd_status(). 2375 */ 2376 switch (feature) { 2377 case NVME_FEAT_ARBITRATION: 2378 case NVME_FEAT_POWER_MGMT: 2379 case NVME_FEAT_TEMPERATURE: 2380 case NVME_FEAT_ERROR: 2381 case NVME_FEAT_NQUEUES: 2382 case NVME_FEAT_INTR_COAL: 2383 case NVME_FEAT_INTR_VECT: 2384 case NVME_FEAT_WRITE_ATOM: 2385 case NVME_FEAT_ASYNC_EVENT: 2386 break; 2387 2388 case NVME_FEAT_WRITE_CACHE: 2389 if (!nvme->n_write_cache_present) 2390 goto fail; 2391 break; 2392 2393 case NVME_FEAT_LBA_RANGE: 2394 if (!nvme->n_lba_range_supported) 2395 goto fail; 2396 2397 cmd->nc_dontpanic = B_TRUE; 2398 cmd->nc_sqe.sqe_nsid = nsid; 2399 ASSERT(bufsize != NULL); 2400 *bufsize = NVME_LBA_RANGE_BUFSIZE; 2401 break; 2402 2403 case NVME_FEAT_AUTO_PST: 2404 if (!nvme->n_auto_pst_supported) 2405 goto fail; 2406 2407 ASSERT(bufsize != NULL); 2408 *bufsize = NVME_AUTO_PST_BUFSIZE; 2409 break; 2410 2411 case NVME_FEAT_PROGRESS: 2412 if (!nvme->n_progress_supported) 2413 goto fail; 2414 2415 cmd->nc_dontpanic = B_TRUE; 2416 break; 2417 2418 default: 2419 goto fail; 2420 } 2421 2422 if (user) 2423 cmd->nc_dontpanic = B_TRUE; 2424 2425 if (bufsize != NULL && *bufsize != 0) { 2426 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, 2427 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2428 dev_err(nvme->n_dip, CE_WARN, 2429 "!nvme_zalloc_dma failed for GET FEATURES"); 2430 ret = ENOMEM; 2431 goto fail; 2432 } 2433 2434 if (cmd->nc_dma->nd_ncookie > 2) { 2435 dev_err(nvme->n_dip, CE_WARN, 2436 "!too many DMA cookies for GET FEATURES"); 2437 atomic_inc_32(&nvme->n_too_many_cookies); 2438 ret = ENOMEM; 2439 goto fail; 2440 } 2441 2442 cmd->nc_sqe.sqe_dptr.d_prp[0] = 2443 cmd->nc_dma->nd_cookie.dmac_laddress; 2444 if (cmd->nc_dma->nd_ncookie > 1) { 2445 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2446 &cmd->nc_dma->nd_cookie); 2447 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2448 cmd->nc_dma->nd_cookie.dmac_laddress; 2449 } 2450 } 2451 2452 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2453 2454 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2455 boolean_t known = B_TRUE; 2456 2457 /* Check if this is unsupported optional feature */ 2458 if (cmd->nc_cqe.cqe_sf.sf_sct == 
NVME_CQE_SCT_GENERIC &&
2459 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
2460 switch (feature) {
2461 case NVME_FEAT_LBA_RANGE:
2462 nvme->n_lba_range_supported = B_FALSE;
2463 break;
2464 case NVME_FEAT_PROGRESS:
2465 nvme->n_progress_supported = B_FALSE;
2466 break;
2467 default:
2468 known = B_FALSE;
2469 break;
2470 }
2471 } else {
2472 known = B_FALSE;
2473 }
2474
2475 /* Report the error otherwise */
2476 if (!known) {
2477 dev_err(nvme->n_dip, CE_WARN,
2478 "!GET FEATURES %d failed with sct = %x, sc = %x",
2479 feature, cmd->nc_cqe.cqe_sf.sf_sct,
2480 cmd->nc_cqe.cqe_sf.sf_sc);
2481 }
2482
2483 goto fail;
2484 }
2485
2486 if (bufsize != NULL && *bufsize != 0) {
2487 ASSERT(buf != NULL);
2488 *buf = kmem_alloc(*bufsize, KM_SLEEP);
2489 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
2490 }
2491
2492 *res = cmd->nc_cqe.cqe_dw0;
2493
2494 fail:
2495 nvme_free_cmd(cmd);
2496 return (ret);
2497 }
2498
2499 static int
2500 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2501 {
2502 nvme_write_cache_t nwc = { 0 };
2503
2504 if (enable)
2505 nwc.b.wc_wce = 1;
2506
2507 return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE,
2508 nwc.r, &nwc.r));
2509 }
2510
2511 static int
2512 nvme_set_nqueues(nvme_t *nvme)
2513 {
2514 nvme_nqueues_t nq = { 0 };
2515 int ret;
2516
2517 /*
2518 * The default is to allocate one completion queue per interrupt vector.
2519 */
2520 if (nvme->n_completion_queues == -1)
2521 nvme->n_completion_queues = nvme->n_intr_cnt;
2522
2523 /*
2524 * There is no point in having more completion queues than
2525 * interrupt vectors.
2526 */
2527 nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2528 nvme->n_intr_cnt);
2529
2530 /*
2531 * The default is to use one submission queue per completion queue.
2532 */
2533 if (nvme->n_submission_queues == -1)
2534 nvme->n_submission_queues = nvme->n_completion_queues;
2535
2536 /*
2537 * There is no point in having more completion queues than
2538 * submission queues.
2539 */
2540 nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2541 nvme->n_submission_queues);
2542
2543 ASSERT(nvme->n_submission_queues > 0);
2544 ASSERT(nvme->n_completion_queues > 0);
2545
2546 nq.b.nq_nsq = nvme->n_submission_queues - 1;
2547 nq.b.nq_ncq = nvme->n_completion_queues - 1;
2548
2549 ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
2550 &nq.r);
2551
2552 if (ret == 0) {
2553 /*
2554 * Never use more than the requested number of queues.
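 * The completion's dw0 (copied into nq.r by nvme_set_features())
 * reports the 0-based queue counts the controller actually allocated.
 * As a worked example with made-up numbers: if 8 submission and 8
 * completion queues were requested (nq_nsq = nq_ncq = 7) and the
 * controller answers 3/3, both n_submission_queues and
 * n_completion_queues are clamped to 4 below.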
2555 */ 2556 nvme->n_submission_queues = MIN(nvme->n_submission_queues, 2557 nq.b.nq_nsq + 1); 2558 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2559 nq.b.nq_ncq + 1); 2560 } 2561 2562 return (ret); 2563 } 2564 2565 static int 2566 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq) 2567 { 2568 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2569 nvme_create_queue_dw10_t dw10 = { 0 }; 2570 nvme_create_cq_dw11_t c_dw11 = { 0 }; 2571 int ret; 2572 2573 dw10.b.q_qid = cq->ncq_id; 2574 dw10.b.q_qsize = cq->ncq_nentry - 1; 2575 2576 c_dw11.b.cq_pc = 1; 2577 c_dw11.b.cq_ien = 1; 2578 c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt; 2579 2580 cmd->nc_sqid = 0; 2581 cmd->nc_callback = nvme_wakeup_cmd; 2582 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 2583 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2584 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 2585 cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress; 2586 2587 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2588 2589 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2590 dev_err(nvme->n_dip, CE_WARN, 2591 "!CREATE CQUEUE failed with sct = %x, sc = %x", 2592 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2593 } 2594 2595 nvme_free_cmd(cmd); 2596 2597 return (ret); 2598 } 2599 2600 static int 2601 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 2602 { 2603 nvme_cq_t *cq = qp->nq_cq; 2604 nvme_cmd_t *cmd; 2605 nvme_create_queue_dw10_t dw10 = { 0 }; 2606 nvme_create_sq_dw11_t s_dw11 = { 0 }; 2607 int ret; 2608 2609 /* 2610 * It is possible to have more qpairs than completion queues, 2611 * and when the idx > ncq_id, that completion queue is shared 2612 * and has already been created. 2613 */ 2614 if (idx <= cq->ncq_id && 2615 nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS) 2616 return (DDI_FAILURE); 2617 2618 dw10.b.q_qid = idx; 2619 dw10.b.q_qsize = qp->nq_nentry - 1; 2620 2621 s_dw11.b.sq_pc = 1; 2622 s_dw11.b.sq_cqid = cq->ncq_id; 2623 2624 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2625 cmd->nc_sqid = 0; 2626 cmd->nc_callback = nvme_wakeup_cmd; 2627 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 2628 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2629 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 2630 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 2631 2632 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2633 2634 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2635 dev_err(nvme->n_dip, CE_WARN, 2636 "!CREATE SQUEUE failed with sct = %x, sc = %x", 2637 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2638 } 2639 2640 nvme_free_cmd(cmd); 2641 2642 return (ret); 2643 } 2644 2645 static boolean_t 2646 nvme_reset(nvme_t *nvme, boolean_t quiesce) 2647 { 2648 nvme_reg_csts_t csts; 2649 int i; 2650 2651 nvme_put32(nvme, NVME_REG_CC, 0); 2652 2653 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2654 if (csts.b.csts_rdy == 1) { 2655 nvme_put32(nvme, NVME_REG_CC, 0); 2656 for (i = 0; i != nvme->n_timeout * 10; i++) { 2657 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2658 if (csts.b.csts_rdy == 0) 2659 break; 2660 2661 if (quiesce) 2662 drv_usecwait(50000); 2663 else 2664 delay(drv_usectohz(50000)); 2665 } 2666 } 2667 2668 nvme_put32(nvme, NVME_REG_AQA, 0); 2669 nvme_put32(nvme, NVME_REG_ASQ, 0); 2670 nvme_put32(nvme, NVME_REG_ACQ, 0); 2671 2672 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2673 return (csts.b.csts_rdy == 0 ? 
B_TRUE : B_FALSE); 2674 } 2675 2676 static void 2677 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) 2678 { 2679 nvme_reg_cc_t cc; 2680 nvme_reg_csts_t csts; 2681 int i; 2682 2683 ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); 2684 2685 cc.r = nvme_get32(nvme, NVME_REG_CC); 2686 cc.b.cc_shn = mode & 0x3; 2687 nvme_put32(nvme, NVME_REG_CC, cc.r); 2688 2689 for (i = 0; i != 10; i++) { 2690 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2691 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 2692 break; 2693 2694 if (quiesce) 2695 drv_usecwait(100000); 2696 else 2697 delay(drv_usectohz(100000)); 2698 } 2699 } 2700 2701 2702 static void 2703 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 2704 { 2705 /* 2706 * Section 7.7 of the spec describes how to get a unique ID for 2707 * the controller: the vendor ID, the model name and the serial 2708 * number shall be unique when combined. 2709 * 2710 * If a namespace has no EUI64 we use the above and add the hex 2711 * namespace ID to get a unique ID for the namespace. 2712 */ 2713 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2714 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 2715 2716 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2717 bcopy(nvme->n_idctl->id_serial, serial, 2718 sizeof (nvme->n_idctl->id_serial)); 2719 2720 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2721 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 2722 2723 NVME_NSID2NS(nvme, nsid)->ns_devid = kmem_asprintf("%4X-%s-%s-%X", 2724 nvme->n_idctl->id_vid, model, serial, nsid); 2725 } 2726 2727 static int 2728 nvme_init_ns(nvme_t *nvme, int nsid) 2729 { 2730 nvme_namespace_t *ns = NVME_NSID2NS(nvme, nsid); 2731 nvme_identify_nsid_t *idns; 2732 boolean_t was_ignored; 2733 int last_rp; 2734 2735 ns->ns_nvme = nvme; 2736 2737 ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex)); 2738 2739 if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { 2740 dev_err(nvme->n_dip, CE_WARN, 2741 "!failed to identify namespace %d", nsid); 2742 return (DDI_FAILURE); 2743 } 2744 2745 if (ns->ns_idns != NULL) 2746 kmem_free(ns->ns_idns, sizeof (nvme_identify_nsid_t)); 2747 2748 ns->ns_idns = idns; 2749 ns->ns_id = nsid; 2750 ns->ns_block_count = idns->id_nsize; 2751 ns->ns_block_size = 2752 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2753 ns->ns_best_block_size = ns->ns_block_size; 2754 2755 /* 2756 * Get the EUI64 if present. Use it for devid and device node names. 2757 */ 2758 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2759 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); 2760 2761 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 2762 if (*(uint64_t *)ns->ns_eui64 != 0) { 2763 uint8_t *eui64 = ns->ns_eui64; 2764 2765 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), 2766 "%02x%02x%02x%02x%02x%02x%02x%02x", 2767 eui64[0], eui64[1], eui64[2], eui64[3], 2768 eui64[4], eui64[5], eui64[6], eui64[7]); 2769 } else { 2770 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", 2771 ns->ns_id); 2772 2773 nvme_prepare_devid(nvme, ns->ns_id); 2774 } 2775 2776 /* 2777 * Find the LBA format with no metadata and the best relative 2778 * performance. A value of 3 means "degraded", 0 is best. 
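 * As a worked example with made-up Identify Namespace data: given
 * LBAF 0 = { lbads = 9, ms = 0, rp = 2 } and LBAF 1 = { lbads = 12,
 * ms = 0, rp = 0 }, the loop below first selects the 512 byte format
 * and then replaces it with the better-performing 4096 byte one, so
 * ns_best_block_size ends up as 4096 (subject to the n_min_block_size
 * floor applied afterwards).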
2779 */ 2780 last_rp = 3; 2781 for (int j = 0; j <= idns->id_nlbaf; j++) { 2782 if (idns->id_lbaf[j].lbaf_lbads == 0) 2783 break; 2784 if (idns->id_lbaf[j].lbaf_ms != 0) 2785 continue; 2786 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 2787 continue; 2788 last_rp = idns->id_lbaf[j].lbaf_rp; 2789 ns->ns_best_block_size = 2790 1 << idns->id_lbaf[j].lbaf_lbads; 2791 } 2792 2793 if (ns->ns_best_block_size < nvme->n_min_block_size) 2794 ns->ns_best_block_size = nvme->n_min_block_size; 2795 2796 was_ignored = ns->ns_ignore; 2797 2798 /* 2799 * We currently don't support namespaces that use either: 2800 * - protection information 2801 * - illegal block size (< 512) 2802 */ 2803 if (idns->id_dps.dp_pinfo) { 2804 dev_err(nvme->n_dip, CE_WARN, 2805 "!ignoring namespace %d, unsupported feature: " 2806 "pinfo = %d", nsid, idns->id_dps.dp_pinfo); 2807 ns->ns_ignore = B_TRUE; 2808 } else if (ns->ns_block_size < 512) { 2809 dev_err(nvme->n_dip, CE_WARN, 2810 "!ignoring namespace %d, unsupported block size %"PRIu64, 2811 nsid, (uint64_t)ns->ns_block_size); 2812 ns->ns_ignore = B_TRUE; 2813 } else { 2814 ns->ns_ignore = B_FALSE; 2815 } 2816 2817 /* 2818 * Keep a count of namespaces which are attachable. 2819 * See comments in nvme_bd_driveinfo() to understand its effect. 2820 */ 2821 if (was_ignored) { 2822 /* 2823 * Previously ignored, but now not. Count it. 2824 */ 2825 if (!ns->ns_ignore) 2826 nvme->n_namespaces_attachable++; 2827 } else { 2828 /* 2829 * Wasn't ignored previously, but now needs to be. 2830 * Discount it. 2831 */ 2832 if (ns->ns_ignore) 2833 nvme->n_namespaces_attachable--; 2834 } 2835 2836 return (DDI_SUCCESS); 2837 } 2838 2839 static int 2840 nvme_attach_ns(nvme_t *nvme, int nsid) 2841 { 2842 nvme_namespace_t *ns = NVME_NSID2NS(nvme, nsid); 2843 2844 ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex)); 2845 2846 if (ns->ns_ignore) 2847 return (ENOTSUP); 2848 2849 if (ns->ns_bd_hdl == NULL) { 2850 bd_ops_t ops = nvme_bd_ops; 2851 2852 if (!nvme->n_idctl->id_oncs.on_dset_mgmt) 2853 ops.o_free_space = NULL; 2854 2855 ns->ns_bd_hdl = bd_alloc_handle(ns, &ops, &nvme->n_prp_dma_attr, 2856 KM_SLEEP); 2857 2858 if (ns->ns_bd_hdl == NULL) { 2859 dev_err(nvme->n_dip, CE_WARN, "!Failed to get blkdev " 2860 "handle for namespace id %d", nsid); 2861 return (EINVAL); 2862 } 2863 } 2864 2865 if (bd_attach_handle(nvme->n_dip, ns->ns_bd_hdl) != DDI_SUCCESS) 2866 return (EBUSY); 2867 2868 ns->ns_attached = B_TRUE; 2869 2870 return (0); 2871 } 2872 2873 static int 2874 nvme_detach_ns(nvme_t *nvme, int nsid) 2875 { 2876 nvme_namespace_t *ns = NVME_NSID2NS(nvme, nsid); 2877 int rv; 2878 2879 ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex)); 2880 2881 if (ns->ns_ignore || !ns->ns_attached) 2882 return (0); 2883 2884 ASSERT(ns->ns_bd_hdl != NULL); 2885 rv = bd_detach_handle(ns->ns_bd_hdl); 2886 if (rv != DDI_SUCCESS) 2887 return (EBUSY); 2888 else 2889 ns->ns_attached = B_FALSE; 2890 2891 return (0); 2892 } 2893 2894 static int 2895 nvme_init(nvme_t *nvme) 2896 { 2897 nvme_reg_cc_t cc = { 0 }; 2898 nvme_reg_aqa_t aqa = { 0 }; 2899 nvme_reg_asq_t asq = { 0 }; 2900 nvme_reg_acq_t acq = { 0 }; 2901 nvme_reg_cap_t cap; 2902 nvme_reg_vs_t vs; 2903 nvme_reg_csts_t csts; 2904 int i = 0; 2905 uint16_t nqueues; 2906 uint_t tq_threads; 2907 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2908 char *vendor, *product; 2909 2910 /* Check controller version */ 2911 vs.r = nvme_get32(nvme, NVME_REG_VS); 2912 nvme->n_version.v_major = vs.b.vs_mjr; 2913 nvme->n_version.v_minor = vs.b.vs_mnr; 2914 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version 
%d.%d", 2915 nvme->n_version.v_major, nvme->n_version.v_minor); 2916 2917 if (nvme->n_version.v_major > nvme_version_major) { 2918 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x", 2919 nvme_version_major); 2920 if (nvme->n_strict_version) 2921 goto fail; 2922 } 2923 2924 /* retrieve controller configuration */ 2925 cap.r = nvme_get64(nvme, NVME_REG_CAP); 2926 2927 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 2928 dev_err(nvme->n_dip, CE_WARN, 2929 "!NVM command set not supported by hardware"); 2930 goto fail; 2931 } 2932 2933 nvme->n_nssr_supported = cap.b.cap_nssrs; 2934 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 2935 nvme->n_timeout = cap.b.cap_to; 2936 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 2937 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 2938 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 2939 2940 /* 2941 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 2942 * the base page size of 4k (1<<12), so add 12 here to get the real 2943 * page size value. 2944 */ 2945 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 2946 cap.b.cap_mpsmax + 12); 2947 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 2948 2949 /* 2950 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 2951 */ 2952 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 2953 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2954 2955 /* 2956 * Set up PRP DMA to transfer 1 page-aligned page at a time. 2957 * Maxxfer may be increased after we identified the controller limits. 2958 */ 2959 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 2960 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2961 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 2962 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 2963 2964 /* 2965 * Reset controller if it's still in ready state. 2966 */ 2967 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 2968 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 2969 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2970 nvme->n_dead = B_TRUE; 2971 goto fail; 2972 } 2973 2974 /* 2975 * Create the cq array with one completion queue to be assigned 2976 * to the admin queue pair and a limited number of taskqs (4). 2977 */ 2978 if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) != 2979 DDI_SUCCESS) { 2980 dev_err(nvme->n_dip, CE_WARN, 2981 "!failed to pre-allocate admin completion queue"); 2982 goto fail; 2983 } 2984 /* 2985 * Create the admin queue pair. 
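 * It occupies slot 0 of the I/O queue array and uses the completion
 * queue pre-allocated above; its 0-based length and the physical
 * addresses of its rings are programmed into AQA, ASQ, and ACQ below
 * before the controller is enabled.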
2986 */ 2987 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 2988 != DDI_SUCCESS) { 2989 dev_err(nvme->n_dip, CE_WARN, 2990 "!unable to allocate admin qpair"); 2991 goto fail; 2992 } 2993 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 2994 nvme->n_ioq[0] = nvme->n_adminq; 2995 2996 nvme->n_progress |= NVME_ADMIN_QUEUE; 2997 2998 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2999 "admin-queue-len", nvme->n_admin_queue_len); 3000 3001 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 3002 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 3003 acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress; 3004 3005 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 3006 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 3007 3008 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 3009 nvme_put64(nvme, NVME_REG_ASQ, asq); 3010 nvme_put64(nvme, NVME_REG_ACQ, acq); 3011 3012 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 3013 cc.b.cc_css = 0; /* use NVM command set */ 3014 cc.b.cc_mps = nvme->n_pageshift - 12; 3015 cc.b.cc_shn = 0; /* no shutdown in progress */ 3016 cc.b.cc_en = 1; /* enable controller */ 3017 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 3018 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 3019 3020 nvme_put32(nvme, NVME_REG_CC, cc.r); 3021 3022 /* 3023 * Wait for the controller to become ready. 3024 */ 3025 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 3026 if (csts.b.csts_rdy == 0) { 3027 for (i = 0; i != nvme->n_timeout * 10; i++) { 3028 delay(drv_usectohz(50000)); 3029 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 3030 3031 if (csts.b.csts_cfs == 1) { 3032 dev_err(nvme->n_dip, CE_WARN, 3033 "!controller fatal status at init"); 3034 ddi_fm_service_impact(nvme->n_dip, 3035 DDI_SERVICE_LOST); 3036 nvme->n_dead = B_TRUE; 3037 goto fail; 3038 } 3039 3040 if (csts.b.csts_rdy == 1) 3041 break; 3042 } 3043 } 3044 3045 if (csts.b.csts_rdy == 0) { 3046 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 3047 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 3048 nvme->n_dead = B_TRUE; 3049 goto fail; 3050 } 3051 3052 /* 3053 * Assume an abort command limit of 1. We'll destroy and re-init 3054 * that later when we know the true abort command limit. 3055 */ 3056 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 3057 3058 /* 3059 * Set up initial interrupt for admin queue. 3060 */ 3061 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 3062 != DDI_SUCCESS) && 3063 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 3064 != DDI_SUCCESS) && 3065 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 3066 != DDI_SUCCESS)) { 3067 dev_err(nvme->n_dip, CE_WARN, 3068 "!failed to setup initial interrupt"); 3069 goto fail; 3070 } 3071 3072 /* 3073 * Post an asynchronous event command to catch errors. 3074 * We assume the asynchronous events are supported as required by 3075 * specification (Figure 40 in section 5 of NVMe 1.2). 3076 * However, since at least qemu does not follow the specification, 3077 * we need a mechanism to protect ourselves. 
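 * n_async_event_supported is therefore set optimistically here and is
 * cleared again in nvme_async_event_task() if the controller rejects
 * the request with Invalid Opcode and DNR set.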
3078 */ 3079 nvme->n_async_event_supported = B_TRUE; 3080 nvme_async_event(nvme); 3081 3082 /* 3083 * Identify Controller 3084 */ 3085 if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) { 3086 dev_err(nvme->n_dip, CE_WARN, 3087 "!failed to identify controller"); 3088 goto fail; 3089 } 3090 3091 /* 3092 * Get Vendor & Product ID 3093 */ 3094 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 3095 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 3096 sata_split_model(model, &vendor, &product); 3097 3098 if (vendor == NULL) 3099 nvme->n_vendor = strdup("NVMe"); 3100 else 3101 nvme->n_vendor = strdup(vendor); 3102 3103 nvme->n_product = strdup(product); 3104 3105 /* 3106 * Get controller limits. 3107 */ 3108 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 3109 MIN(nvme->n_admin_queue_len / 10, 3110 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 3111 3112 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3113 "async-event-limit", nvme->n_async_event_limit); 3114 3115 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 3116 3117 /* 3118 * Reinitialize the semaphore with the true abort command limit 3119 * supported by the hardware. It's not necessary to disable interrupts 3120 * as only command aborts use the semaphore, and no commands are 3121 * executed or aborted while we're here. 3122 */ 3123 sema_destroy(&nvme->n_abort_sema); 3124 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 3125 SEMA_DRIVER, NULL); 3126 3127 nvme->n_progress |= NVME_CTRL_LIMITS; 3128 3129 if (nvme->n_idctl->id_mdts == 0) 3130 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 3131 else 3132 nvme->n_max_data_transfer_size = 3133 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 3134 3135 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 3136 3137 /* 3138 * Limit n_max_data_transfer_size to what we can handle in one PRP. 3139 * Chained PRPs are currently unsupported. 3140 * 3141 * This is a no-op on hardware which doesn't support a transfer size 3142 * big enough to require chained PRPs. 3143 */ 3144 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 3145 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 3146 3147 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 3148 3149 /* 3150 * Make sure the minimum/maximum queue entry sizes are not 3151 * larger/smaller than the default. 3152 */ 3153 3154 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 3155 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 3156 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 3157 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 3158 goto fail; 3159 3160 /* 3161 * Check for the presence of a Volatile Write Cache. If present, 3162 * enable or disable based on the value of the property 3163 * volatile-write-cache-enable (default is enabled). 3164 */ 3165 nvme->n_write_cache_present = 3166 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE; 3167 3168 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3169 "volatile-write-cache-present", 3170 nvme->n_write_cache_present ? 1 : 0); 3171 3172 if (!nvme->n_write_cache_present) { 3173 nvme->n_write_cache_enabled = B_FALSE; 3174 } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled) 3175 != 0) { 3176 dev_err(nvme->n_dip, CE_WARN, 3177 "!failed to %sable volatile write cache", 3178 nvme->n_write_cache_enabled ? 
"en" : "dis"); 3179 /* 3180 * Assume the cache is (still) enabled. 3181 */ 3182 nvme->n_write_cache_enabled = B_TRUE; 3183 } 3184 3185 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3186 "volatile-write-cache-enable", 3187 nvme->n_write_cache_enabled ? 1 : 0); 3188 3189 /* 3190 * Assume LBA Range Type feature is supported. If it isn't this 3191 * will be set to B_FALSE by nvme_get_features(). 3192 */ 3193 nvme->n_lba_range_supported = B_TRUE; 3194 3195 /* 3196 * Check support for Autonomous Power State Transition. 3197 */ 3198 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 3199 nvme->n_auto_pst_supported = 3200 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE; 3201 3202 /* 3203 * Assume Software Progress Marker feature is supported. If it isn't 3204 * this will be set to B_FALSE by nvme_get_features(). 3205 */ 3206 nvme->n_progress_supported = B_TRUE; 3207 3208 /* 3209 * Identify Namespaces 3210 */ 3211 nvme->n_namespace_count = nvme->n_idctl->id_nn; 3212 3213 if (nvme->n_namespace_count == 0) { 3214 dev_err(nvme->n_dip, CE_WARN, 3215 "!controllers without namespaces are not supported"); 3216 goto fail; 3217 } 3218 3219 if (nvme->n_namespace_count > NVME_MINOR_MAX) { 3220 dev_err(nvme->n_dip, CE_WARN, 3221 "!too many namespaces: %d, limiting to %d\n", 3222 nvme->n_namespace_count, NVME_MINOR_MAX); 3223 nvme->n_namespace_count = NVME_MINOR_MAX; 3224 } 3225 3226 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 3227 nvme->n_namespace_count, KM_SLEEP); 3228 3229 /* 3230 * Try to set up MSI/MSI-X interrupts. 3231 */ 3232 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 3233 != 0) { 3234 nvme_release_interrupts(nvme); 3235 3236 nqueues = MIN(UINT16_MAX, ncpus); 3237 3238 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 3239 nqueues) != DDI_SUCCESS) && 3240 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 3241 nqueues) != DDI_SUCCESS)) { 3242 dev_err(nvme->n_dip, CE_WARN, 3243 "!failed to setup MSI/MSI-X interrupts"); 3244 goto fail; 3245 } 3246 } 3247 3248 /* 3249 * Create I/O queue pairs. 3250 */ 3251 3252 if (nvme_set_nqueues(nvme) != 0) { 3253 dev_err(nvme->n_dip, CE_WARN, 3254 "!failed to set number of I/O queues to %d", 3255 nvme->n_intr_cnt); 3256 goto fail; 3257 } 3258 3259 /* 3260 * Reallocate I/O queue array 3261 */ 3262 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 3263 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 3264 (nvme->n_submission_queues + 1), KM_SLEEP); 3265 nvme->n_ioq[0] = nvme->n_adminq; 3266 3267 /* 3268 * There should always be at least as many submission queues 3269 * as completion queues. 3270 */ 3271 ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues); 3272 3273 nvme->n_ioq_count = nvme->n_submission_queues; 3274 3275 nvme->n_io_squeue_len = 3276 MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries); 3277 3278 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len", 3279 nvme->n_io_squeue_len); 3280 3281 /* 3282 * Pre-allocate completion queues. 3283 * When there are the same number of submission and completion 3284 * queues there is no value in having a larger completion 3285 * queue length. 
3286 */
3287 if (nvme->n_submission_queues == nvme->n_completion_queues)
3288 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
3289 nvme->n_io_squeue_len);
3290
3291 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
3292 nvme->n_max_queue_entries);
3293
3294 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
3295 nvme->n_io_cqueue_len);
3296
3297 /*
3298 * Assign an equal number of taskq threads to each completion
3299 * queue, capping the total number of threads at the number
3300 * of CPUs.
3301 */
3302 tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;
3303
3304 /*
3305 * In case the calculation above is zero, we need at least one
3306 * thread per completion queue.
3307 */
3308 tq_threads = MAX(1, tq_threads);
3309
3310 if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
3311 nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
3312 dev_err(nvme->n_dip, CE_WARN,
3313 "!failed to pre-allocate completion queues");
3314 goto fail;
3315 }
3316
3317 /*
3318 * If we use fewer completion queues than interrupt vectors, return
3319 * some of the interrupt vectors back to the system.
3320 */
3321 if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
3322 nvme_release_interrupts(nvme);
3323
3324 if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
3325 nvme->n_completion_queues + 1) != DDI_SUCCESS) {
3326 dev_err(nvme->n_dip, CE_WARN,
3327 "!failed to reduce number of interrupts");
3328 goto fail;
3329 }
3330 }
3331
3332 /*
3333 * Alloc & register I/O queue pairs
3334 */
3335
3336 for (i = 1; i != nvme->n_ioq_count + 1; i++) {
3337 if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
3338 &nvme->n_ioq[i], i) != DDI_SUCCESS) {
3339 dev_err(nvme->n_dip, CE_WARN,
3340 "!unable to allocate I/O qpair %d", i);
3341 goto fail;
3342 }
3343
3344 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
3345 dev_err(nvme->n_dip, CE_WARN,
3346 "!unable to create I/O qpair %d", i);
3347 goto fail;
3348 }
3349 }
3350
3351 /*
3352 * Post more asynchronous event commands to reduce event reporting
3353 * latency, as suggested by the spec.
3354 */
3355 if (nvme->n_async_event_supported) {
3356 for (i = 1; i != nvme->n_async_event_limit; i++)
3357 nvme_async_event(nvme);
3358 }
3359
3360 return (DDI_SUCCESS);
3361
3362 fail:
3363 (void) nvme_reset(nvme, B_FALSE);
3364 return (DDI_FAILURE);
3365 }
3366
3367 static uint_t
3368 nvme_intr(caddr_t arg1, caddr_t arg2)
3369 {
3370 /*LINTED: E_PTR_BAD_CAST_ALIGN*/
3371 nvme_t *nvme = (nvme_t *)arg1;
3372 int inum = (int)(uintptr_t)arg2;
3373 int ccnt = 0;
3374 int qnum;
3375
3376 if (inum >= nvme->n_intr_cnt)
3377 return (DDI_INTR_UNCLAIMED);
3378
3379 if (nvme->n_dead)
3380 return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
3381 DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
3382
3383 /*
3384 * The interrupt vector a queue uses is calculated as queue_idx %
3385 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
3386 * in steps of n_intr_cnt to process all queues using this vector.
3387 */
3388 for (qnum = inum;
3389 qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
3390 qnum += nvme->n_intr_cnt) {
3391 ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
3392 }
3393
3394 return (ccnt > 0 ?
DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 3395 } 3396 3397 static void 3398 nvme_release_interrupts(nvme_t *nvme) 3399 { 3400 int i; 3401 3402 for (i = 0; i < nvme->n_intr_cnt; i++) { 3403 if (nvme->n_inth[i] == NULL) 3404 break; 3405 3406 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3407 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 3408 else 3409 (void) ddi_intr_disable(nvme->n_inth[i]); 3410 3411 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 3412 (void) ddi_intr_free(nvme->n_inth[i]); 3413 } 3414 3415 kmem_free(nvme->n_inth, nvme->n_inth_sz); 3416 nvme->n_inth = NULL; 3417 nvme->n_inth_sz = 0; 3418 3419 nvme->n_progress &= ~NVME_INTERRUPTS; 3420 } 3421 3422 static int 3423 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 3424 { 3425 int nintrs, navail, count; 3426 int ret; 3427 int i; 3428 3429 if (nvme->n_intr_types == 0) { 3430 ret = ddi_intr_get_supported_types(nvme->n_dip, 3431 &nvme->n_intr_types); 3432 if (ret != DDI_SUCCESS) { 3433 dev_err(nvme->n_dip, CE_WARN, 3434 "!%s: ddi_intr_get_supported types failed", 3435 __func__); 3436 return (ret); 3437 } 3438 #ifdef __x86 3439 if (get_hwenv() == HW_VMWARE) 3440 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 3441 #endif 3442 } 3443 3444 if ((nvme->n_intr_types & intr_type) == 0) 3445 return (DDI_FAILURE); 3446 3447 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 3448 if (ret != DDI_SUCCESS) { 3449 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 3450 __func__); 3451 return (ret); 3452 } 3453 3454 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 3455 if (ret != DDI_SUCCESS) { 3456 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 3457 __func__); 3458 return (ret); 3459 } 3460 3461 /* We want at most one interrupt per queue pair. 
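 * navail is therefore clamped to nqpairs here; ddi_intr_alloc() may
 * still return fewer vectors than requested, and n_intr_cnt is set to
 * the count actually allocated.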
*/ 3462 if (navail > nqpairs) 3463 navail = nqpairs; 3464 3465 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 3466 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 3467 3468 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 3469 &count, 0); 3470 if (ret != DDI_SUCCESS) { 3471 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 3472 __func__); 3473 goto fail; 3474 } 3475 3476 nvme->n_intr_cnt = count; 3477 3478 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 3479 if (ret != DDI_SUCCESS) { 3480 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 3481 __func__); 3482 goto fail; 3483 } 3484 3485 for (i = 0; i < count; i++) { 3486 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 3487 (void *)nvme, (void *)(uintptr_t)i); 3488 if (ret != DDI_SUCCESS) { 3489 dev_err(nvme->n_dip, CE_WARN, 3490 "!%s: ddi_intr_add_handler failed", __func__); 3491 goto fail; 3492 } 3493 } 3494 3495 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 3496 3497 for (i = 0; i < count; i++) { 3498 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3499 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 3500 else 3501 ret = ddi_intr_enable(nvme->n_inth[i]); 3502 3503 if (ret != DDI_SUCCESS) { 3504 dev_err(nvme->n_dip, CE_WARN, 3505 "!%s: enabling interrupt %d failed", __func__, i); 3506 goto fail; 3507 } 3508 } 3509 3510 nvme->n_intr_type = intr_type; 3511 3512 nvme->n_progress |= NVME_INTERRUPTS; 3513 3514 return (DDI_SUCCESS); 3515 3516 fail: 3517 nvme_release_interrupts(nvme); 3518 3519 return (ret); 3520 } 3521 3522 static int 3523 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 3524 { 3525 _NOTE(ARGUNUSED(arg)); 3526 3527 pci_ereport_post(dip, fm_error, NULL); 3528 return (fm_error->fme_status); 3529 } 3530 3531 static void 3532 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a, 3533 void *b) 3534 { 3535 nvme_t *nvme = a; 3536 3537 nvme->n_dead = B_TRUE; 3538 3539 /* 3540 * Fail all outstanding commands, including those in the admin queue 3541 * (queue 0). 3542 */ 3543 for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) { 3544 nvme_qpair_t *qp = nvme->n_ioq[i]; 3545 3546 mutex_enter(&qp->nq_mutex); 3547 for (size_t j = 0; j < qp->nq_nentry; j++) { 3548 nvme_cmd_t *cmd = qp->nq_cmd[j]; 3549 nvme_cmd_t *u_cmd; 3550 3551 if (cmd == NULL) { 3552 continue; 3553 } 3554 3555 /* 3556 * Since we have the queue lock held the entire time we 3557 * iterate over it, it's not possible for the queue to 3558 * change underneath us. Thus, we don't need to check 3559 * that the return value of nvme_unqueue_cmd matches the 3560 * requested cmd to unqueue. 3561 */ 3562 u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 3563 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, 3564 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); 3565 3566 ASSERT3P(u_cmd, ==, cmd); 3567 } 3568 mutex_exit(&qp->nq_mutex); 3569 } 3570 } 3571 3572 static int 3573 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3574 { 3575 nvme_t *nvme; 3576 int instance; 3577 int nregs; 3578 off_t regsize; 3579 int i; 3580 char name[32]; 3581 3582 if (cmd != DDI_ATTACH) 3583 return (DDI_FAILURE); 3584 3585 instance = ddi_get_instance(dip); 3586 3587 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 3588 return (DDI_FAILURE); 3589 3590 nvme = ddi_get_soft_state(nvme_state, instance); 3591 ddi_set_driver_private(dip, nvme); 3592 nvme->n_dip = dip; 3593 3594 /* Set up event handlers for hot removal. 
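 * nvme_remove_callback() (defined above) is registered for
 * DDI_DEVI_REMOVE_EVENT; on surprise removal it marks the controller
 * dead and fails all commands still outstanding on the admin and I/O
 * queues.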
*/ 3595 if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT, 3596 &nvme->n_rm_cookie) != DDI_SUCCESS) { 3597 goto fail; 3598 } 3599 if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie, 3600 nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) != 3601 DDI_SUCCESS) { 3602 goto fail; 3603 } 3604 3605 mutex_init(&nvme->n_minor_mutex, NULL, MUTEX_DRIVER, NULL); 3606 nvme->n_progress |= NVME_MUTEX_INIT; 3607 3608 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3609 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 3610 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 3611 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 3612 B_TRUE : B_FALSE; 3613 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3614 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 3615 nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3616 DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN); 3617 /* 3618 * Double up the default for completion queues in case of 3619 * queue sharing. 3620 */ 3621 nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3622 DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN); 3623 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3624 DDI_PROP_DONTPASS, "async-event-limit", 3625 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 3626 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3627 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 3628 B_TRUE : B_FALSE; 3629 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3630 DDI_PROP_DONTPASS, "min-phys-block-size", 3631 NVME_DEFAULT_MIN_BLOCK_SIZE); 3632 nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3633 DDI_PROP_DONTPASS, "max-submission-queues", -1); 3634 nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3635 DDI_PROP_DONTPASS, "max-completion-queues", -1); 3636 3637 if (!ISP2(nvme->n_min_block_size) || 3638 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 3639 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 3640 "using default %d", ISP2(nvme->n_min_block_size) ? 3641 "too low" : "not a power of 2", 3642 NVME_DEFAULT_MIN_BLOCK_SIZE); 3643 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 3644 } 3645 3646 if (nvme->n_submission_queues != -1 && 3647 (nvme->n_submission_queues < 1 || 3648 nvme->n_submission_queues > UINT16_MAX)) { 3649 dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not " 3650 "valid. Must be [1..%d]", nvme->n_submission_queues, 3651 UINT16_MAX); 3652 nvme->n_submission_queues = -1; 3653 } 3654 3655 if (nvme->n_completion_queues != -1 && 3656 (nvme->n_completion_queues < 1 || 3657 nvme->n_completion_queues > UINT16_MAX)) { 3658 dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not " 3659 "valid. 
Must be [1..%d]", nvme->n_completion_queues, 3660 UINT16_MAX); 3661 nvme->n_completion_queues = -1; 3662 } 3663 3664 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 3665 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 3666 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 3667 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 3668 3669 if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN) 3670 nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN; 3671 if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN) 3672 nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN; 3673 3674 if (nvme->n_async_event_limit < 1) 3675 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 3676 3677 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 3678 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 3679 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 3680 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 3681 3682 /* 3683 * Set up FMA support. 3684 */ 3685 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 3686 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 3687 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 3688 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 3689 3690 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 3691 3692 if (nvme->n_fm_cap) { 3693 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 3694 nvme->n_reg_acc_attr.devacc_attr_access = 3695 DDI_FLAGERR_ACC; 3696 3697 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 3698 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3699 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3700 } 3701 3702 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3703 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3704 pci_ereport_setup(dip); 3705 3706 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3707 ddi_fm_handler_register(dip, nvme_fm_errcb, 3708 (void *)nvme); 3709 } 3710 3711 nvme->n_progress |= NVME_FMA_INIT; 3712 3713 /* 3714 * The spec defines several register sets. Only the controller 3715 * registers (set 1) are currently used. 3716 */ 3717 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 3718 nregs < 2 || 3719 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 3720 goto fail; 3721 3722 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 3723 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 3724 dev_err(dip, CE_WARN, "!failed to map regset 1"); 3725 goto fail; 3726 } 3727 3728 nvme->n_progress |= NVME_REGS_MAPPED; 3729 3730 /* 3731 * Create PRP DMA cache 3732 */ 3733 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 3734 ddi_driver_name(dip), ddi_get_instance(dip)); 3735 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 3736 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 3737 NULL, (void *)nvme, NULL, 0); 3738 3739 if (nvme_init(nvme) != DDI_SUCCESS) 3740 goto fail; 3741 3742 /* 3743 * Initialize the driver with the UFM subsystem 3744 */ 3745 if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops, 3746 &nvme->n_ufmh, nvme) != 0) { 3747 dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem"); 3748 goto fail; 3749 } 3750 mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL); 3751 ddi_ufm_update(nvme->n_ufmh); 3752 nvme->n_progress |= NVME_UFM_INIT; 3753 3754 mutex_init(&nvme->n_mgmt_mutex, NULL, MUTEX_DRIVER, NULL); 3755 nvme->n_progress |= NVME_MGMT_INIT; 3756 3757 /* 3758 * Identify and attach namespaces. 3759 */ 3760 mutex_enter(&nvme->n_mgmt_mutex); 3761 3762 for (i = 1; i <= nvme->n_namespace_count; i++) { 3763 nvme_namespace_t *ns = NVME_NSID2NS(nvme, i); 3764 int rv; 3765 3766 /* 3767 * Namespaces start out ignored. 
When nvme_init_ns() checks 3768 * their properties and finds they can be used, it will set 3769 * ns_ignore to B_FALSE. It will also use this state change 3770 * to keep an accurate count of attachable namespaces. 3771 */ 3772 ns->ns_ignore = B_TRUE; 3773 if (nvme_init_ns(nvme, i) != 0) { 3774 mutex_exit(&nvme->n_mgmt_mutex); 3775 goto fail; 3776 } 3777 3778 rv = nvme_attach_ns(nvme, i); 3779 if (rv != 0 && rv != ENOTSUP) { 3780 mutex_exit(&nvme->n_mgmt_mutex); 3781 goto fail; 3782 } 3783 3784 if (ddi_create_minor_node(nvme->n_dip, ns->ns_name, S_IFCHR, 3785 NVME_MINOR(ddi_get_instance(nvme->n_dip), i), 3786 DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { 3787 mutex_exit(&nvme->n_mgmt_mutex); 3788 dev_err(dip, CE_WARN, 3789 "!failed to create minor node for namespace %d", i); 3790 goto fail; 3791 } 3792 } 3793 3794 mutex_exit(&nvme->n_mgmt_mutex); 3795 3796 if (ddi_create_minor_node(dip, "devctl", S_IFCHR, 3797 NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) 3798 != DDI_SUCCESS) { 3799 dev_err(dip, CE_WARN, "nvme_attach: " 3800 "cannot create devctl minor node"); 3801 goto fail; 3802 } 3803 3804 return (DDI_SUCCESS); 3805 3806 fail: 3807 /* attach successful anyway so that FMA can retire the device */ 3808 if (nvme->n_dead) 3809 return (DDI_SUCCESS); 3810 3811 (void) nvme_detach(dip, DDI_DETACH); 3812 3813 return (DDI_FAILURE); 3814 } 3815 3816 static int 3817 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3818 { 3819 int instance, i; 3820 nvme_t *nvme; 3821 3822 if (cmd != DDI_DETACH) 3823 return (DDI_FAILURE); 3824 3825 instance = ddi_get_instance(dip); 3826 3827 nvme = ddi_get_soft_state(nvme_state, instance); 3828 3829 if (nvme == NULL) 3830 return (DDI_FAILURE); 3831 3832 ddi_remove_minor_node(dip, "devctl"); 3833 3834 if (nvme->n_ns) { 3835 for (i = 1; i <= nvme->n_namespace_count; i++) { 3836 nvme_namespace_t *ns = NVME_NSID2NS(nvme, i); 3837 3838 ddi_remove_minor_node(dip, ns->ns_name); 3839 3840 if (ns->ns_bd_hdl) { 3841 (void) bd_detach_handle(ns->ns_bd_hdl); 3842 bd_free_handle(ns->ns_bd_hdl); 3843 } 3844 3845 if (ns->ns_idns) 3846 kmem_free(ns->ns_idns, 3847 sizeof (nvme_identify_nsid_t)); 3848 if (ns->ns_devid) 3849 strfree(ns->ns_devid); 3850 } 3851 3852 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * 3853 nvme->n_namespace_count); 3854 } 3855 3856 if (nvme->n_progress & NVME_MGMT_INIT) { 3857 mutex_destroy(&nvme->n_mgmt_mutex); 3858 } 3859 3860 if (nvme->n_progress & NVME_UFM_INIT) { 3861 ddi_ufm_fini(nvme->n_ufmh); 3862 mutex_destroy(&nvme->n_fwslot_mutex); 3863 } 3864 3865 if (nvme->n_progress & NVME_INTERRUPTS) 3866 nvme_release_interrupts(nvme); 3867 3868 for (i = 0; i < nvme->n_cq_count; i++) { 3869 if (nvme->n_cq[i]->ncq_cmd_taskq != NULL) 3870 taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq); 3871 } 3872 3873 if (nvme->n_progress & NVME_MUTEX_INIT) { 3874 mutex_destroy(&nvme->n_minor_mutex); 3875 } 3876 3877 if (nvme->n_ioq_count > 0) { 3878 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3879 if (nvme->n_ioq[i] != NULL) { 3880 /* TODO: send destroy queue commands */ 3881 nvme_free_qpair(nvme->n_ioq[i]); 3882 } 3883 } 3884 3885 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * 3886 (nvme->n_ioq_count + 1)); 3887 } 3888 3889 if (nvme->n_prp_cache != NULL) { 3890 kmem_cache_destroy(nvme->n_prp_cache); 3891 } 3892 3893 if (nvme->n_progress & NVME_REGS_MAPPED) { 3894 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE); 3895 (void) nvme_reset(nvme, B_FALSE); 3896 } 3897 3898 if (nvme->n_progress & NVME_CTRL_LIMITS) 3899 sema_destroy(&nvme->n_abort_sema); 3900 3901 
if (nvme->n_progress & NVME_ADMIN_QUEUE) 3902 nvme_free_qpair(nvme->n_adminq); 3903 3904 if (nvme->n_cq_count > 0) { 3905 nvme_destroy_cq_array(nvme, 0); 3906 nvme->n_cq = NULL; 3907 nvme->n_cq_count = 0; 3908 } 3909 3910 if (nvme->n_idctl) 3911 kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE); 3912 3913 if (nvme->n_progress & NVME_REGS_MAPPED) 3914 ddi_regs_map_free(&nvme->n_regh); 3915 3916 if (nvme->n_progress & NVME_FMA_INIT) { 3917 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3918 ddi_fm_handler_unregister(nvme->n_dip); 3919 3920 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3921 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3922 pci_ereport_teardown(nvme->n_dip); 3923 3924 ddi_fm_fini(nvme->n_dip); 3925 } 3926 3927 if (nvme->n_vendor != NULL) 3928 strfree(nvme->n_vendor); 3929 3930 if (nvme->n_product != NULL) 3931 strfree(nvme->n_product); 3932 3933 /* Clean up hot removal event handler. */ 3934 if (nvme->n_ev_rm_cb_id != NULL) { 3935 (void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id); 3936 } 3937 nvme->n_ev_rm_cb_id = NULL; 3938 3939 ddi_soft_state_free(nvme_state, instance); 3940 3941 return (DDI_SUCCESS); 3942 } 3943 3944 static int 3945 nvme_quiesce(dev_info_t *dip) 3946 { 3947 int instance; 3948 nvme_t *nvme; 3949 3950 instance = ddi_get_instance(dip); 3951 3952 nvme = ddi_get_soft_state(nvme_state, instance); 3953 3954 if (nvme == NULL) 3955 return (DDI_FAILURE); 3956 3957 nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE); 3958 3959 (void) nvme_reset(nvme, B_TRUE); 3960 3961 return (DDI_SUCCESS); 3962 } 3963 3964 static int 3965 nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma) 3966 { 3967 nvme_t *nvme = cmd->nc_nvme; 3968 uint_t nprp_per_page, nprp; 3969 uint64_t *prp; 3970 const ddi_dma_cookie_t *cookie; 3971 uint_t idx; 3972 uint_t ncookies = ddi_dma_ncookies(dma); 3973 3974 if (ncookies == 0) 3975 return (DDI_FAILURE); 3976 3977 if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL) 3978 return (DDI_FAILURE); 3979 cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress; 3980 3981 if (ncookies == 1) { 3982 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 3983 return (DDI_SUCCESS); 3984 } else if (ncookies == 2) { 3985 if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL) 3986 return (DDI_FAILURE); 3987 cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress; 3988 return (DDI_SUCCESS); 3989 } 3990 3991 /* 3992 * At this point, we're always operating on cookies at 3993 * index >= 1 and writing the addresses of those cookies 3994 * into a new page. The address of that page is stored 3995 * as the second PRP entry. 3996 */ 3997 nprp_per_page = nvme->n_pagesize / sizeof (uint64_t); 3998 ASSERT(nprp_per_page > 0); 3999 4000 /* 4001 * We currently don't support chained PRPs and set up our DMA 4002 * attributes to reflect that. If we still get an I/O request 4003 * that needs a chained PRP something is very wrong. Account 4004 * for the first cookie here, which we've placed in d_prp[0]. 4005 */ 4006 nprp = howmany(ncookies - 1, nprp_per_page); 4007 VERIFY(nprp == 1); 4008 4009 /* 4010 * Allocate a page of pointers, in which we'll write the 4011 * addresses of cookies 1 to `ncookies`.
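 * For example, a transfer that maps to three DMA cookies places the
 * address of cookie 0 in d_prp[0], writes the addresses of cookies 1
 * and 2 into this page, and stores the DMA address of the page itself
 * in d_prp[1] as the PRP list pointer.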
4012 */ 4013 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP); 4014 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 4015 cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress; 4016 4017 prp = (uint64_t *)cmd->nc_prp->nd_memp; 4018 for (idx = 1; idx < ncookies; idx++) { 4019 if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL) 4020 return (DDI_FAILURE); 4021 *prp++ = cookie->dmac_laddress; 4022 } 4023 4024 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 4025 DDI_DMA_SYNC_FORDEV); 4026 return (DDI_SUCCESS); 4027 } 4028 4029 /* 4030 * The maximum number of requests supported for a deallocate request is 4031 * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and 4032 * unchanged through at least 1.4a). The definition of nvme_range_t is also 4033 * from the NVMe 1.1 spec. Together, the result is that all of the ranges for 4034 * a deallocate request will fit into the smallest supported namespace page 4035 * (4k). 4036 */ 4037 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096); 4038 4039 static int 4040 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize, 4041 int allocflag) 4042 { 4043 const dkioc_free_list_t *dfl = xfer->x_dfl; 4044 const dkioc_free_list_ext_t *exts = dfl->dfl_exts; 4045 nvme_t *nvme = cmd->nc_nvme; 4046 nvme_range_t *ranges = NULL; 4047 uint_t i; 4048 4049 /* 4050 * The number of ranges in the request is 0s based (that is 4051 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ..., 4052 * word10 == 255 -> 256 ranges). Therefore the allowed values are 4053 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request, 4054 * we either provided bad info in nvme_bd_driveinfo() or there is a bug 4055 * in blkdev. 4056 */ 4057 VERIFY3U(dfl->dfl_num_exts, >, 0); 4058 VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES); 4059 cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff; 4060 4061 cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE; 4062 4063 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag); 4064 if (cmd->nc_prp == NULL) 4065 return (DDI_FAILURE); 4066 4067 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 4068 ranges = (nvme_range_t *)cmd->nc_prp->nd_memp; 4069 4070 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress; 4071 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 4072 4073 for (i = 0; i < dfl->dfl_num_exts; i++) { 4074 uint64_t lba, len; 4075 4076 lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize; 4077 len = exts[i].dfle_length / blocksize; 4078 4079 VERIFY3U(len, <=, UINT32_MAX); 4080 4081 /* No context attributes for a deallocate request */ 4082 ranges[i].nr_ctxattr = 0; 4083 ranges[i].nr_len = len; 4084 ranges[i].nr_lba = lba; 4085 } 4086 4087 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 4088 DDI_DMA_SYNC_FORDEV); 4089 4090 return (DDI_SUCCESS); 4091 } 4092 4093 static nvme_cmd_t * 4094 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) 4095 { 4096 nvme_t *nvme = ns->ns_nvme; 4097 nvme_cmd_t *cmd; 4098 int allocflag; 4099 4100 /* 4101 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. 4102 */ 4103 allocflag = (xfer->x_flags & BD_XFER_POLL) ? 
KM_NOSLEEP : KM_SLEEP; 4104 cmd = nvme_alloc_cmd(nvme, allocflag); 4105 4106 if (cmd == NULL) 4107 return (NULL); 4108 4109 cmd->nc_sqe.sqe_opc = opc; 4110 cmd->nc_callback = nvme_bd_xfer_done; 4111 cmd->nc_xfer = xfer; 4112 4113 switch (opc) { 4114 case NVME_OPC_NVM_WRITE: 4115 case NVME_OPC_NVM_READ: 4116 VERIFY(xfer->x_nblks <= 0x10000); 4117 4118 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4119 4120 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; 4121 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); 4122 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); 4123 4124 if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS) 4125 goto fail; 4126 break; 4127 4128 case NVME_OPC_NVM_FLUSH: 4129 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4130 break; 4131 4132 case NVME_OPC_NVM_DSET_MGMT: 4133 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4134 4135 if (nvme_fill_ranges(cmd, xfer, 4136 (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS) 4137 goto fail; 4138 break; 4139 4140 default: 4141 goto fail; 4142 } 4143 4144 return (cmd); 4145 4146 fail: 4147 nvme_free_cmd(cmd); 4148 return (NULL); 4149 } 4150 4151 static void 4152 nvme_bd_xfer_done(void *arg) 4153 { 4154 nvme_cmd_t *cmd = arg; 4155 bd_xfer_t *xfer = cmd->nc_xfer; 4156 int error = 0; 4157 4158 error = nvme_check_cmd_status(cmd); 4159 nvme_free_cmd(cmd); 4160 4161 bd_xfer_done(xfer, error); 4162 } 4163 4164 static void 4165 nvme_bd_driveinfo(void *arg, bd_drive_t *drive) 4166 { 4167 nvme_namespace_t *ns = arg; 4168 nvme_t *nvme = ns->ns_nvme; 4169 uint_t ns_count = MAX(1, nvme->n_namespaces_attachable); 4170 boolean_t mutex_exit_needed = B_TRUE; 4171 4172 /* 4173 * nvme_bd_driveinfo is called by blkdev in two situations: 4174 * - during bd_attach_handle(), which we call with the mutex held 4175 * - during bd_attach(), which may be called with or without the 4176 * mutex held 4177 */ 4178 if (mutex_owned(&nvme->n_mgmt_mutex)) 4179 mutex_exit_needed = B_FALSE; 4180 else 4181 mutex_enter(&nvme->n_mgmt_mutex); 4182 4183 /* 4184 * Set the blkdev qcount to the number of submission queues. 4185 * It will then create one waitq/runq pair for each submission 4186 * queue and spread I/O requests across the queues. 4187 */ 4188 drive->d_qcount = nvme->n_ioq_count; 4189 4190 /* 4191 * I/O activity to individual namespaces is distributed across 4192 * each of the d_qcount blkdev queues (which has been set to 4193 * the number of nvme submission queues). d_qsize is the number 4194 * of submitted and not completed I/Os within each queue that blkdev 4195 * will allow before it starts holding them in the waitq. 4196 * 4197 * Each namespace will create a child blkdev instance, for each one 4198 * we try and set the d_qsize so that each namespace gets an 4199 * equal portion of the submission queue. 4200 * 4201 * If post instantiation of the nvme drive, n_namespaces_attachable 4202 * changes and a namespace is attached it could calculate a 4203 * different d_qsize. It may even be that the sum of the d_qsizes is 4204 * now beyond the submission queue size. Should that be the case 4205 * and the I/O rate is such that blkdev attempts to submit more 4206 * I/Os than the size of the submission queue, the excess I/Os 4207 * will be held behind the semaphore nq_sema. 4208 */ 4209 drive->d_qsize = nvme->n_io_squeue_len / ns_count; 4210 4211 /* 4212 * Don't let the queue size drop below the minimum, though. 
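 * For example, an io-squeue-len of 1024 shared between four attachable
 * namespaces yields a d_qsize of 256 per blkdev child; with many more
 * namespaces the division could otherwise end up below a useful depth.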
4213 */ 4214 drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN); 4215 4216 /* 4217 * d_maxxfer is not set, which means the value is taken from the DMA 4218 * attributes specified to bd_alloc_handle. 4219 */ 4220 4221 drive->d_removable = B_FALSE; 4222 drive->d_hotpluggable = B_FALSE; 4223 4224 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64)); 4225 drive->d_target = ns->ns_id; 4226 drive->d_lun = 0; 4227 4228 drive->d_model = nvme->n_idctl->id_model; 4229 drive->d_model_len = sizeof (nvme->n_idctl->id_model); 4230 drive->d_vendor = nvme->n_vendor; 4231 drive->d_vendor_len = strlen(nvme->n_vendor); 4232 drive->d_product = nvme->n_product; 4233 drive->d_product_len = strlen(nvme->n_product); 4234 drive->d_serial = nvme->n_idctl->id_serial; 4235 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial); 4236 drive->d_revision = nvme->n_idctl->id_fwrev; 4237 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev); 4238 4239 /* 4240 * If we support the dataset management command, the only restrictions 4241 * on a discard request are the maximum number of ranges (segments) 4242 * per single request. 4243 */ 4244 if (nvme->n_idctl->id_oncs.on_dset_mgmt) 4245 drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES; 4246 4247 if (mutex_exit_needed) 4248 mutex_exit(&nvme->n_mgmt_mutex); 4249 } 4250 4251 static int 4252 nvme_bd_mediainfo(void *arg, bd_media_t *media) 4253 { 4254 nvme_namespace_t *ns = arg; 4255 nvme_t *nvme = ns->ns_nvme; 4256 boolean_t mutex_exit_needed = B_TRUE; 4257 4258 if (nvme->n_dead) { 4259 return (EIO); 4260 } 4261 4262 /* 4263 * nvme_bd_mediainfo is called by blkdev in various situations, 4264 * most of them out of our control. There's one exception though: 4265 * When we call bd_state_change() in response to "namespace change" 4266 * notification, where the mutex is already being held by us. 4267 */ 4268 if (mutex_owned(&nvme->n_mgmt_mutex)) 4269 mutex_exit_needed = B_FALSE; 4270 else 4271 mutex_enter(&nvme->n_mgmt_mutex); 4272 4273 media->m_nblks = ns->ns_block_count; 4274 media->m_blksize = ns->ns_block_size; 4275 media->m_readonly = B_FALSE; 4276 media->m_solidstate = B_TRUE; 4277 4278 media->m_pblksize = ns->ns_best_block_size; 4279 4280 if (mutex_exit_needed) 4281 mutex_exit(&nvme->n_mgmt_mutex); 4282 4283 return (0); 4284 } 4285 4286 static int 4287 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) 4288 { 4289 nvme_t *nvme = ns->ns_nvme; 4290 nvme_cmd_t *cmd; 4291 nvme_qpair_t *ioq; 4292 boolean_t poll; 4293 int ret; 4294 4295 if (nvme->n_dead) { 4296 return (EIO); 4297 } 4298 4299 cmd = nvme_create_nvm_cmd(ns, opc, xfer); 4300 if (cmd == NULL) 4301 return (ENOMEM); 4302 4303 cmd->nc_sqid = xfer->x_qnum + 1; 4304 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 4305 ioq = nvme->n_ioq[cmd->nc_sqid]; 4306 4307 /* 4308 * Get the polling flag before submitting the command. The command may 4309 * complete immediately after it was submitted, which means we must 4310 * treat both cmd and xfer as if they have been freed already. 
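 * In other words, once nvme_submit_io_cmd() has been called, only the
 * locally saved poll flag, the queue pointer, and the submit return
 * value may be used; the submitted command and its xfer must not be
 * touched again on this path.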
4311 */ 4312 poll = (xfer->x_flags & BD_XFER_POLL) != 0; 4313 4314 ret = nvme_submit_io_cmd(ioq, cmd); 4315 4316 if (ret != 0) 4317 return (ret); 4318 4319 if (!poll) 4320 return (0); 4321 4322 do { 4323 cmd = nvme_retrieve_cmd(nvme, ioq); 4324 if (cmd != NULL) 4325 cmd->nc_callback(cmd); 4326 else 4327 drv_usecwait(10); 4328 } while (ioq->nq_active_cmds != 0); 4329 4330 return (0); 4331 } 4332 4333 static int 4334 nvme_bd_read(void *arg, bd_xfer_t *xfer) 4335 { 4336 nvme_namespace_t *ns = arg; 4337 4338 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ)); 4339 } 4340 4341 static int 4342 nvme_bd_write(void *arg, bd_xfer_t *xfer) 4343 { 4344 nvme_namespace_t *ns = arg; 4345 4346 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE)); 4347 } 4348 4349 static int 4350 nvme_bd_sync(void *arg, bd_xfer_t *xfer) 4351 { 4352 nvme_namespace_t *ns = arg; 4353 4354 if (ns->ns_nvme->n_dead) 4355 return (EIO); 4356 4357 /* 4358 * If the volatile write cache is not present or not enabled the FLUSH 4359 * command is a no-op, so we can take a shortcut here. 4360 */ 4361 if (!ns->ns_nvme->n_write_cache_present) { 4362 bd_xfer_done(xfer, ENOTSUP); 4363 return (0); 4364 } 4365 4366 if (!ns->ns_nvme->n_write_cache_enabled) { 4367 bd_xfer_done(xfer, 0); 4368 return (0); 4369 } 4370 4371 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH)); 4372 } 4373 4374 static int 4375 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) 4376 { 4377 nvme_namespace_t *ns = arg; 4378 nvme_t *nvme = ns->ns_nvme; 4379 4380 if (nvme->n_dead) { 4381 return (EIO); 4382 } 4383 4384 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 4385 if (*(uint64_t *)ns->ns_eui64 != 0) { 4386 return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN, 4387 sizeof (ns->ns_eui64), ns->ns_eui64, devid)); 4388 } else { 4389 return (ddi_devid_init(devinfo, DEVID_ENCAP, 4390 strlen(ns->ns_devid), ns->ns_devid, devid)); 4391 } 4392 } 4393 4394 static int 4395 nvme_bd_free_space(void *arg, bd_xfer_t *xfer) 4396 { 4397 nvme_namespace_t *ns = arg; 4398 4399 if (xfer->x_dfl == NULL) 4400 return (EINVAL); 4401 4402 if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt) 4403 return (ENOTSUP); 4404 4405 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT)); 4406 } 4407 4408 static int 4409 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 4410 { 4411 #ifndef __lock_lint 4412 _NOTE(ARGUNUSED(cred_p)); 4413 #endif 4414 minor_t minor = getminor(*devp); 4415 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 4416 int nsid = NVME_MINOR_NSID(minor); 4417 nvme_minor_state_t *nm; 4418 int rv = 0; 4419 4420 if (otyp != OTYP_CHR) 4421 return (EINVAL); 4422 4423 if (nvme == NULL) 4424 return (ENXIO); 4425 4426 if (nsid > nvme->n_namespace_count) 4427 return (ENXIO); 4428 4429 if (nvme->n_dead) 4430 return (EIO); 4431 4432 mutex_enter(&nvme->n_minor_mutex); 4433 4434 /* 4435 * First check the devctl node and error out if it's been opened 4436 * exclusively already by any other thread. 4437 */ 4438 if (nvme->n_minor.nm_oexcl != NULL && 4439 nvme->n_minor.nm_oexcl != curthread) { 4440 rv = EBUSY; 4441 goto out; 4442 } 4443 4444 nm = nsid == 0 ? &nvme->n_minor : &(NVME_NSID2NS(nvme, nsid)->ns_minor); 4445 4446 if (flag & FEXCL) { 4447 if (nm->nm_oexcl != NULL || nm->nm_open) { 4448 rv = EBUSY; 4449 goto out; 4450 } 4451 4452 /* 4453 * If at least one namespace is already open, fail the 4454 * exclusive open of the devctl node. 
4455 */ 4456 if (nsid == 0) { 4457 for (int i = 1; i <= nvme->n_namespace_count; i++) { 4458 if (NVME_NSID2NS(nvme, i)->ns_minor.nm_open) { 4459 rv = EBUSY; 4460 goto out; 4461 } 4462 } 4463 } 4464 4465 nm->nm_oexcl = curthread; 4466 } 4467 4468 nm->nm_open = B_TRUE; 4469 4470 out: 4471 mutex_exit(&nvme->n_minor_mutex); 4472 return (rv); 4473 4474 } 4475 4476 static int 4477 nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) 4478 { 4479 #ifndef __lock_lint 4480 _NOTE(ARGUNUSED(cred_p)); 4481 _NOTE(ARGUNUSED(flag)); 4482 #endif 4483 minor_t minor = getminor(dev); 4484 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 4485 int nsid = NVME_MINOR_NSID(minor); 4486 nvme_minor_state_t *nm; 4487 4488 if (otyp != OTYP_CHR) 4489 return (ENXIO); 4490 4491 if (nvme == NULL) 4492 return (ENXIO); 4493 4494 if (nsid > nvme->n_namespace_count) 4495 return (ENXIO); 4496 4497 nm = nsid == 0 ? &nvme->n_minor : &(NVME_NSID2NS(nvme, nsid)->ns_minor); 4498 4499 mutex_enter(&nvme->n_minor_mutex); 4500 if (nm->nm_oexcl != NULL) { 4501 ASSERT(nm->nm_oexcl == curthread); 4502 nm->nm_oexcl = NULL; 4503 } 4504 4505 ASSERT(nm->nm_open); 4506 nm->nm_open = B_FALSE; 4507 mutex_exit(&nvme->n_minor_mutex); 4508 4509 return (0); 4510 } 4511 4512 static int 4513 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4514 cred_t *cred_p) 4515 { 4516 _NOTE(ARGUNUSED(cred_p)); 4517 int rv = 0; 4518 void *idctl; 4519 4520 if ((mode & FREAD) == 0) 4521 return (EPERM); 4522 4523 if (nioc->n_len < NVME_IDENTIFY_BUFSIZE) 4524 return (EINVAL); 4525 4526 if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0) 4527 return (rv); 4528 4529 if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode) 4530 != 0) 4531 rv = EFAULT; 4532 4533 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); 4534 4535 return (rv); 4536 } 4537 4538 /* 4539 * Execute commands on behalf of the various ioctls. 4540 */ 4541 static int 4542 nvme_ioc_cmd(nvme_t *nvme, nvme_sqe_t *sqe, boolean_t is_admin, void *data_addr, 4543 uint32_t data_len, int rwk, nvme_cqe_t *cqe, uint_t timeout) 4544 { 4545 nvme_cmd_t *cmd; 4546 nvme_qpair_t *ioq; 4547 int rv = 0; 4548 4549 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 4550 if (is_admin) { 4551 cmd->nc_sqid = 0; 4552 ioq = nvme->n_adminq; 4553 } else { 4554 cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1; 4555 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 4556 ioq = nvme->n_ioq[cmd->nc_sqid]; 4557 } 4558 4559 /* 4560 * This function is used to facilitate requests from 4561 * userspace, so don't panic if the command fails. This 4562 * is especially true for admin passthru commands, where 4563 * the actual command data structure is entirely defined 4564 * by userspace. 
4565 */ 4566 cmd->nc_dontpanic = B_TRUE; 4567 4568 cmd->nc_callback = nvme_wakeup_cmd; 4569 cmd->nc_sqe = *sqe; 4570 4571 if ((rwk & (FREAD | FWRITE)) != 0) { 4572 if (data_addr == NULL) { 4573 rv = EINVAL; 4574 goto free_cmd; 4575 } 4576 4577 if (nvme_zalloc_dma(nvme, data_len, DDI_DMA_READ, 4578 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 4579 dev_err(nvme->n_dip, CE_WARN, 4580 "!nvme_zalloc_dma failed for nvme_ioc_cmd()"); 4581 4582 rv = ENOMEM; 4583 goto free_cmd; 4584 } 4585 4586 if ((rv = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0) 4587 goto free_cmd; 4588 4589 if ((rwk & FWRITE) != 0) { 4590 if (ddi_copyin(data_addr, cmd->nc_dma->nd_memp, 4591 data_len, rwk & FKIOCTL) != 0) { 4592 rv = EFAULT; 4593 goto free_cmd; 4594 } 4595 } 4596 } 4597 4598 if (is_admin) { 4599 nvme_admin_cmd(cmd, timeout); 4600 } else { 4601 mutex_enter(&cmd->nc_mutex); 4602 4603 rv = nvme_submit_io_cmd(ioq, cmd); 4604 4605 if (rv == EAGAIN) { 4606 mutex_exit(&cmd->nc_mutex); 4607 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 4608 "!nvme_ioc_cmd() failed, I/O Q full"); 4609 goto free_cmd; 4610 } 4611 4612 nvme_wait_cmd(cmd, timeout); 4613 4614 mutex_exit(&cmd->nc_mutex); 4615 } 4616 4617 if (cqe != NULL) 4618 *cqe = cmd->nc_cqe; 4619 4620 if ((rv = nvme_check_cmd_status(cmd)) != 0) { 4621 dev_err(nvme->n_dip, CE_WARN, 4622 "!nvme_ioc_cmd() failed with sct = %x, sc = %x", 4623 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 4624 4625 goto free_cmd; 4626 } 4627 4628 if ((rwk & FREAD) != 0) { 4629 if (ddi_copyout(cmd->nc_dma->nd_memp, 4630 data_addr, data_len, rwk & FKIOCTL) != 0) 4631 rv = EFAULT; 4632 } 4633 4634 free_cmd: 4635 nvme_free_cmd(cmd); 4636 4637 return (rv); 4638 } 4639 4640 static int 4641 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4642 int mode, cred_t *cred_p) 4643 { 4644 _NOTE(ARGUNUSED(nsid, cred_p)); 4645 int rv = 0; 4646 nvme_reg_cap_t cap = { 0 }; 4647 nvme_capabilities_t nc; 4648 4649 if ((mode & FREAD) == 0) 4650 return (EPERM); 4651 4652 if (nioc->n_len < sizeof (nc)) 4653 return (EINVAL); 4654 4655 cap.r = nvme_get64(nvme, NVME_REG_CAP); 4656 4657 /* 4658 * The MPSMIN and MPSMAX fields in the CAP register use 0 to 4659 * specify the base page size of 4k (1<<12), so add 12 here to 4660 * get the real page size value. 
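 * For example, a cap_mpsmax value of 0 corresponds to a maximum page
 * size of 4096 bytes, and a value of 4 to 1 << 16 = 65536 bytes.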
4661 */ 4662 nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax); 4663 nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin); 4664 4665 if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0) 4666 rv = EFAULT; 4667 4668 return (rv); 4669 } 4670 4671 static int 4672 nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4673 int mode, cred_t *cred_p) 4674 { 4675 _NOTE(ARGUNUSED(cred_p)); 4676 void *log = NULL; 4677 size_t bufsize = 0; 4678 int rv = 0; 4679 4680 if ((mode & FREAD) == 0) 4681 return (EPERM); 4682 4683 switch (nioc->n_arg) { 4684 case NVME_LOGPAGE_ERROR: 4685 if (nsid != 0) 4686 return (EINVAL); 4687 break; 4688 case NVME_LOGPAGE_HEALTH: 4689 if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0) 4690 return (EINVAL); 4691 4692 if (nsid == 0) 4693 nsid = (uint32_t)-1; 4694 4695 break; 4696 case NVME_LOGPAGE_FWSLOT: 4697 if (nsid != 0) 4698 return (EINVAL); 4699 break; 4700 default: 4701 if (!NVME_IS_VENDOR_SPECIFIC_LOGPAGE(nioc->n_arg)) 4702 return (EINVAL); 4703 if (nioc->n_len > NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE) { 4704 dev_err(nvme->n_dip, CE_NOTE, "!Vendor-specific log " 4705 "page size exceeds device maximum supported size: " 4706 "%lu", NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE); 4707 return (EINVAL); 4708 } 4709 if (nioc->n_len == 0) 4710 return (EINVAL); 4711 bufsize = nioc->n_len; 4712 if (nsid == 0) 4713 nsid = (uint32_t)-1; 4714 } 4715 4716 if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid) 4717 != DDI_SUCCESS) 4718 return (EIO); 4719 4720 if (nioc->n_len < bufsize) { 4721 kmem_free(log, bufsize); 4722 return (EINVAL); 4723 } 4724 4725 if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0) 4726 rv = EFAULT; 4727 4728 nioc->n_len = bufsize; 4729 kmem_free(log, bufsize); 4730 4731 return (rv); 4732 } 4733 4734 static int 4735 nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4736 int mode, cred_t *cred_p) 4737 { 4738 _NOTE(ARGUNUSED(cred_p)); 4739 void *buf = NULL; 4740 size_t bufsize = 0; 4741 uint32_t res = 0; 4742 uint8_t feature; 4743 int rv = 0; 4744 4745 if ((mode & FREAD) == 0) 4746 return (EPERM); 4747 4748 if ((nioc->n_arg >> 32) > 0xff) 4749 return (EINVAL); 4750 4751 feature = (uint8_t)(nioc->n_arg >> 32); 4752 4753 switch (feature) { 4754 case NVME_FEAT_ARBITRATION: 4755 case NVME_FEAT_POWER_MGMT: 4756 case NVME_FEAT_ERROR: 4757 case NVME_FEAT_NQUEUES: 4758 case NVME_FEAT_INTR_COAL: 4759 case NVME_FEAT_WRITE_ATOM: 4760 case NVME_FEAT_ASYNC_EVENT: 4761 case NVME_FEAT_PROGRESS: 4762 if (nsid != 0) 4763 return (EINVAL); 4764 break; 4765 4766 case NVME_FEAT_TEMPERATURE: 4767 if (nsid != 0) 4768 return (EINVAL); 4769 res = nioc->n_arg & 0xffffffffUL; 4770 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) { 4771 nvme_temp_threshold_t tt; 4772 4773 tt.r = res; 4774 if (tt.b.tt_thsel != NVME_TEMP_THRESH_OVER && 4775 tt.b.tt_thsel != NVME_TEMP_THRESH_UNDER) { 4776 return (EINVAL); 4777 } 4778 4779 if (tt.b.tt_tmpsel > NVME_TEMP_THRESH_MAX_SENSOR) { 4780 return (EINVAL); 4781 } 4782 } else if (res != 0) { 4783 return (ENOTSUP); 4784 } 4785 break; 4786 4787 case NVME_FEAT_INTR_VECT: 4788 if (nsid != 0) 4789 return (EINVAL); 4790 4791 res = nioc->n_arg & 0xffffffffUL; 4792 if (res >= nvme->n_intr_cnt) 4793 return (EINVAL); 4794 break; 4795 4796 case NVME_FEAT_LBA_RANGE: 4797 if (nvme->n_lba_range_supported == B_FALSE) 4798 return (EINVAL); 4799 4800 if (nsid == 0 || 4801 nsid > nvme->n_namespace_count) 4802 return (EINVAL); 4803 4804 break; 4805 4806 case NVME_FEAT_WRITE_CACHE: 4807 if (nsid != 0) 4808 return (EINVAL); 4809 
4810 if (!nvme->n_write_cache_present) 4811 return (EINVAL); 4812 4813 break; 4814 4815 case NVME_FEAT_AUTO_PST: 4816 if (nsid != 0) 4817 return (EINVAL); 4818 4819 if (!nvme->n_auto_pst_supported) 4820 return (EINVAL); 4821 4822 break; 4823 4824 default: 4825 return (EINVAL); 4826 } 4827 4828 rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf, 4829 &bufsize); 4830 if (rv != 0) 4831 return (rv); 4832 4833 if (nioc->n_len < bufsize) { 4834 kmem_free(buf, bufsize); 4835 return (EINVAL); 4836 } 4837 4838 if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0) 4839 rv = EFAULT; 4840 4841 kmem_free(buf, bufsize); 4842 nioc->n_arg = res; 4843 nioc->n_len = bufsize; 4844 4845 return (rv); 4846 } 4847 4848 static int 4849 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4850 cred_t *cred_p) 4851 { 4852 _NOTE(ARGUNUSED(nsid, mode, cred_p)); 4853 4854 if ((mode & FREAD) == 0) 4855 return (EPERM); 4856 4857 nioc->n_arg = nvme->n_intr_cnt; 4858 return (0); 4859 } 4860 4861 static int 4862 nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4863 cred_t *cred_p) 4864 { 4865 _NOTE(ARGUNUSED(nsid, cred_p)); 4866 int rv = 0; 4867 4868 if ((mode & FREAD) == 0) 4869 return (EPERM); 4870 4871 if (nioc->n_len < sizeof (nvme->n_version)) 4872 return (ENOMEM); 4873 4874 if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf, 4875 sizeof (nvme->n_version), mode) != 0) 4876 rv = EFAULT; 4877 4878 return (rv); 4879 } 4880 4881 static int 4882 nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4883 cred_t *cred_p) 4884 { 4885 _NOTE(ARGUNUSED(mode)); 4886 nvme_format_nvm_t frmt = { 0 }; 4887 int c_nsid = nsid != 0 ? nsid : 1; 4888 nvme_identify_nsid_t *idns; 4889 nvme_minor_state_t *nm; 4890 4891 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4892 return (EPERM); 4893 4894 nm = nsid == 0 ? &nvme->n_minor : &(NVME_NSID2NS(nvme, nsid)->ns_minor); 4895 if (nm->nm_oexcl != curthread) 4896 return (EACCES); 4897 4898 if (nsid != 0 && NVME_NSID2NS(nvme, nsid)->ns_attached) 4899 return (EBUSY); 4900 4901 frmt.r = nioc->n_arg & 0xffffffff; 4902 4903 /* 4904 * Check whether the FORMAT NVM command is supported. 4905 */ 4906 if (nvme->n_idctl->id_oacs.oa_format == 0) 4907 return (ENOTSUP); 4908 4909 /* 4910 * Don't allow format or secure erase of individual namespace if that 4911 * would cause a format or secure erase of all namespaces. 4912 */ 4913 if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0) 4914 return (EINVAL); 4915 4916 if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE && 4917 nvme->n_idctl->id_fna.fn_sec_erase != 0) 4918 return (EINVAL); 4919 4920 /* 4921 * Don't allow formatting with Protection Information. 4922 */ 4923 if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0) 4924 return (EINVAL); 4925 4926 /* 4927 * Don't allow formatting using an illegal LBA format, or any LBA format 4928 * that uses metadata. 4929 */ 4930 idns = NVME_NSID2NS(nvme, c_nsid)->ns_idns; 4931 if (frmt.b.fm_lbaf > idns->id_nlbaf || 4932 idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0) 4933 return (EINVAL); 4934 4935 /* 4936 * Don't allow formatting using an illegal Secure Erase setting. 
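 * Cryptographic erase in particular is only permitted if the controller
 * advertises support for it in the FNA field of the identify data.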
4937 */ 4938 if (frmt.b.fm_ses > NVME_FRMT_MAX_SES || 4939 (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO && 4940 nvme->n_idctl->id_fna.fn_crypt_erase == 0)) 4941 return (EINVAL); 4942 4943 if (nsid == 0) 4944 nsid = (uint32_t)-1; 4945 4946 return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0, 4947 B_FALSE, frmt.b.fm_ses)); 4948 } 4949 4950 static int 4951 nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4952 cred_t *cred_p) 4953 { 4954 _NOTE(ARGUNUSED(nioc, mode)); 4955 int rv; 4956 4957 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4958 return (EPERM); 4959 4960 if (nsid == 0) 4961 return (EINVAL); 4962 4963 if (NVME_NSID2NS(nvme, nsid)->ns_minor.nm_oexcl != curthread) 4964 return (EACCES); 4965 4966 mutex_enter(&nvme->n_mgmt_mutex); 4967 4968 rv = nvme_detach_ns(nvme, nsid); 4969 4970 mutex_exit(&nvme->n_mgmt_mutex); 4971 4972 return (rv); 4973 } 4974 4975 static int 4976 nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4977 cred_t *cred_p) 4978 { 4979 _NOTE(ARGUNUSED(nioc, mode)); 4980 int rv; 4981 4982 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4983 return (EPERM); 4984 4985 if (nsid == 0) 4986 return (EINVAL); 4987 4988 if (NVME_NSID2NS(nvme, nsid)->ns_minor.nm_oexcl != curthread) 4989 return (EACCES); 4990 4991 mutex_enter(&nvme->n_mgmt_mutex); 4992 4993 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) { 4994 mutex_exit(&nvme->n_mgmt_mutex); 4995 return (EIO); 4996 } 4997 4998 rv = nvme_attach_ns(nvme, nsid); 4999 5000 mutex_exit(&nvme->n_mgmt_mutex); 5001 return (rv); 5002 } 5003 5004 static void 5005 nvme_ufm_update(nvme_t *nvme) 5006 { 5007 mutex_enter(&nvme->n_fwslot_mutex); 5008 ddi_ufm_update(nvme->n_ufmh); 5009 if (nvme->n_fwslot != NULL) { 5010 kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t)); 5011 nvme->n_fwslot = NULL; 5012 } 5013 mutex_exit(&nvme->n_fwslot_mutex); 5014 } 5015 5016 static int 5017 nvme_ioctl_firmware_download(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 5018 int mode, cred_t *cred_p) 5019 { 5020 int rv = 0; 5021 size_t len, copylen; 5022 offset_t offset; 5023 uintptr_t buf; 5024 nvme_cqe_t cqe = { 0 }; 5025 nvme_sqe_t sqe = { 5026 .sqe_opc = NVME_OPC_FW_IMAGE_LOAD 5027 }; 5028 5029 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 5030 return (EPERM); 5031 5032 if (nvme->n_idctl->id_oacs.oa_firmware == 0) 5033 return (ENOTSUP); 5034 5035 if (nsid != 0) 5036 return (EINVAL); 5037 5038 /* 5039 * The offset (in n_len) is restricted to the number of DWORDs in 5040 * 32 bits. 5041 */ 5042 if (nioc->n_len > NVME_FW_OFFSETB_MAX) 5043 return (EINVAL); 5044 5045 /* Confirm that both offset and length are a multiple of DWORD bytes */ 5046 if ((nioc->n_len & NVME_DWORD_MASK) != 0 || 5047 (nioc->n_arg & NVME_DWORD_MASK) != 0) 5048 return (EINVAL); 5049 5050 len = nioc->n_len; 5051 offset = nioc->n_arg; 5052 buf = (uintptr_t)nioc->n_buf; 5053 5054 nioc->n_arg = 0; 5055 5056 while (len > 0 && rv == 0) { 5057 /* 5058 * nvme_ioc_cmd() does not use SGLs or PRP lists. 5059 * It is limited to 2 PRPs per NVM command, so limit 5060 * the size of the data to 2 pages. 
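 * With a typical 4k driver page size this caps each chunk at 8k; the
 * loop below then walks the firmware image in chunks of at most that
 * size, advancing the byte offset as it goes.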
5061 */ 5062 copylen = MIN(2 * nvme->n_pagesize, len); 5063 5064 sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1; 5065 sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT); 5066 5067 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)buf, copylen, 5068 FWRITE, &cqe, nvme_admin_cmd_timeout); 5069 5070 /* 5071 * Regardless of whether the command succeeded or not, whether 5072 * there's an errno in rv to be returned, we'll return any 5073 * command-specific status code in n_arg. 5074 * 5075 * As n_arg isn't cleared in all other possible code paths 5076 * returning an error, we return the status code as a negative 5077 * value so it can be distinguished easily from whatever value 5078 * was passed in n_arg originally. This of course only works as 5079 * long as arguments passed in n_arg are less than INT64_MAX, 5080 * which they currently are. 5081 */ 5082 if (cqe.cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 5083 nioc->n_arg = (uint64_t)-cqe.cqe_sf.sf_sc; 5084 5085 buf += copylen; 5086 offset += copylen; 5087 len -= copylen; 5088 } 5089 5090 /* 5091 * Let the DDI UFM subsystem know that the firmware information for 5092 * this device has changed. 5093 */ 5094 nvme_ufm_update(nvme); 5095 5096 return (rv); 5097 } 5098 5099 static int 5100 nvme_ioctl_firmware_commit(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 5101 int mode, cred_t *cred_p) 5102 { 5103 nvme_firmware_commit_dw10_t fc_dw10 = { 0 }; 5104 uint32_t slot = nioc->n_arg & 0xffffffff; 5105 uint32_t action = nioc->n_arg >> 32; 5106 nvme_cqe_t cqe = { 0 }; 5107 nvme_sqe_t sqe = { 5108 .sqe_opc = NVME_OPC_FW_ACTIVATE 5109 }; 5110 int timeout; 5111 int rv; 5112 5113 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 5114 return (EPERM); 5115 5116 if (nvme->n_idctl->id_oacs.oa_firmware == 0) 5117 return (ENOTSUP); 5118 5119 if (nsid != 0) 5120 return (EINVAL); 5121 5122 /* Validate slot is in range. */ 5123 if (slot < NVME_FW_SLOT_MIN || slot > NVME_FW_SLOT_MAX) 5124 return (EINVAL); 5125 5126 switch (action) { 5127 case NVME_FWC_SAVE: 5128 case NVME_FWC_SAVE_ACTIVATE: 5129 timeout = nvme_commit_save_cmd_timeout; 5130 if (slot == 1 && nvme->n_idctl->id_frmw.fw_readonly) 5131 return (EROFS); 5132 break; 5133 case NVME_FWC_ACTIVATE: 5134 case NVME_FWC_ACTIVATE_IMMED: 5135 timeout = nvme_admin_cmd_timeout; 5136 break; 5137 default: 5138 return (EINVAL); 5139 } 5140 5141 fc_dw10.b.fc_slot = slot; 5142 fc_dw10.b.fc_action = action; 5143 sqe.sqe_cdw10 = fc_dw10.r; 5144 5145 nioc->n_arg = 0; 5146 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe, timeout); 5147 5148 /* 5149 * Regardless of whether the command succeeded or not, whether 5150 * there's an errno in rv to be returned, we'll return any 5151 * command-specific status code in n_arg. 5152 * 5153 * As n_arg isn't cleared in all other possible code paths 5154 * returning an error, we return the status code as a negative 5155 * value so it can be distinguished easily from whatever value 5156 * was passed in n_arg originally. This of course only works as 5157 * long as arguments passed in n_arg are less than INT64_MAX, 5158 * which they currently are. 5159 */ 5160 if (cqe.cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 5161 nioc->n_arg = (uint64_t)-cqe.cqe_sf.sf_sc; 5162 5163 /* 5164 * Let the DDI UFM subsystem know that the firmware information for 5165 * this device has changed. 5166 */ 5167 nvme_ufm_update(nvme); 5168 5169 return (rv); 5170 } 5171 5172 /* 5173 * Helper to copy in a passthru command from userspace, handling 5174 * different data models. 
5175 */ 5176 static int 5177 nvme_passthru_copy_cmd_in(const void *buf, nvme_passthru_cmd_t *cmd, int mode) 5178 { 5179 #ifdef _MULTI_DATAMODEL 5180 switch (ddi_model_convert_from(mode & FMODELS)) { 5181 case DDI_MODEL_ILP32: { 5182 nvme_passthru_cmd32_t cmd32; 5183 if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0) 5184 return (-1); 5185 cmd->npc_opcode = cmd32.npc_opcode; 5186 cmd->npc_timeout = cmd32.npc_timeout; 5187 cmd->npc_flags = cmd32.npc_flags; 5188 cmd->npc_cdw12 = cmd32.npc_cdw12; 5189 cmd->npc_cdw13 = cmd32.npc_cdw13; 5190 cmd->npc_cdw14 = cmd32.npc_cdw14; 5191 cmd->npc_cdw15 = cmd32.npc_cdw15; 5192 cmd->npc_buflen = cmd32.npc_buflen; 5193 cmd->npc_buf = cmd32.npc_buf; 5194 break; 5195 } 5196 case DDI_MODEL_NONE: 5197 #endif 5198 if (ddi_copyin(buf, (void*)cmd, sizeof (nvme_passthru_cmd_t), 5199 mode) != 0) 5200 return (-1); 5201 #ifdef _MULTI_DATAMODEL 5202 break; 5203 } 5204 #endif 5205 return (0); 5206 } 5207 5208 /* 5209 * Helper to copy out a passthru command result to userspace, handling 5210 * different data models. 5211 */ 5212 static int 5213 nvme_passthru_copy_cmd_out(const nvme_passthru_cmd_t *cmd, void *buf, int mode) 5214 { 5215 #ifdef _MULTI_DATAMODEL 5216 switch (ddi_model_convert_from(mode & FMODELS)) { 5217 case DDI_MODEL_ILP32: { 5218 nvme_passthru_cmd32_t cmd32; 5219 bzero(&cmd32, sizeof (cmd32)); 5220 cmd32.npc_opcode = cmd->npc_opcode; 5221 cmd32.npc_status = cmd->npc_status; 5222 cmd32.npc_err = cmd->npc_err; 5223 cmd32.npc_timeout = cmd->npc_timeout; 5224 cmd32.npc_flags = cmd->npc_flags; 5225 cmd32.npc_cdw0 = cmd->npc_cdw0; 5226 cmd32.npc_cdw12 = cmd->npc_cdw12; 5227 cmd32.npc_cdw13 = cmd->npc_cdw13; 5228 cmd32.npc_cdw14 = cmd->npc_cdw14; 5229 cmd32.npc_cdw15 = cmd->npc_cdw15; 5230 cmd32.npc_buflen = (size32_t)cmd->npc_buflen; 5231 cmd32.npc_buf = (uintptr32_t)cmd->npc_buf; 5232 if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0) 5233 return (-1); 5234 break; 5235 } 5236 case DDI_MODEL_NONE: 5237 #endif 5238 if (ddi_copyout(cmd, buf, sizeof (nvme_passthru_cmd_t), 5239 mode) != 0) 5240 return (-1); 5241 #ifdef _MULTI_DATAMODEL 5242 break; 5243 } 5244 #endif 5245 return (0); 5246 } 5247 5248 /* 5249 * Run an arbitrary vendor-specific admin command on the device. 5250 */ 5251 static int 5252 nvme_ioctl_passthru(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 5253 cred_t *cred_p) 5254 { 5255 int rv = 0; 5256 uint_t timeout = 0; 5257 int rwk = 0; 5258 nvme_passthru_cmd_t cmd; 5259 size_t expected_passthru_size = 0; 5260 nvme_sqe_t sqe; 5261 nvme_cqe_t cqe; 5262 5263 bzero(&cmd, sizeof (cmd)); 5264 bzero(&sqe, sizeof (sqe)); 5265 bzero(&cqe, sizeof (cqe)); 5266 5267 /* 5268 * Basic checks: permissions, data model, argument size. 5269 */ 5270 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 5271 return (EPERM); 5272 5273 /* 5274 * Compute the expected size of the argument buffer 5275 */ 5276 #ifdef _MULTI_DATAMODEL 5277 switch (ddi_model_convert_from(mode & FMODELS)) { 5278 case DDI_MODEL_ILP32: 5279 expected_passthru_size = sizeof (nvme_passthru_cmd32_t); 5280 break; 5281 case DDI_MODEL_NONE: 5282 #endif 5283 expected_passthru_size = sizeof (nvme_passthru_cmd_t); 5284 #ifdef _MULTI_DATAMODEL 5285 break; 5286 } 5287 #endif 5288 5289 if (nioc->n_len != expected_passthru_size) { 5290 cmd.npc_err = NVME_PASSTHRU_ERR_CMD_SIZE; 5291 rv = EINVAL; 5292 goto out; 5293 } 5294 5295 /* 5296 * Ensure the device supports the standard vendor specific 5297 * admin command format. 
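 * Support for this format is advertised via the NVSCC field of the
 * Identify Controller data, which is checked below.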
5298 */ 5299 if (!nvme->n_idctl->id_nvscc.nv_spec) { 5300 cmd.npc_err = NVME_PASSTHRU_ERR_NOT_SUPPORTED; 5301 rv = ENOTSUP; 5302 goto out; 5303 } 5304 5305 if (nvme_passthru_copy_cmd_in((const void*)nioc->n_buf, &cmd, mode)) 5306 return (EFAULT); 5307 5308 if (!NVME_IS_VENDOR_SPECIFIC_CMD(cmd.npc_opcode)) { 5309 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_OPCODE; 5310 rv = EINVAL; 5311 goto out; 5312 } 5313 5314 /* 5315 * This restriction is not mandated by the spec, so future work 5316 * could relax this if it's necessary to support commands that both 5317 * read and write. 5318 */ 5319 if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0 && 5320 (cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) { 5321 cmd.npc_err = NVME_PASSTHRU_ERR_READ_AND_WRITE; 5322 rv = EINVAL; 5323 goto out; 5324 } 5325 if (cmd.npc_timeout > nvme_vendor_specific_admin_cmd_max_timeout) { 5326 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_TIMEOUT; 5327 rv = EINVAL; 5328 goto out; 5329 } 5330 timeout = cmd.npc_timeout; 5331 5332 /* 5333 * Passed-thru command buffer verification: 5334 * - Size is multiple of DWords 5335 * - Non-null iff the length is non-zero 5336 * - Null if neither reading nor writing data. 5337 * - Non-null if reading or writing. 5338 * - Maximum buffer size. 5339 */ 5340 if ((cmd.npc_buflen % sizeof (uint32_t)) != 0) { 5341 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5342 rv = EINVAL; 5343 goto out; 5344 } 5345 if (((void*)cmd.npc_buf != NULL && cmd.npc_buflen == 0) || 5346 ((void*)cmd.npc_buf == NULL && cmd.npc_buflen != 0)) { 5347 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5348 rv = EINVAL; 5349 goto out; 5350 } 5351 if (cmd.npc_flags == 0 && (void*)cmd.npc_buf != NULL) { 5352 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5353 rv = EINVAL; 5354 goto out; 5355 } 5356 if ((cmd.npc_flags != 0) && ((void*)cmd.npc_buf == NULL)) { 5357 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5358 rv = EINVAL; 5359 goto out; 5360 } 5361 if (cmd.npc_buflen > nvme_vendor_specific_admin_cmd_size) { 5362 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5363 rv = EINVAL; 5364 goto out; 5365 } 5366 if ((cmd.npc_buflen >> NVME_DWORD_SHIFT) > UINT32_MAX) { 5367 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5368 rv = EINVAL; 5369 goto out; 5370 } 5371 5372 sqe.sqe_opc = cmd.npc_opcode; 5373 sqe.sqe_nsid = nsid; 5374 sqe.sqe_cdw10 = (uint32_t)(cmd.npc_buflen >> NVME_DWORD_SHIFT); 5375 sqe.sqe_cdw12 = cmd.npc_cdw12; 5376 sqe.sqe_cdw13 = cmd.npc_cdw13; 5377 sqe.sqe_cdw14 = cmd.npc_cdw14; 5378 sqe.sqe_cdw15 = cmd.npc_cdw15; 5379 if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0) 5380 rwk = FREAD; 5381 else if ((cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) 5382 rwk = FWRITE; 5383 5384 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void*)cmd.npc_buf, 5385 cmd.npc_buflen, rwk, &cqe, timeout); 5386 cmd.npc_status = cqe.cqe_sf.sf_sc; 5387 cmd.npc_cdw0 = cqe.cqe_dw0; 5388 5389 out: 5390 if (nvme_passthru_copy_cmd_out(&cmd, (void*)nioc->n_buf, mode)) 5391 rv = EFAULT; 5392 return (rv); 5393 } 5394 5395 static int 5396 nvme_ioctl_is_ignored_ns(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 5397 cred_t *cred_p) 5398 { 5399 _NOTE(ARGUNUSED(cred_p)); 5400 5401 if ((mode & FREAD) == 0) 5402 return (EPERM); 5403 5404 if (nsid == 0) 5405 return (EINVAL); 5406 5407 if (NVME_NSID2NS(nvme, nsid)->ns_ignore) 5408 nioc->n_arg = 1; 5409 else 5410 nioc->n_arg = 0; 5411 5412 return (0); 5413 } 5414 5415 static int 5416 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, 5417 int *rval_p) 5418 { 5419 #ifndef __lock_lint 5420 
_NOTE(ARGUNUSED(rval_p)); 5421 #endif 5422 minor_t minor = getminor(dev); 5423 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 5424 int nsid = NVME_MINOR_NSID(minor); 5425 int rv = 0; 5426 nvme_ioctl_t nioc; 5427 5428 int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = { 5429 NULL, 5430 nvme_ioctl_identify, 5431 nvme_ioctl_identify, 5432 nvme_ioctl_capabilities, 5433 nvme_ioctl_get_logpage, 5434 nvme_ioctl_get_features, 5435 nvme_ioctl_intr_cnt, 5436 nvme_ioctl_version, 5437 nvme_ioctl_format, 5438 nvme_ioctl_detach, 5439 nvme_ioctl_attach, 5440 nvme_ioctl_firmware_download, 5441 nvme_ioctl_firmware_commit, 5442 nvme_ioctl_passthru, 5443 nvme_ioctl_is_ignored_ns 5444 }; 5445 5446 if (nvme == NULL) 5447 return (ENXIO); 5448 5449 if (nsid > nvme->n_namespace_count) 5450 return (ENXIO); 5451 5452 if (IS_DEVCTL(cmd)) 5453 return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); 5454 5455 #ifdef _MULTI_DATAMODEL 5456 switch (ddi_model_convert_from(mode & FMODELS)) { 5457 case DDI_MODEL_ILP32: { 5458 nvme_ioctl32_t nioc32; 5459 if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t), 5460 mode) != 0) 5461 return (EFAULT); 5462 nioc.n_len = nioc32.n_len; 5463 nioc.n_buf = nioc32.n_buf; 5464 nioc.n_arg = nioc32.n_arg; 5465 break; 5466 } 5467 case DDI_MODEL_NONE: 5468 #endif 5469 if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode) 5470 != 0) 5471 return (EFAULT); 5472 #ifdef _MULTI_DATAMODEL 5473 break; 5474 } 5475 #endif 5476 5477 if (nvme->n_dead && cmd != NVME_IOC_DETACH) 5478 return (EIO); 5479 5480 5481 if (cmd == NVME_IOC_IDENTIFY_CTRL) { 5482 /* 5483 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and 5484 * attachment point nodes. 5485 */ 5486 nsid = 0; 5487 } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) { 5488 /* 5489 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it 5490 * will always return identify data for namespace 1. 5491 */ 5492 nsid = 1; 5493 } 5494 5495 if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL) 5496 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode, 5497 cred_p); 5498 else 5499 rv = EINVAL; 5500 5501 #ifdef _MULTI_DATAMODEL 5502 switch (ddi_model_convert_from(mode & FMODELS)) { 5503 case DDI_MODEL_ILP32: { 5504 nvme_ioctl32_t nioc32; 5505 5506 nioc32.n_len = (size32_t)nioc.n_len; 5507 nioc32.n_buf = (uintptr32_t)nioc.n_buf; 5508 nioc32.n_arg = nioc.n_arg; 5509 5510 if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t), 5511 mode) != 0) 5512 return (EFAULT); 5513 break; 5514 } 5515 case DDI_MODEL_NONE: 5516 #endif 5517 if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode) 5518 != 0) 5519 return (EFAULT); 5520 #ifdef _MULTI_DATAMODEL 5521 break; 5522 } 5523 #endif 5524 5525 return (rv); 5526 } 5527 5528 /* 5529 * DDI UFM Callbacks 5530 */ 5531 static int 5532 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 5533 ddi_ufm_image_t *img) 5534 { 5535 nvme_t *nvme = arg; 5536 5537 if (imgno != 0) 5538 return (EINVAL); 5539 5540 ddi_ufm_image_set_desc(img, "Firmware"); 5541 ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot); 5542 5543 return (0); 5544 } 5545 5546 /* 5547 * Fill out firmware slot information for the requested slot. The firmware 5548 * slot information is gathered by requesting the Firmware Slot Information log 5549 * page. The format of the page is described in section 5.10.1.3. 
5550 * 5551 * We lazily cache the log page on the first call and then invalidate the cache 5552 * data after a successful firmware download or firmware commit command. 5553 * The cached data is protected by a mutex as the state can change 5554 * asynchronous to this callback. 5555 */ 5556 static int 5557 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 5558 uint_t slotno, ddi_ufm_slot_t *slot) 5559 { 5560 nvme_t *nvme = arg; 5561 void *log = NULL; 5562 size_t bufsize; 5563 ddi_ufm_attr_t attr = 0; 5564 char fw_ver[NVME_FWVER_SZ + 1]; 5565 int ret; 5566 5567 if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1)) 5568 return (EINVAL); 5569 5570 mutex_enter(&nvme->n_fwslot_mutex); 5571 if (nvme->n_fwslot == NULL) { 5572 ret = nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, 5573 NVME_LOGPAGE_FWSLOT, 0); 5574 if (ret != DDI_SUCCESS || 5575 bufsize != sizeof (nvme_fwslot_log_t)) { 5576 if (log != NULL) 5577 kmem_free(log, bufsize); 5578 mutex_exit(&nvme->n_fwslot_mutex); 5579 return (EIO); 5580 } 5581 nvme->n_fwslot = (nvme_fwslot_log_t *)log; 5582 } 5583 5584 /* 5585 * NVMe numbers firmware slots starting at 1 5586 */ 5587 if (slotno == (nvme->n_fwslot->fw_afi - 1)) 5588 attr |= DDI_UFM_ATTR_ACTIVE; 5589 5590 if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0) 5591 attr |= DDI_UFM_ATTR_WRITEABLE; 5592 5593 if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') { 5594 attr |= DDI_UFM_ATTR_EMPTY; 5595 } else { 5596 (void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno], 5597 NVME_FWVER_SZ); 5598 fw_ver[NVME_FWVER_SZ] = '\0'; 5599 ddi_ufm_slot_set_version(slot, fw_ver); 5600 } 5601 mutex_exit(&nvme->n_fwslot_mutex); 5602 5603 ddi_ufm_slot_set_attrs(slot, attr); 5604 5605 return (0); 5606 } 5607 5608 static int 5609 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps) 5610 { 5611 *caps = DDI_UFM_CAP_REPORT; 5612 return (0); 5613 } 5614