1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2016 The MathWorks, Inc. All rights reserved. 14 * Copyright 2019 Joyent, Inc. 15 * Copyright 2019 Unix Software Ltd. 16 * Copyright 2025 Oxide Computer Company. 17 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. 18 * Copyright 2022 Tintri by DDN, Inc. All rights reserved. 19 */ 20 21 #ifndef _NVME_VAR_H 22 #define _NVME_VAR_H 23 24 #include <sys/ddi.h> 25 #include <sys/sunddi.h> 26 #include <sys/blkdev.h> 27 #include <sys/taskq_impl.h> 28 #include <sys/list.h> 29 #include <sys/ddi_ufm.h> 30 #include <nvme_common.h> 31 32 /* 33 * NVMe driver state 34 */ 35 36 #ifdef __cplusplus 37 extern "C" { 38 #endif 39 40 #define NVME_MODULE_NAME "nvme" 41 42 typedef enum { 43 NVME_PCI_CONFIG = 1 << 0, 44 NVME_FMA_INIT = 1 << 1, 45 NVME_REGS_MAPPED = 1 << 2, 46 NVME_ADMIN_QUEUE = 1 << 3, 47 NVME_CTRL_LIMITS = 1 << 4, 48 NVME_INTERRUPTS = 1 << 5, 49 NVME_UFM_INIT = 1 << 6, 50 NVME_MUTEX_INIT = 1 << 7, 51 NVME_MGMT_INIT = 1 << 8, 52 NVME_STAT_INIT = 1 << 9 53 } nvme_progress_t; 54 55 typedef enum { 56 NVME_NS_LOCK = 1 << 0, 57 /* 58 * This flag indicates whether or not we've created a minor node for 59 * this namespace. We limit the number of minor nodes that we actually 60 * create in the file system due to minor node constraints. The 61 * controller minors are preferred to the namespace minors, so the lack 62 * of such a minor is considered a non-fatal condition. Minor nodes are 63 * removed all in one go right now when we detach, so this currently 64 * serves as an internal signifier. 65 */ 66 NVME_NS_MINOR = 1 << 1 67 } nvme_ns_progress_t; 68 69 typedef enum { 70 /* 71 * The controller fails to properly process commands on the admin queue 72 * if the first one has CID 0. Subsequent use of CID 0 doesn't present 73 * a problem. 74 */ 75 NVME_QUIRK_START_CID = 1 << 0, 76 } nvme_quirk_t; 77 78 #define NVME_MIN_ADMIN_QUEUE_LEN 16 79 #define NVME_MIN_IO_QUEUE_LEN 16 80 #define NVME_DEFAULT_ADMIN_QUEUE_LEN 256 81 #define NVME_DEFAULT_IO_QUEUE_LEN 1024 82 #define NVME_DEFAULT_ASYNC_EVENT_LIMIT 10 83 #define NVME_MIN_ASYNC_EVENT_LIMIT 1 84 #define NVME_DEFAULT_MIN_BLOCK_SIZE 512 85 86 typedef struct nvme nvme_t; 87 typedef struct nvme_namespace nvme_namespace_t; 88 typedef struct nvme_minor nvme_minor_t; 89 typedef struct nvme_lock nvme_lock_t; 90 typedef struct nvme_minor_lock_info nvme_minor_lock_info_t; 91 typedef struct nvme_dma nvme_dma_t; 92 typedef struct nvme_cmd nvme_cmd_t; 93 typedef struct nvme_cq nvme_cq_t; 94 typedef struct nvme_qpair nvme_qpair_t; 95 typedef struct nvme_task_arg nvme_task_arg_t; 96 typedef struct nvme_device_stat nvme_device_stat_t; 97 typedef struct nvme_admin_stat nvme_admin_stat_t; 98 99 /* 100 * These states represent the minor's perspective. That is, of a minor's 101 * namespace and controller lock, where is it? 102 */ 103 typedef enum { 104 NVME_LOCK_STATE_UNLOCKED = 0, 105 NVME_LOCK_STATE_BLOCKED, 106 NVME_LOCK_STATE_ACQUIRED 107 } nvme_minor_lock_state_t; 108 109 struct nvme_minor_lock_info { 110 list_node_t nli_node; 111 nvme_lock_t *nli_lock; 112 nvme_minor_lock_state_t nli_state; 113 nvme_lock_level_t nli_curlevel; 114 /* 115 * While the minor points back to itself and the nvme_t should always 116 * point to the current controller, the namespace should only point to 117 * one if this is a particular namespace lock. The former two are 118 * initialized at minor initialization time. 119 */ 120 nvme_minor_t *nli_minor; 121 nvme_t *nli_nvme; 122 nvme_namespace_t *nli_ns; 123 /* 124 * This is the common ioctl information that should be filled in when 125 * we're being woken up for any reason other than an interrupted signal. 126 * This should only be set while blocking. 127 */ 128 nvme_ioctl_common_t *nli_ioc; 129 /* 130 * The following are provided for debugging purposes. In particular, 131 * information like the kthread_t and related that performed this should 132 * be considered suspect as it represents who took the operation, not 133 * who performed the operation (unless we're actively blocking). 134 */ 135 hrtime_t nli_last_change; 136 uintptr_t nli_acq_kthread; 137 pid_t nli_acq_pid; 138 }; 139 140 struct nvme_minor { 141 /* 142 * The following three fields are set when this is created. 143 */ 144 id_t nm_minor; 145 nvme_t *nm_ctrl; 146 nvme_namespace_t *nm_ns; 147 /* 148 * This link is used to index this minor on the global list of active 149 * open-related minors. This is only manipulated under the 150 * nvme_open_minors_mutex. 151 */ 152 avl_node_t nm_avl; 153 /* 154 * Information related to locking. Note, there is no pointer to a locked 155 * controller as the only one can be the one specified here. This data 156 * is protected by the controller's n_minor_mutex. 157 */ 158 kcondvar_t nm_cv; 159 nvme_minor_lock_info_t nm_ctrl_lock; 160 nvme_minor_lock_info_t nm_ns_lock; 161 }; 162 163 struct nvme_lock { 164 nvme_minor_lock_info_t *nl_writer; 165 list_t nl_readers; 166 list_t nl_pend_readers; 167 list_t nl_pend_writers; 168 /* 169 * The following are stats to indicate how often certain locking 170 * activities have occurred for debugging purposes. 171 */ 172 uint32_t nl_nwrite_locks; 173 uint32_t nl_nread_locks; 174 uint32_t nl_npend_writes; 175 uint32_t nl_npend_reads; 176 uint32_t nl_nnonblock; 177 uint32_t nl_nsignals; 178 uint32_t nl_nsig_unlock; 179 uint32_t nl_nsig_blocks; 180 uint32_t nl_nsig_acq; 181 }; 182 183 struct nvme_dma { 184 ddi_dma_handle_t nd_dmah; 185 ddi_acc_handle_t nd_acch; 186 ddi_dma_cookie_t nd_cookie; 187 uint_t nd_ncookie; 188 caddr_t nd_memp; 189 size_t nd_len; 190 boolean_t nd_cached; 191 }; 192 193 typedef enum { 194 NVME_CMD_ALLOCATED = 0, 195 NVME_CMD_SUBMITTED, 196 NVME_CMD_QUEUED, 197 NVME_CMD_COMPLETED, 198 NVME_CMD_LOST 199 } nvme_cmd_state_t; 200 201 typedef enum { 202 NVME_CMD_F_DONTPANIC = 1 << 0, 203 NVME_CMD_F_USELOCK = 1 << 1, 204 } nvme_cmd_flag_t; 205 206 /* 207 * This command structure is shared between admin and I/O commands. When used 208 * for an admin command, nc_mutex and nc_cv are used to synchronise access to 209 * various fields, and to signal command completion. NVME_CMD_F_USELOCK in 210 * nc_flags indicates whether the lock and CV are in use. For I/O commands, 211 * these are neither initialised nor used. 212 */ 213 struct nvme_cmd { 214 struct list_node nc_list; 215 216 nvme_sqe_t nc_sqe; 217 nvme_cqe_t nc_cqe; 218 219 void (*nc_callback)(void *); 220 bd_xfer_t *nc_xfer; 221 222 uint32_t nc_timeout; 223 nvme_cmd_flag_t nc_flags; 224 nvme_cmd_state_t nc_state; /* Protected by nc_mutex iff F_USELOCK */ 225 uint16_t nc_sqid; 226 227 hrtime_t nc_submit_ts; 228 hrtime_t nc_queue_ts; 229 230 nvme_dma_t *nc_dma; 231 nvme_dma_t *nc_prp; /* DMA for PRP lists */ 232 233 kmutex_t nc_mutex; 234 kcondvar_t nc_cv; 235 236 taskq_ent_t nc_tqent; 237 nvme_t *nc_nvme; 238 }; 239 240 struct nvme_cq { 241 size_t ncq_nentry; 242 uint16_t ncq_id; 243 244 nvme_dma_t *ncq_dma; 245 nvme_cqe_t *ncq_cq; 246 uint_t ncq_head; 247 uintptr_t ncq_hdbl; 248 int ncq_phase; 249 250 taskq_t *ncq_cmd_taskq; 251 252 kmutex_t ncq_mutex; 253 }; 254 255 struct nvme_qpair { 256 size_t nq_nentry; 257 258 /* submission fields */ 259 nvme_dma_t *nq_sqdma; 260 nvme_sqe_t *nq_sq; 261 uint_t nq_sqhead; 262 uint_t nq_sqtail; 263 uintptr_t nq_sqtdbl; 264 265 /* completion */ 266 nvme_cq_t *nq_cq; 267 268 /* shared structures for completion and submission */ 269 nvme_cmd_t **nq_cmd; /* active command array */ 270 uint16_t nq_next_cmd; /* next potential empty queue slot */ 271 uint_t nq_active_cmds; /* number of active cmds */ 272 uint32_t nq_active_timeout; /* sum of the timeouts of active cmds */ 273 274 kmutex_t nq_mutex; /* protects shared state */ 275 ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */ 276 }; 277 278 typedef struct nvme_mgmt_lock { 279 kmutex_t nml_lock; 280 kcondvar_t nml_cv; 281 uintptr_t nml_bd_own; 282 } nvme_mgmt_lock_t; 283 284 struct nvme_device_stat { 285 /* Errors detected by driver */ 286 kstat_named_t nds_dma_bind_err; 287 kstat_named_t nds_abort_timeout; 288 kstat_named_t nds_abort_failed; 289 kstat_named_t nds_abort_successful; 290 kstat_named_t nds_abort_unsuccessful; 291 kstat_named_t nds_cmd_timeout; 292 kstat_named_t nds_wrong_logpage; 293 kstat_named_t nds_unknown_logpage; 294 kstat_named_t nds_too_many_cookies; 295 kstat_named_t nds_unknown_cid; 296 297 /* Errors detected by hardware */ 298 kstat_named_t nds_inv_cmd_err; 299 kstat_named_t nds_inv_field_err; 300 kstat_named_t nds_inv_nsfmt_err; 301 kstat_named_t nds_data_xfr_err; 302 kstat_named_t nds_internal_err; 303 kstat_named_t nds_abort_rq_err; 304 kstat_named_t nds_abort_pwrloss_err; 305 kstat_named_t nds_abort_sq_del; 306 kstat_named_t nds_nvm_cap_exc; 307 kstat_named_t nds_nvm_ns_notrdy; 308 kstat_named_t nds_nvm_ns_formatting; 309 kstat_named_t nds_inv_cq_err; 310 kstat_named_t nds_inv_qid_err; 311 kstat_named_t nds_max_qsz_exc; 312 kstat_named_t nds_inv_int_vect; 313 kstat_named_t nds_inv_log_page; 314 kstat_named_t nds_inv_format; 315 kstat_named_t nds_inv_q_del; 316 kstat_named_t nds_cnfl_attr; 317 kstat_named_t nds_inv_prot; 318 kstat_named_t nds_readonly; 319 kstat_named_t nds_inv_fwslot; 320 kstat_named_t nds_inv_fwimg; 321 kstat_named_t nds_fwact_creset; 322 kstat_named_t nds_fwact_nssr; 323 kstat_named_t nds_fwact_reset; 324 kstat_named_t nds_fwact_mtfa; 325 kstat_named_t nds_fwact_prohibited; 326 kstat_named_t nds_fw_overlap; 327 kstat_named_t nds_inv_cmdseq_err; 328 329 /* Errors reported by asynchronous events */ 330 kstat_named_t nds_diagfail_event; 331 kstat_named_t nds_persistent_event; 332 kstat_named_t nds_transient_event; 333 kstat_named_t nds_fw_load_event; 334 kstat_named_t nds_reliability_event; 335 kstat_named_t nds_temperature_event; 336 kstat_named_t nds_spare_event; 337 kstat_named_t nds_vendor_event; 338 kstat_named_t nds_notice_event; 339 kstat_named_t nds_unknown_event; 340 }; 341 342 #define NAS_CNT 0 343 #define NAS_AVG 1 344 #define NAS_MAX 2 345 struct nvme_admin_stat { 346 kstat_named_t nas_getlogpage[3]; 347 kstat_named_t nas_identify[3]; 348 kstat_named_t nas_abort[3]; 349 kstat_named_t nas_fwactivate[3]; 350 kstat_named_t nas_fwimgload[3]; 351 kstat_named_t nas_nsformat[3]; 352 kstat_named_t nas_vendor[3]; 353 kstat_named_t nas_other[3]; 354 }; 355 356 struct nvme { 357 dev_info_t *n_dip; 358 nvme_progress_t n_progress; 359 nvme_quirk_t n_quirks; 360 361 caddr_t n_regs; 362 ddi_acc_handle_t n_regh; 363 364 kmem_cache_t *n_cmd_cache; 365 kmem_cache_t *n_prp_cache; 366 367 size_t n_inth_sz; 368 ddi_intr_handle_t *n_inth; 369 int n_intr_cnt; 370 uint_t n_intr_pri; 371 int n_intr_cap; 372 int n_intr_type; 373 int n_intr_types; 374 375 ddi_acc_handle_t n_pcicfg_handle; 376 uint16_t n_vendor_id; 377 uint16_t n_device_id; 378 uint16_t n_subsystem_vendor_id; 379 uint16_t n_subsystem_device_id; 380 uint8_t n_revision_id; 381 382 char *n_product; 383 char *n_vendor; 384 385 nvme_version_t n_version; 386 boolean_t n_dead; 387 nvme_ioctl_errno_t n_dead_status; 388 taskq_ent_t n_dead_tqent; 389 boolean_t n_strict_version; 390 boolean_t n_ignore_unknown_vendor_status; 391 uint32_t n_admin_queue_len; 392 uint32_t n_io_squeue_len; 393 uint32_t n_io_cqueue_len; 394 uint16_t n_async_event_limit; 395 uint_t n_min_block_size; 396 uint16_t n_abort_command_limit; 397 uint64_t n_max_data_transfer_size; 398 boolean_t n_write_cache_present; 399 boolean_t n_write_cache_enabled; 400 int n_error_log_len; 401 boolean_t n_async_event_supported; 402 int n_submission_queues; 403 int n_completion_queues; 404 405 int n_nssr_supported; 406 int n_doorbell_stride; 407 int n_timeout; 408 int n_arbitration_mechanisms; 409 int n_cont_queues_reqd; 410 int n_max_queue_entries; 411 int n_pageshift; 412 int n_pagesize; 413 414 uint32_t n_namespace_count; 415 uint_t n_namespaces_attachable; 416 uint_t n_ioq_count; 417 uint_t n_cq_count; 418 419 /* 420 * This is cached identify controller and common namespace data that 421 * exists in the system. This generally can be used in the kernel; 422 * however, we have to be careful about what we use here because these 423 * values are not refreshed after attach. Therefore these are good for 424 * answering the question what does the controller support or what is in 425 * the common namespace information, but not otherwise. That means you 426 * shouldn't use this to try to answer how much capacity is still in the 427 * controller because this information is just cached. 428 */ 429 nvme_identify_ctrl_t *n_idctl; 430 nvme_identify_nsid_t *n_idcomns; 431 432 /* Pointer to the admin queue, which is always queue 0 in n_ioq. */ 433 nvme_qpair_t *n_adminq; 434 /* 435 * All command queues, including the admin queue. 436 * Its length is: n_ioq_count + 1. 437 */ 438 nvme_qpair_t **n_ioq; 439 nvme_cq_t **n_cq; 440 441 nvme_namespace_t *n_ns; 442 443 ddi_dma_attr_t n_queue_dma_attr; 444 ddi_dma_attr_t n_prp_dma_attr; 445 ddi_dma_attr_t n_sgl_dma_attr; 446 ddi_device_acc_attr_t n_reg_acc_attr; 447 ddi_iblock_cookie_t n_fm_ibc; 448 int n_fm_cap; 449 450 ksema_t n_abort_sema; 451 452 /* protects namespace management operations */ 453 nvme_mgmt_lock_t n_mgmt; 454 455 /* 456 * This lock protects the minor node locking state across the controller 457 * and all related namespaces. 458 */ 459 kmutex_t n_minor_mutex; 460 nvme_lock_t n_lock; 461 462 kstat_t *n_device_kstat; 463 nvme_device_stat_t n_device_stat; 464 465 kstat_t *n_admin_kstat; 466 kmutex_t n_admin_stat_mutex; 467 nvme_admin_stat_t n_admin_stat; 468 469 /* hot removal NDI event handling */ 470 ddi_eventcookie_t n_rm_cookie; 471 ddi_callback_id_t n_ev_rm_cb_id; 472 473 /* DDI UFM handle */ 474 ddi_ufm_handle_t *n_ufmh; 475 /* Cached Firmware Slot Information log page */ 476 nvme_fwslot_log_t *n_fwslot; 477 /* Lock protecting the cached firmware slot info */ 478 kmutex_t n_fwslot_mutex; 479 }; 480 481 struct nvme_namespace { 482 nvme_t *ns_nvme; 483 nvme_ns_progress_t ns_progress; 484 uint8_t ns_eui64[8]; 485 uint8_t ns_nguid[16]; 486 char ns_name[11]; 487 488 bd_handle_t ns_bd_hdl; 489 490 uint32_t ns_id; 491 size_t ns_block_count; 492 size_t ns_block_size; 493 size_t ns_best_block_size; 494 495 boolean_t ns_allocated; 496 boolean_t ns_active; 497 boolean_t ns_ignore; 498 boolean_t ns_attached; 499 500 nvme_identify_nsid_t *ns_idns; 501 502 /* 503 * Namespace lock, see the theory statement for more information. 504 */ 505 nvme_lock_t ns_lock; 506 507 /* 508 * If a namespace has neither NGUID nor EUI64, we create a devid in 509 * nvme_prepare_devid(). 510 */ 511 char *ns_devid; 512 }; 513 514 struct nvme_task_arg { 515 nvme_t *nt_nvme; 516 nvme_cmd_t *nt_cmd; 517 }; 518 519 typedef enum { 520 /* 521 * This indicates that there is no exclusive access required for this 522 * operation. However, this operation will fail if someone attempts to 523 * perform this operation and someone else holds a write lock. 524 */ 525 NVME_IOCTL_EXCL_NONE = 0, 526 /* 527 * This indicates that a write lock is required to perform the 528 * operation. 529 */ 530 NVME_IOCTL_EXCL_WRITE, 531 /* 532 * This indicates that the exclusive check should be skipped. The only 533 * case this should be used in is the lock and unlock ioctls as they 534 * should be able to proceed even when the controller is being used 535 * exclusively. 536 */ 537 NVME_IOCTL_EXCL_SKIP 538 } nvme_ioctl_excl_t; 539 540 /* 541 * This structure represents the set of checks that we apply to ioctl's using 542 * the nvme_ioctl_common_t structure as part of validation. 543 */ 544 typedef struct nvme_ioctl_check { 545 /* 546 * This indicates whether or not the command in question allows a 547 * namespace to be specified at all. If this is false, a namespace minor 548 * cannot be used and a controller minor must leave the nsid set to 549 * zero. 550 */ 551 boolean_t nck_ns_ok; 552 /* 553 * This indicates that a minor node corresponding to a namespace is 554 * allowed to issue this. 555 */ 556 boolean_t nck_ns_minor_ok; 557 /* 558 * This indicates that the controller should be skipped from all of the 559 * following processing behavior. That is, it's allowed to specify 560 * whatever it wants in the nsid field, regardless if it is valid or 561 * not. This is required for some of the Identify Command options that 562 * list endpoints. This should generally not be used and the driver 563 * should still validate the nuance here. 564 */ 565 boolean_t nck_skip_ctrl; 566 /* 567 * This indicates that if we're on the controller's minor and we don't 568 * have an explicit namespace ID (i.e. 0), should the namespace be 569 * rewritten to be the broadcast namespace. 570 */ 571 boolean_t nck_ctrl_rewrite; 572 /* 573 * This indicates whether or not the broadcast NSID is acceptable for 574 * the controller node. 575 */ 576 boolean_t nck_bcast_ok; 577 578 /* 579 * This indicates to the lock checking code what kind of exclusive 580 * access is required. This check occurs after any namespace rewriting 581 * has occurred. When looking at exclusivity, a broadcast namespace or 582 * namespace 0 indicate that the controller is the target, otherwise the 583 * target namespace will be checked for a write lock. 584 */ 585 nvme_ioctl_excl_t nck_excl; 586 } nvme_ioctl_check_t; 587 588 /* 589 * Constants 590 */ 591 extern uint_t nvme_vendor_specific_admin_cmd_max_timeout; 592 extern uint32_t nvme_vendor_specific_admin_cmd_size; 593 594 /* 595 * Common functions. 596 */ 597 extern nvme_namespace_t *nvme_nsid2ns(nvme_t *, uint32_t); 598 extern boolean_t nvme_ioctl_error(nvme_ioctl_common_t *, nvme_ioctl_errno_t, 599 uint32_t, uint32_t); 600 extern boolean_t nvme_ctrl_atleast(nvme_t *, const nvme_version_t *); 601 extern void nvme_ioctl_success(nvme_ioctl_common_t *); 602 603 /* 604 * Validation related functions and kernel tunable limits. 605 */ 606 extern boolean_t nvme_validate_logpage(nvme_t *, nvme_ioctl_get_logpage_t *); 607 extern boolean_t nvme_validate_identify(nvme_t *, nvme_ioctl_identify_t *, 608 boolean_t); 609 extern boolean_t nvme_validate_get_feature(nvme_t *, 610 nvme_ioctl_get_feature_t *); 611 extern boolean_t nvme_validate_vuc(nvme_t *, nvme_ioctl_passthru_t *); 612 extern boolean_t nvme_validate_format(nvme_t *, nvme_ioctl_format_t *); 613 extern boolean_t nvme_validate_fw_load(nvme_t *, nvme_ioctl_fw_load_t *); 614 extern boolean_t nvme_validate_fw_commit(nvme_t *, nvme_ioctl_fw_commit_t *); 615 616 /* 617 * Locking functions 618 */ 619 extern void nvme_rwlock(nvme_minor_t *, nvme_ioctl_lock_t *); 620 extern void nvme_rwunlock(nvme_minor_lock_info_t *, nvme_lock_t *); 621 extern void nvme_rwlock_ctrl_dead(void *); 622 extern void nvme_lock_init(nvme_lock_t *); 623 extern void nvme_lock_fini(nvme_lock_t *); 624 625 /* 626 * Statistics functions 627 */ 628 extern boolean_t nvme_stat_init(nvme_t *); 629 extern void nvme_stat_cleanup(nvme_t *); 630 extern void nvme_admin_stat_cmd(nvme_t *, nvme_cmd_t *); 631 632 #ifdef __cplusplus 633 } 634 #endif 635 636 #endif /* _NVME_VAR_H */ 637