/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2024 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

#ifndef _NVME_VAR_H
#define _NVME_VAR_H

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/blkdev.h>
#include <sys/taskq_impl.h>
#include <sys/list.h>
#include <sys/ddi_ufm.h>
#include <nvme_common.h>

/*
 * NVMe driver state
 */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Bit flags recording which stages of controller initialization have
 * completed (stored in nvme_t`n_progress below). NOTE(review): presumably
 * used so that teardown only undoes the stages that actually succeeded —
 * confirm against the attach/detach code.
 */
typedef enum {
	NVME_PCI_CONFIG = 1 << 0,
	NVME_FMA_INIT = 1 << 1,
	NVME_REGS_MAPPED = 1 << 2,
	NVME_ADMIN_QUEUE = 1 << 3,
	NVME_CTRL_LIMITS = 1 << 4,
	NVME_INTERRUPTS = 1 << 5,
	NVME_UFM_INIT = 1 << 6,
	NVME_MUTEX_INIT = 1 << 7,
	NVME_MGMT_INIT = 1 << 8
} nvme_progress_t;

/*
 * Per-namespace analogue of nvme_progress_t (stored in
 * nvme_namespace_t`ns_progress below).
 */
typedef enum {
	NVME_NS_LOCK = 1 << 0
} nvme_ns_progress_t;

/*
 * Device-specific behavioral quirks, recorded per controller in
 * nvme_t`n_quirks below.
 */
typedef enum {
	/*
	 * The controller fails to properly process commands on the admin queue
	 * if the first one has CID 0. Subsequent use of CID 0 doesn't present
	 * a problem.
	 */
	NVME_QUIRK_START_CID = 1 << 0,
} nvme_quirk_t;

/*
 * Minimum/default sizes for the admin and I/O queues, limits on outstanding
 * asynchronous event requests, and the default minimum block size exposed to
 * blkdev.
 */
#define NVME_MIN_ADMIN_QUEUE_LEN 16
#define NVME_MIN_IO_QUEUE_LEN 16
#define NVME_DEFAULT_ADMIN_QUEUE_LEN 256
#define NVME_DEFAULT_IO_QUEUE_LEN 1024
#define NVME_DEFAULT_ASYNC_EVENT_LIMIT 10
#define NVME_MIN_ASYNC_EVENT_LIMIT 1
#define NVME_DEFAULT_MIN_BLOCK_SIZE 512


/* Forward declarations for the structures defined later in this header. */
typedef struct nvme nvme_t;
typedef struct nvme_namespace nvme_namespace_t;
typedef struct nvme_minor nvme_minor_t;
typedef struct nvme_lock nvme_lock_t;
typedef struct nvme_minor_lock_info nvme_minor_lock_info_t;
typedef struct nvme_dma nvme_dma_t;
typedef struct nvme_cmd nvme_cmd_t;
typedef struct nvme_cq nvme_cq_t;
typedef struct nvme_qpair nvme_qpair_t;
typedef struct nvme_task_arg nvme_task_arg_t;

/*
 * These states represent the minor's perspective. That is, of a minor's
 * namespace and controller lock, where is it?
 */
typedef enum {
	NVME_LOCK_STATE_UNLOCKED = 0,
	NVME_LOCK_STATE_BLOCKED,
	NVME_LOCK_STATE_ACQUIRED
} nvme_minor_lock_state_t;

/*
 * Tracks one minor's relationship to one nvme_lock_t: whether it holds or is
 * waiting for the lock, at what level, and bookkeeping for debugging.
 */
struct nvme_minor_lock_info {
	list_node_t nli_node;		/* linkage on the nvme_lock_t lists */
	nvme_lock_t *nli_lock;		/* lock this entry refers to */
	nvme_minor_lock_state_t nli_state;
	nvme_lock_level_t nli_curlevel;
	/*
	 * While the minor points back to itself and the nvme_t should always
	 * point to the current controller, the namespace should only point to
	 * one if this is a particular namespace lock. The former two are
	 * initialized at minor initialization time.
	 */
	nvme_minor_t *nli_minor;
	nvme_t *nli_nvme;
	nvme_namespace_t *nli_ns;
	/*
	 * This is the common ioctl information that should be filled in when
	 * we're being woken up for any reason other than an interrupted signal.
	 * This should only be set while blocking.
	 */
	nvme_ioctl_common_t *nli_ioc;
	/*
	 * The following are provided for debugging purposes. In particular,
	 * information like the kthread_t and related that performed this should
	 * be considered suspect as it represents who took the operation, not
	 * who performed the operation (unless we're actively blocking).
	 */
	hrtime_t nli_last_change;
	uintptr_t nli_acq_kthread;
	pid_t nli_acq_pid;
};

/*
 * State for one open minor node (either the controller node or a namespace
 * node).
 */
struct nvme_minor {
	/*
	 * The following three fields are set when this is created.
	 */
	id_t nm_minor;
	nvme_t *nm_ctrl;
	nvme_namespace_t *nm_ns;
	/*
	 * This link is used to index this minor on the global list of active
	 * open-related minors. This is only manipulated under the
	 * nvme_open_minors_mutex.
	 */
	avl_node_t nm_avl;
	/*
	 * Information related to locking. Note, there is no pointer to a locked
	 * controller as the only one can be the one specified here. This data
	 * is protected by the controller's n_minor_mutex.
	 */
	kcondvar_t nm_cv;
	nvme_minor_lock_info_t nm_ctrl_lock;
	nvme_minor_lock_info_t nm_ns_lock;
};

/*
 * A single reader/writer lock instance (one per controller and one per
 * namespace); holders and waiters are nvme_minor_lock_info_t entries.
 */
struct nvme_lock {
	nvme_minor_lock_info_t *nl_writer;	/* current exclusive holder */
	list_t nl_readers;		/* current shared holders */
	list_t nl_pend_readers;		/* blocked shared acquirers */
	list_t nl_pend_writers;		/* blocked exclusive acquirers */
	/*
	 * The following are stats to indicate how often certain locking
	 * activities have occurred for debugging purposes.
	 */
	uint32_t nl_nwrite_locks;
	uint32_t nl_nread_locks;
	uint32_t nl_npend_writes;
	uint32_t nl_npend_reads;
	uint32_t nl_nnonblock;
	uint32_t nl_nsignals;
	uint32_t nl_nsig_unlock;
	uint32_t nl_nsig_blocks;
	uint32_t nl_nsig_acq;
};

/*
 * Bookkeeping for a single DMA-able memory allocation: DDI handles, the
 * first cookie, the kernel mapping, and whether it came from a kmem cache.
 */
struct nvme_dma {
	ddi_dma_handle_t nd_dmah;
	ddi_acc_handle_t nd_acch;
	ddi_dma_cookie_t nd_cookie;
	uint_t nd_ncookie;
	caddr_t nd_memp;
	size_t nd_len;
	boolean_t nd_cached;
};

/*
 * State for a single NVMe command: the submission queue entry sent to the
 * device, the completion queue entry received back, and the synchronization
 * needed to wait for or be called back on completion.
 */
struct nvme_cmd {
	struct list_node nc_list;

	nvme_sqe_t nc_sqe;		/* submission queue entry */
	nvme_cqe_t nc_cqe;		/* completion queue entry */

	void (*nc_callback)(void *);	/* invoked on completion */
	bd_xfer_t *nc_xfer;		/* associated blkdev transfer, if any */
	boolean_t nc_completed;
	boolean_t nc_dontpanic;
	uint16_t nc_sqid;		/* submission queue this was issued on */

	nvme_dma_t *nc_dma;
	nvme_dma_t *nc_prp; /* DMA for PRP lists */

	/* protect nc_completed; nc_cv is signalled on completion */
	kmutex_t nc_mutex;
	kcondvar_t nc_cv;

	taskq_ent_t nc_tqent;
	nvme_t *nc_nvme;
};

/*
 * A completion queue, potentially shared by multiple submission queues.
 */
struct nvme_cq {
	size_t ncq_nentry;
	uint16_t ncq_id;

	nvme_dma_t *ncq_dma;
	nvme_cqe_t *ncq_cq;
	uint_t ncq_head;
	uint_t ncq_tail;
	uintptr_t ncq_hdbl;		/* head doorbell register offset */
	int ncq_phase;			/* current phase tag expected */

	taskq_t *ncq_cmd_taskq;

	kmutex_t ncq_mutex;
};

/*
 * A queue pair: one submission queue plus a pointer to its (possibly shared)
 * completion queue, and the array tracking commands in flight.
 */
struct nvme_qpair {
	size_t nq_nentry;

	/* submission fields */
	nvme_dma_t *nq_sqdma;
	nvme_sqe_t *nq_sq;
	uint_t nq_sqhead;
	uint_t nq_sqtail;
	uintptr_t nq_sqtdbl;		/* tail doorbell register offset */

	/* completion */
	nvme_cq_t *nq_cq;

	/* shared structures for completion and submission */
	nvme_cmd_t **nq_cmd;	/* active command array */
	uint16_t nq_next_cmd;	/* next potential empty queue slot */
	uint_t nq_active_cmds;	/* number of active cmds */

	kmutex_t nq_mutex;	/* protects shared state */
	ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
};

/*
 * Lock used to serialize namespace management operations; nml_bd_own
 * apparently records the owning thread for blkdev re-entry purposes
 * (NOTE(review): confirm semantics against nvme_mgmt_lock()).
 */
typedef struct nvme_mgmt_lock {
	kmutex_t nml_lock;
	kcondvar_t nml_cv;
	uintptr_t nml_bd_own;
} nvme_mgmt_lock_t;

/*
 * Per-controller soft state.
 */
struct nvme {
	dev_info_t *n_dip;
	nvme_progress_t n_progress;	/* attach progress flags */
	nvme_quirk_t n_quirks;		/* device quirks in effect */

	/* mapped controller registers */
	caddr_t n_regs;
	ddi_acc_handle_t n_regh;

	/* kmem caches for commands and PRP lists */
	kmem_cache_t *n_cmd_cache;
	kmem_cache_t *n_prp_cache;

	/* interrupt configuration */
	size_t n_inth_sz;
	ddi_intr_handle_t *n_inth;
	int n_intr_cnt;
	uint_t n_intr_pri;
	int n_intr_cap;
	int n_intr_type;
	int n_intr_types;

	/* PCI config space identity */
	ddi_acc_handle_t n_pcicfg_handle;
	uint16_t n_vendor_id;
	uint16_t n_device_id;
	uint16_t n_subsystem_vendor_id;
	uint16_t n_subsystem_device_id;
	uint8_t n_revision_id;

	char *n_product;
	char *n_vendor;

	nvme_version_t n_version;	/* NVMe spec version implemented */
	boolean_t n_dead;		/* controller declared unusable */
	nvme_ioctl_errno_t n_dead_status; /* reason the controller is dead */
	taskq_ent_t n_dead_tqent;
	boolean_t n_strict_version;
	boolean_t n_ignore_unknown_vendor_status;

	/* tunables and limits derived from config and controller data */
	uint32_t n_admin_queue_len;
	uint32_t n_io_squeue_len;
	uint32_t n_io_cqueue_len;
	uint16_t n_async_event_limit;
	uint_t n_min_block_size;
	uint16_t n_abort_command_limit;
	uint64_t n_max_data_transfer_size;
	boolean_t n_write_cache_present;
	boolean_t n_write_cache_enabled;
	int n_error_log_len;
	boolean_t n_async_event_supported;
	int n_submission_queues;
	int n_completion_queues;

	/* properties decoded from the controller capability registers */
	int n_nssr_supported;
	int n_doorbell_stride;
	int n_timeout;
	int n_arbitration_mechanisms;
	int n_cont_queues_reqd;
	int n_max_queue_entries;
	int n_pageshift;
	int n_pagesize;

	uint32_t n_namespace_count;
	uint_t n_namespaces_attachable;
	uint_t n_ioq_count;
	uint_t n_cq_count;

	/*
	 * This is cached identify controller and common namespace data that
	 * exists in the system. This generally can be used in the kernel;
	 * however, we have to be careful about what we use here because these
	 * values are not refreshed after attach. Therefore these are good for
	 * answering the question what does the controller support or what is
	 * in the common namespace information, but not otherwise. That means
	 * you shouldn't use this to try to answer how much capacity is still
	 * in the controller because this information is just cached.
	 */
	nvme_identify_ctrl_t *n_idctl;
	nvme_identify_nsid_t *n_idcomns;

	/* Pointer to the admin queue, which is always queue 0 in n_ioq. */
	nvme_qpair_t *n_adminq;
	/*
	 * All command queues, including the admin queue.
	 * Its length is: n_ioq_count + 1.
	 */
	nvme_qpair_t **n_ioq;
	nvme_cq_t **n_cq;

	nvme_namespace_t *n_ns;		/* array of n_namespace_count entries */

	/* DMA/access attributes and FMA state */
	ddi_dma_attr_t n_queue_dma_attr;
	ddi_dma_attr_t n_prp_dma_attr;
	ddi_dma_attr_t n_sgl_dma_attr;
	ddi_device_acc_attr_t n_reg_acc_attr;
	ddi_iblock_cookie_t n_fm_ibc;
	int n_fm_cap;

	/* limits concurrent outstanding aborts */
	ksema_t n_abort_sema;

	/* protects namespace management operations */
	nvme_mgmt_lock_t n_mgmt;

	/*
	 * This lock protects the minor node locking state across the
	 * controller and all related namespaces.
	 */
	kmutex_t n_minor_mutex;
	nvme_lock_t n_lock;

	/* errors detected by driver */
	uint32_t n_dma_bind_err;
	uint32_t n_abort_failed;
	uint32_t n_cmd_timeout;
	uint32_t n_cmd_aborted;
	uint32_t n_wrong_logpage;
	uint32_t n_unknown_logpage;
	uint32_t n_too_many_cookies;
	uint32_t n_unknown_cid;

	/* errors detected by hardware */
	uint32_t n_data_xfr_err;
	uint32_t n_internal_err;
	uint32_t n_abort_rq_err;
	uint32_t n_abort_sq_del;
	uint32_t n_nvm_cap_exc;
	uint32_t n_nvm_ns_notrdy;
	uint32_t n_nvm_ns_formatting;
	uint32_t n_inv_cq_err;
	uint32_t n_inv_qid_err;
	uint32_t n_max_qsz_exc;
	uint32_t n_inv_int_vect;
	uint32_t n_inv_log_page;
	uint32_t n_inv_format;
	uint32_t n_inv_q_del;
	uint32_t n_cnfl_attr;
	uint32_t n_inv_prot;
	uint32_t n_readonly;

	/* errors reported by asynchronous events */
	uint32_t n_diagfail_event;
	uint32_t n_persistent_event;
	uint32_t n_transient_event;
	uint32_t n_fw_load_event;
	uint32_t n_reliability_event;
	uint32_t n_temperature_event;
	uint32_t n_spare_event;
	uint32_t n_vendor_event;
	uint32_t n_notice_event;
	uint32_t n_unknown_event;

	/* hot removal NDI event handling */
	ddi_eventcookie_t n_rm_cookie;
	ddi_callback_id_t n_ev_rm_cb_id;

	/* DDI UFM handle */
	ddi_ufm_handle_t *n_ufmh;
	/* Cached Firmware Slot Information log page */
	nvme_fwslot_log_t *n_fwslot;
	/* Lock protecting the cached firmware slot info */
	kmutex_t n_fwslot_mutex;
};

/*
 * Per-namespace soft state.
 */
struct nvme_namespace {
	nvme_t *ns_nvme;		/* back pointer to the controller */
	nvme_ns_progress_t ns_progress;
	/* namespace identifiers reported by the controller, if any */
	uint8_t ns_eui64[8];
	uint8_t ns_nguid[16];
	char ns_name[11];	/* NOTE(review): presumably the decimal nsid */

	bd_handle_t ns_bd_hdl;	/* blkdev handle when attached */

	uint32_t ns_id;		/* namespace ID (nsid) */
	size_t ns_block_count;
	size_t ns_block_size;
	size_t ns_best_block_size;

	/* namespace state flags */
	boolean_t ns_allocated;
	boolean_t ns_active;
	boolean_t ns_ignore;
	boolean_t ns_attached;

	/* cached identify namespace data; same caveats as n_idctl above */
	nvme_identify_nsid_t *ns_idns;

	/*
	 * Namespace lock, see the theory statement for more information.
	 */
	nvme_lock_t ns_lock;

	/*
	 * If a namespace has neither NGUID nor EUI64, we create a devid in
	 * nvme_prepare_devid().
	 */
	char *ns_devid;
};

/*
 * Argument bundle for taskq-dispatched command processing.
 */
struct nvme_task_arg {
	nvme_t *nt_nvme;
	nvme_cmd_t *nt_cmd;
};

typedef enum {
	/*
	 * This indicates that there is no exclusive access required for this
	 * operation. However, this operation will fail if someone attempts to
	 * perform this operation and someone else holds a write lock.
	 */
	NVME_IOCTL_EXCL_NONE = 0,
	/*
	 * This indicates that a write lock is required to perform the
	 * operation.
	 */
	NVME_IOCTL_EXCL_WRITE,
	/*
	 * This indicates that the exclusive check should be skipped. The only
	 * case this should be used in is the lock and unlock ioctls as they
	 * should be able to proceed even when the controller is being used
	 * exclusively.
	 */
	NVME_IOCTL_EXCL_SKIP
} nvme_ioctl_excl_t;

/*
 * This structure represents the set of checks that we apply to ioctl's using
 * the nvme_ioctl_common_t structure as part of validation.
 */
typedef struct nvme_ioctl_check {
	/*
	 * This indicates whether or not the command in question allows a
	 * namespace to be specified at all. If this is false, a namespace
	 * minor cannot be used and a controller minor must leave the nsid set
	 * to zero.
	 */
	boolean_t nck_ns_ok;
	/*
	 * This indicates that a minor node corresponding to a namespace is
	 * allowed to issue this.
	 */
	boolean_t nck_ns_minor_ok;
	/*
	 * This indicates that the controller should be skipped from all of the
	 * following processing behavior. That is, it's allowed to specify
	 * whatever it wants in the nsid field, regardless if it is valid or
	 * not. This is required for some of the Identify Command options that
	 * list endpoints. This should generally not be used and the driver
	 * should still validate the nuance here.
	 */
	boolean_t nck_skip_ctrl;
	/*
	 * This indicates that if we're on the controller's minor and we don't
	 * have an explicit namespace ID (i.e. 0), should the namespace be
	 * rewritten to be the broadcast namespace.
	 */
	boolean_t nck_ctrl_rewrite;
	/*
	 * This indicates whether or not the broadcast NSID is acceptable for
	 * the controller node.
	 */
	boolean_t nck_bcast_ok;

	/*
	 * This indicates to the lock checking code what kind of exclusive
	 * access is required. This check occurs after any namespace rewriting
	 * has occurred. When looking at exclusivity, a broadcast namespace or
	 * namespace 0 indicate that the controller is the target, otherwise
	 * the target namespace will be checked for a write lock.
	 */
	nvme_ioctl_excl_t nck_excl;
} nvme_ioctl_check_t;

/*
 * Constants
 */
extern uint_t nvme_vendor_specific_admin_cmd_max_timeout;
extern uint32_t nvme_vendor_specific_admin_cmd_size;

/*
 * Common functions.
 */
extern nvme_namespace_t *nvme_nsid2ns(nvme_t *, uint32_t);
extern boolean_t nvme_ioctl_error(nvme_ioctl_common_t *, nvme_ioctl_errno_t,
    uint32_t, uint32_t);
extern boolean_t nvme_ctrl_atleast(nvme_t *, const nvme_version_t *);
extern void nvme_ioctl_success(nvme_ioctl_common_t *);

/*
 * Validation related functions and kernel tunable limits.
 */
extern boolean_t nvme_validate_logpage(nvme_t *, nvme_ioctl_get_logpage_t *);
extern boolean_t nvme_validate_identify(nvme_t *, nvme_ioctl_identify_t *,
    boolean_t);
extern boolean_t nvme_validate_get_feature(nvme_t *,
    nvme_ioctl_get_feature_t *);
extern boolean_t nvme_validate_vuc(nvme_t *, nvme_ioctl_passthru_t *);
extern boolean_t nvme_validate_format(nvme_t *, nvme_ioctl_format_t *);
extern boolean_t nvme_validate_fw_load(nvme_t *, nvme_ioctl_fw_load_t *);
extern boolean_t nvme_validate_fw_commit(nvme_t *, nvme_ioctl_fw_commit_t *);

/*
 * Locking functions
 */
extern void nvme_rwlock(nvme_minor_t *, nvme_ioctl_lock_t *);
extern void nvme_rwunlock(nvme_minor_lock_info_t *, nvme_lock_t *);
extern void nvme_rwlock_ctrl_dead(void *);
extern void nvme_lock_init(nvme_lock_t *);
extern void nvme_lock_fini(nvme_lock_t *);

#ifdef __cplusplus
}
#endif

#endif /* _NVME_VAR_H */