xref: /illumos-gate/usr/src/uts/common/io/nvme/nvme_var.h (revision 9b9d39d2a32ff806d2431dbcc50968ef1e6d46b2)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2016 The MathWorks, Inc. All rights reserved.
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2019 Unix Software Ltd.
16  * Copyright 2024 Oxide Computer Company.
17  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
18  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
19  */
20 
21 #ifndef _NVME_VAR_H
22 #define	_NVME_VAR_H
23 
24 #include <sys/ddi.h>
25 #include <sys/sunddi.h>
26 #include <sys/blkdev.h>
27 #include <sys/taskq_impl.h>
28 #include <sys/list.h>
29 #include <sys/ddi_ufm.h>
30 #include <nvme_common.h>
31 
32 /*
33  * NVMe driver state
34  */
35 
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 
/*
 * Bitmask recording which controller initialization steps have completed.
 * NOTE(review): presumably consulted during detach/cleanup so that only the
 * resources that were actually set up get torn down — confirm against the
 * driver's attach/detach paths.
 */
typedef enum {
	NVME_PCI_CONFIG			= 1 << 0,
	NVME_FMA_INIT			= 1 << 1,
	NVME_REGS_MAPPED		= 1 << 2,
	NVME_ADMIN_QUEUE		= 1 << 3,
	NVME_CTRL_LIMITS		= 1 << 4,
	NVME_INTERRUPTS			= 1 << 5,
	NVME_UFM_INIT			= 1 << 6,
	NVME_MUTEX_INIT			= 1 << 7,
	NVME_MGMT_INIT			= 1 << 8
} nvme_progress_t;
51 
/*
 * Per-namespace initialization progress, analogous to nvme_progress_t.
 */
typedef enum {
	NVME_NS_LOCK	= 1 << 0
} nvme_ns_progress_t;
55 
/*
 * Device-specific quirks: workarounds for controllers that deviate from the
 * NVMe specification. Stored as a bitmask in nvme_t's n_quirks.
 */
typedef enum {
	/*
	 * The controller fails to properly process commands on the admin queue
	 * if the first one has CID 0. Subsequent use of CID 0 doesn't present
	 * a problem.
	 */
	NVME_QUIRK_START_CID		= 1 << 0,
} nvme_quirk_t;
64 
/*
 * Minimum and default sizes for the admin and I/O queues, the asynchronous
 * event request limit, and the minimum advertised block size. The defaults
 * are starting points; actual values may be constrained by controller
 * capabilities or adjusted via driver configuration.
 */
#define	NVME_MIN_ADMIN_QUEUE_LEN	16
#define	NVME_MIN_IO_QUEUE_LEN		16
#define	NVME_DEFAULT_ADMIN_QUEUE_LEN	256
#define	NVME_DEFAULT_IO_QUEUE_LEN	1024
#define	NVME_DEFAULT_ASYNC_EVENT_LIMIT	10
#define	NVME_MIN_ASYNC_EVENT_LIMIT	1
#define	NVME_DEFAULT_MIN_BLOCK_SIZE	512
72 
73 
/*
 * Forward typedefs for the driver's core structures, all defined later in
 * this header. Declared up front because several of them reference each
 * other (e.g. nvme_minor_lock_info_t points at nvme_t and nvme_lock_t).
 */
typedef struct nvme nvme_t;
typedef struct nvme_namespace nvme_namespace_t;
typedef struct nvme_minor nvme_minor_t;
typedef struct nvme_lock nvme_lock_t;
typedef struct nvme_minor_lock_info nvme_minor_lock_info_t;
typedef struct nvme_dma nvme_dma_t;
typedef struct nvme_cmd nvme_cmd_t;
typedef struct nvme_cq nvme_cq_t;
typedef struct nvme_qpair nvme_qpair_t;
typedef struct nvme_task_arg nvme_task_arg_t;
84 
85 /*
86  * These states represent the minor's perspective. That is, of a minor's
87  * namespace and controller lock, where is it?
88  */
/*
 * These states represent the minor's perspective. That is, of a minor's
 * namespace and controller lock, where is it? A minor is either not holding
 * or seeking a lock (UNLOCKED), waiting for one (BLOCKED), or holds one
 * (ACQUIRED).
 */
typedef enum {
	NVME_LOCK_STATE_UNLOCKED	= 0,
	NVME_LOCK_STATE_BLOCKED,
	NVME_LOCK_STATE_ACQUIRED
} nvme_minor_lock_state_t;
94 
/*
 * Tracks a single minor's relationship to one lock (either the controller
 * lock or a namespace lock). Embedded in nvme_minor_t; linked onto the
 * corresponding nvme_lock_t's holder or pending lists via nli_node.
 */
struct nvme_minor_lock_info {
	list_node_t nli_node;			/* linkage on an nvme_lock_t list */
	nvme_lock_t *nli_lock;			/* lock this entry refers to, if any */
	nvme_minor_lock_state_t nli_state;	/* where this minor stands */
	nvme_lock_level_t nli_curlevel;		/* read vs. write level held/sought */
	/*
	 * While the minor points back to itself and the nvme_t should always
	 * point to the current controller, the namespace should only point to
	 * one if this is a particular namespace lock. The former two are
	 * initialized at minor initialization time.
	 */
	nvme_minor_t *nli_minor;
	nvme_t *nli_nvme;
	nvme_namespace_t *nli_ns;
	/*
	 * This is the common ioctl information that should be filled in when
	 * we're being woken up for any reason other than an interrupted signal.
	 * This should only be set while blocking.
	 */
	nvme_ioctl_common_t *nli_ioc;
	/*
	 * The following are provided for debugging purposes. In particular,
	 * information like the kthread_t and related that performed this should
	 * be considered suspect as it represents who took the operation, not
	 * who performed the operation (unless we're actively blocking).
	 */
	hrtime_t nli_last_change;
	uintptr_t nli_acq_kthread;
	pid_t nli_acq_pid;
};
125 
/*
 * State for an open minor node (either the controller minor or a namespace
 * minor). One of these exists per open(9E) instance.
 */
struct nvme_minor {
	/*
	 * The following three fields are set when this is created.
	 */
	id_t nm_minor;			/* minor number for this open */
	nvme_t *nm_ctrl;		/* owning controller */
	nvme_namespace_t *nm_ns;	/* namespace, NULL for controller minors */
	/*
	 * This link is used to index this minor on the global list of active
	 * open-related minors. This is only manipulated under the
	 * nvme_open_minors_mutex.
	 */
	avl_node_t nm_avl;
	/*
	 * Information related to locking. Note, there is no pointer to a locked
	 * controller as the only one can be the one specified here. This data
	 * is protected by the controller's n_minor_mutex.
	 */
	kcondvar_t nm_cv;
	nvme_minor_lock_info_t nm_ctrl_lock;
	nvme_minor_lock_info_t nm_ns_lock;
};
148 
/*
 * A single reader/writer lock instance. One exists for the controller as a
 * whole (nvme_t's n_lock) and one per namespace (nvme_namespace_t's
 * ns_lock). Holders and waiters are tracked as lists of
 * nvme_minor_lock_info_t entries.
 */
struct nvme_lock {
	nvme_minor_lock_info_t *nl_writer;	/* current writer, if any */
	list_t nl_readers;			/* current read holders */
	list_t nl_pend_readers;			/* minors blocked for read */
	list_t nl_pend_writers;			/* minors blocked for write */
	/*
	 * The following are stats to indicate how often certain locking
	 * activities have occurred for debugging purposes.
	 */
	uint32_t nl_nwrite_locks;
	uint32_t nl_nread_locks;
	uint32_t nl_npend_writes;
	uint32_t nl_npend_reads;
	uint32_t nl_nnonblock;
	uint32_t nl_nsignals;
	uint32_t nl_nsig_unlock;
	uint32_t nl_nsig_blocks;
	uint32_t nl_nsig_acq;
};
168 
/*
 * Bookkeeping for a single DMA-able memory allocation.
 */
struct nvme_dma {
	ddi_dma_handle_t nd_dmah;	/* DMA handle */
	ddi_acc_handle_t nd_acch;	/* data access handle for the mapping */
	ddi_dma_cookie_t nd_cookie;	/* first DMA cookie of the binding */
	uint_t nd_ncookie;		/* number of cookies in the binding */
	caddr_t nd_memp;		/* kernel virtual address of the memory */
	size_t nd_len;			/* length of the allocation in bytes */
	/*
	 * NOTE(review): presumably B_TRUE when the buffer came from a kmem
	 * cache (e.g. n_prp_cache) rather than a one-off allocation, which
	 * would change how it is freed — confirm in the DMA alloc/free code.
	 */
	boolean_t nd_cached;
};
178 
/*
 * A single NVMe command: its submission queue entry, the completion queue
 * entry filled in once it finishes, and the resources needed to execute and
 * complete it.
 */
struct nvme_cmd {
	struct list_node nc_list;	/* list linkage for the driver's queues */

	nvme_sqe_t nc_sqe;		/* submission queue entry to issue */
	nvme_cqe_t nc_cqe;		/* completion queue entry, valid on completion */

	void (*nc_callback)(void *);	/* invoked when the command completes */
	bd_xfer_t *nc_xfer;		/* associated blkdev transfer, if any */
	boolean_t nc_completed;		/* set once the command has completed */
	/*
	 * NOTE(review): presumably suppresses a panic on command failure or
	 * timeout for commands where failure is tolerable — confirm where
	 * this flag is consumed.
	 */
	boolean_t nc_dontpanic;
	uint16_t nc_sqid;		/* id of the submission queue used */

	nvme_dma_t *nc_dma;		/* DMA resources for the data buffer */
	nvme_dma_t *nc_prp; /* DMA for PRP lists */

	kmutex_t nc_mutex;		/* protects completion state below */
	kcondvar_t nc_cv;		/* signalled for synchronous waiters */

	taskq_ent_t nc_tqent;		/* taskq entry for completion processing */
	nvme_t *nc_nvme;		/* owning controller */
};
200 
/*
 * A completion queue. Completion queues may be shared by multiple
 * submission queues (see nvme_qpair's nq_cq pointer).
 */
struct nvme_cq {
	size_t ncq_nentry;	/* number of entries in the queue */
	uint16_t ncq_id;	/* completion queue identifier */

	nvme_dma_t *ncq_dma;	/* DMA resources backing the queue memory */
	nvme_cqe_t *ncq_cq;	/* the completion queue entries */
	uint_t ncq_head;	/* current head (consumer) index */
	uintptr_t ncq_hdbl;	/* offset of the head doorbell register */
	int ncq_phase;		/* expected phase tag of new entries */

	taskq_t *ncq_cmd_taskq;	/* taskq used to process completed commands */

	kmutex_t ncq_mutex;	/* protects the queue state above */
};
215 
/*
 * A queue pair: a submission queue plus a pointer to its (possibly shared)
 * completion queue, along with the tracking needed to map completions back
 * to commands.
 */
struct nvme_qpair {
	size_t nq_nentry;	/* number of entries in the submission queue */

	/* submission fields */
	nvme_dma_t *nq_sqdma;	/* DMA resources backing the submission queue */
	nvme_sqe_t *nq_sq;	/* the submission queue entries */
	uint_t nq_sqhead;	/* current head index */
	uint_t nq_sqtail;	/* current tail (producer) index */
	uintptr_t nq_sqtdbl;	/* offset of the tail doorbell register */

	/* completion */
	nvme_cq_t *nq_cq;	/* completion queue for this pair */

	/* shared structures for completion and submission */
	nvme_cmd_t **nq_cmd;	/* active command array */
	uint16_t nq_next_cmd;	/* next potential empty queue slot */
	uint_t nq_active_cmds;	/* number of active cmds */

	kmutex_t nq_mutex;	/* protects shared state */
	ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
};
237 
/*
 * Lock serializing namespace management operations (see nvme_t's n_mgmt).
 * nml_bd_own records an owner for coordination with blkdev callbacks —
 * NOTE(review): confirm exact ownership semantics against the lock's
 * enter/exit implementation.
 */
typedef struct nvme_mgmt_lock {
	kmutex_t nml_lock;
	kcondvar_t nml_cv;
	uintptr_t nml_bd_own;
} nvme_mgmt_lock_t;
243 
/*
 * Per-controller soft state. One of these exists per attached NVMe
 * controller instance.
 */
struct nvme {
	dev_info_t *n_dip;		/* devinfo node for this instance */
	nvme_progress_t n_progress;	/* which attach steps have completed */
	nvme_quirk_t n_quirks;		/* active device quirks */

	/* mapped controller registers */
	caddr_t n_regs;
	ddi_acc_handle_t n_regh;

	/* allocation caches for commands and PRP lists */
	kmem_cache_t *n_cmd_cache;
	kmem_cache_t *n_prp_cache;

	/* interrupt handling state */
	size_t n_inth_sz;
	ddi_intr_handle_t *n_inth;
	int n_intr_cnt;
	uint_t n_intr_pri;
	int n_intr_cap;
	int n_intr_type;
	int n_intr_types;

	/* PCI config space identity */
	ddi_acc_handle_t n_pcicfg_handle;
	uint16_t n_vendor_id;
	uint16_t n_device_id;
	uint16_t n_subsystem_vendor_id;
	uint16_t n_subsystem_device_id;
	uint8_t n_revision_id;

	char *n_product;
	char *n_vendor;

	nvme_version_t n_version;	/* NVMe spec version the controller reports */
	boolean_t n_dead;		/* controller considered unusable */
	nvme_ioctl_errno_t n_dead_status; /* why the controller was marked dead */
	taskq_ent_t n_dead_tqent;
	boolean_t n_strict_version;
	boolean_t n_ignore_unknown_vendor_status;
	/* configured queue sizes and driver tunables */
	uint32_t n_admin_queue_len;
	uint32_t n_io_squeue_len;
	uint32_t n_io_cqueue_len;
	uint16_t n_async_event_limit;
	uint_t n_min_block_size;
	uint16_t n_abort_command_limit;
	uint64_t n_max_data_transfer_size;
	boolean_t n_write_cache_present;
	boolean_t n_write_cache_enabled;
	int n_error_log_len;
	boolean_t n_async_event_supported;
	int n_submission_queues;
	int n_completion_queues;

	/* controller capabilities and geometry */
	int n_nssr_supported;
	int n_doorbell_stride;
	int n_timeout;
	int n_arbitration_mechanisms;
	int n_cont_queues_reqd;
	int n_max_queue_entries;
	int n_pageshift;
	int n_pagesize;

	uint32_t n_namespace_count;	/* namespaces reported by the controller */
	uint_t n_namespaces_attachable;
	uint_t n_ioq_count;
	uint_t n_cq_count;

	/*
	 * This is cached identify controller and common namespace data that
	 * exists in the system. This generally can be used in the kernel;
	 * however, we have to be careful about what we use here because these
	 * values are not refreshed after attach. Therefore these are good for
	 * answering the question what does the controller support or what is in
	 * the common namespace information, but not otherwise. That means you
	 * shouldn't use this to try to answer how much capacity is still in the
	 * controller because this information is just cached.
	 */
	nvme_identify_ctrl_t *n_idctl;
	nvme_identify_nsid_t *n_idcomns;

	/* Pointer to the admin queue, which is always queue 0 in n_ioq. */
	nvme_qpair_t *n_adminq;
	/*
	 * All command queues, including the admin queue.
	 * Its length is: n_ioq_count + 1.
	 */
	nvme_qpair_t **n_ioq;
	nvme_cq_t **n_cq;

	nvme_namespace_t *n_ns;		/* array of namespace state */

	/* DMA and register access attributes */
	ddi_dma_attr_t n_queue_dma_attr;
	ddi_dma_attr_t n_prp_dma_attr;
	ddi_dma_attr_t n_sgl_dma_attr;
	ddi_device_acc_attr_t n_reg_acc_attr;
	ddi_iblock_cookie_t n_fm_ibc;
	int n_fm_cap;

	/* limits concurrent abort commands (see n_abort_command_limit) */
	ksema_t n_abort_sema;

	/* protects namespace management operations */
	nvme_mgmt_lock_t n_mgmt;

	/*
	 * This lock protects the minor node locking state across the controller
	 * and all related namespaces.
	 */
	kmutex_t n_minor_mutex;
	nvme_lock_t n_lock;

	/* errors detected by driver */
	uint32_t n_dma_bind_err;
	uint32_t n_abort_failed;
	uint32_t n_cmd_timeout;
	uint32_t n_cmd_aborted;
	uint32_t n_wrong_logpage;
	uint32_t n_unknown_logpage;
	uint32_t n_too_many_cookies;
	uint32_t n_unknown_cid;

	/* errors detected by hardware */
	uint32_t n_data_xfr_err;
	uint32_t n_internal_err;
	uint32_t n_abort_rq_err;
	uint32_t n_abort_sq_del;
	uint32_t n_nvm_cap_exc;
	uint32_t n_nvm_ns_notrdy;
	uint32_t n_nvm_ns_formatting;
	uint32_t n_inv_cq_err;
	uint32_t n_inv_qid_err;
	uint32_t n_max_qsz_exc;
	uint32_t n_inv_int_vect;
	uint32_t n_inv_log_page;
	uint32_t n_inv_format;
	uint32_t n_inv_q_del;
	uint32_t n_cnfl_attr;
	uint32_t n_inv_prot;
	uint32_t n_readonly;

	/* errors reported by asynchronous events */
	uint32_t n_diagfail_event;
	uint32_t n_persistent_event;
	uint32_t n_transient_event;
	uint32_t n_fw_load_event;
	uint32_t n_reliability_event;
	uint32_t n_temperature_event;
	uint32_t n_spare_event;
	uint32_t n_vendor_event;
	uint32_t n_notice_event;
	uint32_t n_unknown_event;

	/* hot removal NDI event handling */
	ddi_eventcookie_t n_rm_cookie;
	ddi_callback_id_t n_ev_rm_cb_id;

	/* DDI UFM handle */
	ddi_ufm_handle_t *n_ufmh;
	/* Cached Firmware Slot Information log page */
	nvme_fwslot_log_t *n_fwslot;
	/* Lock protecting the cached firmware slot info */
	kmutex_t n_fwslot_mutex;
};
402 
/*
 * Per-namespace state, stored in the controller's n_ns array.
 */
struct nvme_namespace {
	nvme_t *ns_nvme;		/* back-pointer to the owning controller */
	nvme_ns_progress_t ns_progress;	/* which namespace init steps completed */
	uint8_t ns_eui64[8];		/* IEEE EUI-64 namespace identifier */
	uint8_t	ns_nguid[16];		/* namespace globally unique identifier */
	char	ns_name[11];		/* textual name used for the minor node */

	bd_handle_t ns_bd_hdl;		/* blkdev handle when attached */

	uint32_t ns_id;			/* namespace ID (NSID) */
	size_t ns_block_count;		/* capacity in logical blocks */
	size_t ns_block_size;		/* logical block size in bytes */
	size_t ns_best_block_size;	/* preferred I/O block size */

	boolean_t ns_allocated;		/* namespace is allocated on the ctrl */
	boolean_t ns_active;		/* namespace is active on the ctrl */
	/*
	 * NOTE(review): presumably set when the namespace is unusable by
	 * blkdev (e.g. unsupported format) — confirm where it is set.
	 */
	boolean_t ns_ignore;
	boolean_t ns_attached;		/* blkdev instance is attached */

	nvme_identify_nsid_t *ns_idns;	/* cached identify namespace data */

	/*
	 * Namespace lock, see the theory statement for more information.
	 */
	nvme_lock_t ns_lock;

	/*
	 * If a namespace has neither NGUID nor EUI64, we create a devid in
	 * nvme_prepare_devid().
	 */
	char *ns_devid;
};
435 
/*
 * Argument bundle passed to taskq callbacks that need both the controller
 * and the command being processed.
 */
struct nvme_task_arg {
	nvme_t *nt_nvme;
	nvme_cmd_t *nt_cmd;
};
440 
/*
 * Describes what degree of exclusive access an ioctl requires; consumed as
 * part of the nvme_ioctl_check_t validation below.
 */
typedef enum {
	/*
	 * This indicates that there is no exclusive access required for this
	 * operation. However, this operation will fail if someone attempts to
	 * perform this operation and someone else holds a write lock.
	 */
	NVME_IOCTL_EXCL_NONE	= 0,
	/*
	 * This indicates that a write lock is required to perform the
	 * operation.
	 */
	NVME_IOCTL_EXCL_WRITE,
	/*
	 * This indicates that the exclusive check should be skipped. The only
	 * case this should be used in is the lock and unlock ioctls as they
	 * should be able to proceed even when the controller is being used
	 * exclusively.
	 */
	NVME_IOCTL_EXCL_SKIP
} nvme_ioctl_excl_t;
461 
462 /*
463  * This structure represents the set of checks that we apply to ioctl's using
464  * the nvme_ioctl_common_t structure as part of validation.
465  */
/*
 * This structure represents the set of checks that we apply to ioctl's using
 * the nvme_ioctl_common_t structure as part of validation.
 */
typedef struct nvme_ioctl_check {
	/*
	 * This indicates whether or not the command in question allows a
	 * namespace to be specified at all. If this is false, a namespace minor
	 * cannot be used and a controller minor must leave the nsid set to
	 * zero.
	 */
	boolean_t nck_ns_ok;
	/*
	 * This indicates that a minor node corresponding to a namespace is
	 * allowed to issue this.
	 */
	boolean_t nck_ns_minor_ok;
	/*
	 * This indicates that the controller should be skipped from all of the
	 * following processing behavior. That is, it's allowed to specify
	 * whatever it wants in the nsid field, regardless if it is valid or
	 * not. This is required for some of the Identify Command options that
	 * list endpoints. This should generally not be used and the driver
	 * should still validate the nuance here.
	 */
	boolean_t nck_skip_ctrl;
	/*
	 * This indicates that if we're on the controller's minor and we don't
	 * have an explicit namespace ID (i.e. 0), should the namespace be
	 * rewritten to be the broadcast namespace.
	 */
	boolean_t nck_ctrl_rewrite;
	/*
	 * This indicates whether or not the broadcast NSID is acceptable for
	 * the controller node.
	 */
	boolean_t nck_bcast_ok;

	/*
	 * This indicates to the lock checking code what kind of exclusive
	 * access is required. This check occurs after any namespace rewriting
	 * has occurred. When looking at exclusivity, a broadcast namespace or
	 * namespace 0 indicate that the controller is the target, otherwise the
	 * target namespace will be checked for a write lock.
	 */
	nvme_ioctl_excl_t nck_excl;
} nvme_ioctl_check_t;
509 
/*
 * Constants: kernel tunables governing vendor-specific admin command
 * timeouts and payload sizes, defined in the driver proper.
 */
extern uint_t nvme_vendor_specific_admin_cmd_max_timeout;
extern uint32_t nvme_vendor_specific_admin_cmd_size;

/*
 * Common functions.
 */
/* Translate an NSID into the corresponding namespace state, if any. */
extern nvme_namespace_t *nvme_nsid2ns(nvme_t *, uint32_t);
/* Record an error (errno plus controller sct/sc) in an ioctl structure. */
extern boolean_t nvme_ioctl_error(nvme_ioctl_common_t *, nvme_ioctl_errno_t,
    uint32_t, uint32_t);
/* Check whether the controller reports at least the given NVMe version. */
extern boolean_t nvme_ctrl_atleast(nvme_t *, const nvme_version_t *);
/* Mark an ioctl structure as having completed successfully. */
extern void nvme_ioctl_success(nvme_ioctl_common_t *);
524 
/*
 * Validation related functions and kernel tunable limits. Each validator
 * checks one ioctl request's fields against controller capabilities and
 * driver limits before the request is executed.
 */
extern boolean_t nvme_validate_logpage(nvme_t *, nvme_ioctl_get_logpage_t *);
extern boolean_t nvme_validate_identify(nvme_t *, nvme_ioctl_identify_t *,
    boolean_t);
extern boolean_t nvme_validate_get_feature(nvme_t *,
    nvme_ioctl_get_feature_t *);
extern boolean_t nvme_validate_vuc(nvme_t *, nvme_ioctl_passthru_t *);
extern boolean_t nvme_validate_format(nvme_t *, nvme_ioctl_format_t *);
extern boolean_t nvme_validate_fw_load(nvme_t *, nvme_ioctl_fw_load_t *);
extern boolean_t nvme_validate_fw_commit(nvme_t *, nvme_ioctl_fw_commit_t *);

/*
 * Locking functions implementing the minor-node reader/writer locks
 * described by nvme_lock_t and nvme_minor_lock_info_t above.
 */
extern void nvme_rwlock(nvme_minor_t *, nvme_ioctl_lock_t *);
extern void nvme_rwunlock(nvme_minor_lock_info_t *, nvme_lock_t *);
/* Wake and fail all lock waiters when the controller is marked dead. */
extern void nvme_rwlock_ctrl_dead(void *);
extern void nvme_lock_init(nvme_lock_t *);
extern void nvme_lock_fini(nvme_lock_t *);
546 
547 #ifdef __cplusplus
548 }
549 #endif
550 
551 #endif /* _NVME_VAR_H */
552