/*
 * xref: /illumos-gate/usr/src/uts/common/io/nvme/nvme_var.h
 * (revision e00bdde3c6d406f40f53f3025defadc22f7ec31a)
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2024 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

21 #ifndef _NVME_VAR_H
22 #define	_NVME_VAR_H
23 
24 #include <sys/ddi.h>
25 #include <sys/sunddi.h>
26 #include <sys/blkdev.h>
27 #include <sys/taskq_impl.h>
28 #include <sys/list.h>
29 #include <sys/ddi_ufm.h>
30 #include <nvme_common.h>
31 
32 /*
33  * NVMe driver state
34  */
35 
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 
/*
 * Progress flags for a controller, kept in the n_progress member of
 * struct nvme. Every flag occupies a distinct bit, so any combination of them
 * can be recorded in one value. Presumably each bit marks a setup step that
 * has completed and therefore must be undone on teardown -- confirm against
 * the attach/detach code.
 */
typedef enum {
	NVME_PCI_CONFIG			= 1 << 0,
	NVME_FMA_INIT			= 1 << 1,
	NVME_REGS_MAPPED		= 1 << 2,
	NVME_ADMIN_QUEUE		= 1 << 3,
	NVME_CTRL_LIMITS		= 1 << 4,
	NVME_INTERRUPTS			= 1 << 5,
	NVME_UFM_INIT			= 1 << 6,
	NVME_MUTEX_INIT			= 1 << 7,
	NVME_MGMT_INIT			= 1 << 8
} nvme_progress_t;

/*
 * Progress flags for a single namespace, kept in the ns_progress member of
 * struct nvme_namespace. Only one bit is defined at present.
 */
typedef enum {
	NVME_NS_LOCK	= 1 << 0
} nvme_ns_progress_t;

/*
 * Device quirk flags, kept in the n_quirks member of struct nvme. Each quirk
 * is a separate bit.
 */
typedef enum {
	/*
	 * The controller fails to properly process commands on the admin queue
	 * if the first one has CID 0. Subsequent use of CID 0 doesn't present
	 * a problem.
	 */
	NVME_QUIRK_START_CID		= 1 << 0,
} nvme_quirk_t;

/*
 * Minimum and default values for the admin and I/O queue lengths, the
 * asynchronous event limit and the minimum block size. The queue lengths are
 * presumably counted in queue entries and the block size in bytes -- confirm
 * against the code that consumes them.
 */
#define	NVME_MIN_ADMIN_QUEUE_LEN	16
#define	NVME_MIN_IO_QUEUE_LEN		16
#define	NVME_DEFAULT_ADMIN_QUEUE_LEN	256
#define	NVME_DEFAULT_IO_QUEUE_LEN	1024
#define	NVME_DEFAULT_ASYNC_EVENT_LIMIT	10
#define	NVME_MIN_ASYNC_EVENT_LIMIT	1
#define	NVME_DEFAULT_MIN_BLOCK_SIZE	512


/*
 * Forward declarations. The structures below refer to one another by pointer,
 * so all of the typedef names are introduced before any structure is defined.
 */
typedef struct nvme nvme_t;
typedef struct nvme_namespace nvme_namespace_t;
typedef struct nvme_minor nvme_minor_t;
typedef struct nvme_lock nvme_lock_t;
typedef struct nvme_minor_lock_info nvme_minor_lock_info_t;
typedef struct nvme_dma nvme_dma_t;
typedef struct nvme_cmd nvme_cmd_t;
typedef struct nvme_cq nvme_cq_t;
typedef struct nvme_qpair nvme_qpair_t;
typedef struct nvme_task_arg nvme_task_arg_t;

/*
 * These states represent the minor's perspective. That is, of a minor's
 * namespace and controller lock, where is it?
 *
 * The value is stored in the nli_state member of nvme_minor_lock_info_t.
 * NVME_LOCK_STATE_UNLOCKED is deliberately zero, so zero-filled lock
 * information reads as unlocked.
 */
typedef enum {
	NVME_LOCK_STATE_UNLOCKED	= 0,
	NVME_LOCK_STATE_BLOCKED,
	NVME_LOCK_STATE_ACQUIRED
} nvme_minor_lock_state_t;

95 struct nvme_minor_lock_info {
96 	list_node_t nli_node;
97 	nvme_lock_t *nli_lock;
98 	nvme_minor_lock_state_t nli_state;
99 	nvme_lock_level_t nli_curlevel;
100 	/*
101 	 * While the minor points back to itself and the nvme_t should always
102 	 * point to the current controller, the namespace should only point to
103 	 * one if this is a particular namespace lock. The former two are
104 	 * initialized at minor initialization time.
105 	 */
106 	nvme_minor_t *nli_minor;
107 	nvme_t *nli_nvme;
108 	nvme_namespace_t *nli_ns;
109 	/*
110 	 * This is the common ioctl information that should be filled in when
111 	 * we're being woken up for any reason other than an interrupted signal.
112 	 * This should only be set while blocking.
113 	 */
114 	nvme_ioctl_common_t *nli_ioc;
115 	/*
116 	 * The following are provided for debugging purposes. In particular,
117 	 * information like the kthread_t and related that performed this should
118 	 * be considered suspect as it represents who took the operation, not
119 	 * who performed the operation (unless we're actively blocking).
120 	 */
121 	hrtime_t nli_last_change;
122 	uintptr_t nli_acq_kthread;
123 	pid_t nli_acq_pid;
124 };
125 
126 struct nvme_minor {
127 	/*
128 	 * The following three fields are set when this is created.
129 	 */
130 	id_t nm_minor;
131 	nvme_t *nm_ctrl;
132 	nvme_namespace_t *nm_ns;
133 	/*
134 	 * This link is used to index this minor on the global list of active
135 	 * open-related minors. This is only manipulated under the
136 	 * nvme_open_minors_mutex.
137 	 */
138 	avl_node_t nm_avl;
139 	/*
140 	 * Information related to locking. Note, there is no pointer to a locked
141 	 * controller as the only one can be the one specified here. This data
142 	 * is protected by the controller's n_minor_mutex.
143 	 */
144 	kcondvar_t nm_cv;
145 	nvme_minor_lock_info_t nm_ctrl_lock;
146 	nvme_minor_lock_info_t nm_ns_lock;
147 };
148 
149 struct nvme_lock {
150 	nvme_minor_lock_info_t *nl_writer;
151 	list_t nl_readers;
152 	list_t nl_pend_readers;
153 	list_t nl_pend_writers;
154 	/*
155 	 * The following are stats to indicate how often certain locking
156 	 * activities have occurred for debugging purposes.
157 	 */
158 	uint32_t nl_nwrite_locks;
159 	uint32_t nl_nread_locks;
160 	uint32_t nl_npend_writes;
161 	uint32_t nl_npend_reads;
162 	uint32_t nl_nnonblock;
163 	uint32_t nl_nsignals;
164 	uint32_t nl_nsig_unlock;
165 	uint32_t nl_nsig_blocks;
166 	uint32_t nl_nsig_acq;
167 };
168 
169 struct nvme_dma {
170 	ddi_dma_handle_t nd_dmah;
171 	ddi_acc_handle_t nd_acch;
172 	ddi_dma_cookie_t nd_cookie;
173 	uint_t nd_ncookie;
174 	caddr_t nd_memp;
175 	size_t nd_len;
176 	boolean_t nd_cached;
177 };
178 
179 struct nvme_cmd {
180 	struct list_node nc_list;
181 
182 	nvme_sqe_t nc_sqe;
183 	nvme_cqe_t nc_cqe;
184 
185 	void (*nc_callback)(void *);
186 	bd_xfer_t *nc_xfer;
187 	boolean_t nc_completed;
188 	boolean_t nc_dontpanic;
189 	uint16_t nc_sqid;
190 
191 	nvme_dma_t *nc_dma;
192 	nvme_dma_t *nc_prp; /* DMA for PRP lists */
193 
194 	kmutex_t nc_mutex;
195 	kcondvar_t nc_cv;
196 
197 	taskq_ent_t nc_tqent;
198 	nvme_t *nc_nvme;
199 };
200 
201 struct nvme_cq {
202 	size_t ncq_nentry;
203 	uint16_t ncq_id;
204 
205 	nvme_dma_t *ncq_dma;
206 	nvme_cqe_t *ncq_cq;
207 	uint_t ncq_head;
208 	uint_t ncq_tail;
209 	uintptr_t ncq_hdbl;
210 	int ncq_phase;
211 
212 	taskq_t *ncq_cmd_taskq;
213 
214 	kmutex_t ncq_mutex;
215 };
216 
217 struct nvme_qpair {
218 	size_t nq_nentry;
219 
220 	/* submission fields */
221 	nvme_dma_t *nq_sqdma;
222 	nvme_sqe_t *nq_sq;
223 	uint_t nq_sqhead;
224 	uint_t nq_sqtail;
225 	uintptr_t nq_sqtdbl;
226 
227 	/* completion */
228 	nvme_cq_t *nq_cq;
229 
230 	/* shared structures for completion and submission */
231 	nvme_cmd_t **nq_cmd;	/* active command array */
232 	uint16_t nq_next_cmd;	/* next potential empty queue slot */
233 	uint_t nq_active_cmds;	/* number of active cmds */
234 
235 	kmutex_t nq_mutex;	/* protects shared state */
236 	ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
237 };
238 
239 struct nvme {
240 	dev_info_t *n_dip;
241 	nvme_progress_t n_progress;
242 	nvme_quirk_t n_quirks;
243 
244 	caddr_t n_regs;
245 	ddi_acc_handle_t n_regh;
246 
247 	kmem_cache_t *n_cmd_cache;
248 	kmem_cache_t *n_prp_cache;
249 
250 	size_t n_inth_sz;
251 	ddi_intr_handle_t *n_inth;
252 	int n_intr_cnt;
253 	uint_t n_intr_pri;
254 	int n_intr_cap;
255 	int n_intr_type;
256 	int n_intr_types;
257 
258 	ddi_acc_handle_t n_pcicfg_handle;
259 	uint16_t n_vendor_id;
260 	uint16_t n_device_id;
261 	uint16_t n_subsystem_vendor_id;
262 	uint16_t n_subsystem_device_id;
263 	uint8_t n_revision_id;
264 
265 	char *n_product;
266 	char *n_vendor;
267 
268 	nvme_version_t n_version;
269 	boolean_t n_dead;
270 	nvme_ioctl_errno_t n_dead_status;
271 	taskq_ent_t n_dead_tqent;
272 	boolean_t n_strict_version;
273 	boolean_t n_ignore_unknown_vendor_status;
274 	uint32_t n_admin_queue_len;
275 	uint32_t n_io_squeue_len;
276 	uint32_t n_io_cqueue_len;
277 	uint16_t n_async_event_limit;
278 	uint_t n_min_block_size;
279 	uint16_t n_abort_command_limit;
280 	uint64_t n_max_data_transfer_size;
281 	boolean_t n_write_cache_present;
282 	boolean_t n_write_cache_enabled;
283 	int n_error_log_len;
284 	boolean_t n_async_event_supported;
285 	int n_submission_queues;
286 	int n_completion_queues;
287 
288 	int n_nssr_supported;
289 	int n_doorbell_stride;
290 	int n_timeout;
291 	int n_arbitration_mechanisms;
292 	int n_cont_queues_reqd;
293 	int n_max_queue_entries;
294 	int n_pageshift;
295 	int n_pagesize;
296 
297 	uint32_t n_namespace_count;
298 	uint_t n_namespaces_attachable;
299 	uint_t n_ioq_count;
300 	uint_t n_cq_count;
301 
302 	/*
303 	 * This is cached identify controller and common namespace data that
304 	 * exists in the system. This generally can be used in the kernel;
305 	 * however, we have to be careful about what we use here because these
306 	 * values are not refreshed after attach. Therefore these are good for
307 	 * answering the question what does the controller support or what is in
308 	 * the common namespace information, but not otherwise. That means you
309 	 * shouldn't use this to try to answer how much capacity is still in the
310 	 * controller because this information is just cached.
311 	 */
312 	nvme_identify_ctrl_t *n_idctl;
313 	nvme_identify_nsid_t *n_idcomns;
314 
315 	/* Pointer to the admin queue, which is always queue 0 in n_ioq. */
316 	nvme_qpair_t *n_adminq;
317 	/*
318 	 * All command queues, including the admin queue.
319 	 * Its length is: n_ioq_count + 1.
320 	 */
321 	nvme_qpair_t **n_ioq;
322 	nvme_cq_t **n_cq;
323 
324 	nvme_namespace_t *n_ns;
325 
326 	ddi_dma_attr_t n_queue_dma_attr;
327 	ddi_dma_attr_t n_prp_dma_attr;
328 	ddi_dma_attr_t n_sgl_dma_attr;
329 	ddi_device_acc_attr_t n_reg_acc_attr;
330 	ddi_iblock_cookie_t n_fm_ibc;
331 	int n_fm_cap;
332 
333 	ksema_t n_abort_sema;
334 
335 	/* protects namespace management operations */
336 	kmutex_t n_mgmt_mutex;
337 
338 	/*
339 	 * This lock protects the minor node locking state across the controller
340 	 * and all related namespaces.
341 	 */
342 	kmutex_t n_minor_mutex;
343 	nvme_lock_t n_lock;
344 
345 	/* errors detected by driver */
346 	uint32_t n_dma_bind_err;
347 	uint32_t n_abort_failed;
348 	uint32_t n_cmd_timeout;
349 	uint32_t n_cmd_aborted;
350 	uint32_t n_wrong_logpage;
351 	uint32_t n_unknown_logpage;
352 	uint32_t n_too_many_cookies;
353 	uint32_t n_unknown_cid;
354 
355 	/* errors detected by hardware */
356 	uint32_t n_data_xfr_err;
357 	uint32_t n_internal_err;
358 	uint32_t n_abort_rq_err;
359 	uint32_t n_abort_sq_del;
360 	uint32_t n_nvm_cap_exc;
361 	uint32_t n_nvm_ns_notrdy;
362 	uint32_t n_nvm_ns_formatting;
363 	uint32_t n_inv_cq_err;
364 	uint32_t n_inv_qid_err;
365 	uint32_t n_max_qsz_exc;
366 	uint32_t n_inv_int_vect;
367 	uint32_t n_inv_log_page;
368 	uint32_t n_inv_format;
369 	uint32_t n_inv_q_del;
370 	uint32_t n_cnfl_attr;
371 	uint32_t n_inv_prot;
372 	uint32_t n_readonly;
373 
374 	/* errors reported by asynchronous events */
375 	uint32_t n_diagfail_event;
376 	uint32_t n_persistent_event;
377 	uint32_t n_transient_event;
378 	uint32_t n_fw_load_event;
379 	uint32_t n_reliability_event;
380 	uint32_t n_temperature_event;
381 	uint32_t n_spare_event;
382 	uint32_t n_vendor_event;
383 	uint32_t n_notice_event;
384 	uint32_t n_unknown_event;
385 
386 	/* hot removal NDI event handling */
387 	ddi_eventcookie_t n_rm_cookie;
388 	ddi_callback_id_t n_ev_rm_cb_id;
389 
390 	/* DDI UFM handle */
391 	ddi_ufm_handle_t *n_ufmh;
392 	/* Cached Firmware Slot Information log page */
393 	nvme_fwslot_log_t *n_fwslot;
394 	/* Lock protecting the cached firmware slot info */
395 	kmutex_t n_fwslot_mutex;
396 };
397 
398 struct nvme_namespace {
399 	nvme_t *ns_nvme;
400 	nvme_ns_progress_t ns_progress;
401 	uint8_t ns_eui64[8];
402 	uint8_t	ns_nguid[16];
403 	char	ns_name[11];
404 
405 	bd_handle_t ns_bd_hdl;
406 
407 	uint32_t ns_id;
408 	size_t ns_block_count;
409 	size_t ns_block_size;
410 	size_t ns_best_block_size;
411 
412 	boolean_t ns_allocated;
413 	boolean_t ns_active;
414 	boolean_t ns_ignore;
415 	boolean_t ns_attached;
416 
417 	nvme_identify_nsid_t *ns_idns;
418 
419 	/*
420 	 * Namespace lock, see the theory statement for more information.
421 	 */
422 	nvme_lock_t ns_lock;
423 
424 	/*
425 	 * If a namespace has neither NGUID nor EUI64, we create a devid in
426 	 * nvme_prepare_devid().
427 	 */
428 	char *ns_devid;
429 };
430 
431 struct nvme_task_arg {
432 	nvme_t *nt_nvme;
433 	nvme_cmd_t *nt_cmd;
434 };
435 
/*
 * The kind of exclusive access an ioctl requires. This is the type of the
 * nck_excl member of nvme_ioctl_check_t.
 */
typedef enum {
	/*
	 * This indicates that there is no exclusive access required for this
	 * operation. However, this operation will fail if someone attempts to
	 * perform this operation and someone else holds a write lock.
	 */
	NVME_IOCTL_EXCL_NONE	= 0,
	/*
	 * This indicates that a write lock is required to perform the
	 * operation.
	 */
	NVME_IOCTL_EXCL_WRITE,
	/*
	 * This indicates that the exclusive check should be skipped. The only
	 * case this should be used in is the lock and unlock ioctls as they
	 * should be able to proceed even when the controller is being used
	 * exclusively.
	 */
	NVME_IOCTL_EXCL_SKIP
} nvme_ioctl_excl_t;

457 /*
458  * This structure represents the set of checks that we apply to ioctl's using
459  * the nvme_ioctl_common_t structure as part of validation.
460  */
461 typedef struct nvme_ioctl_check {
462 	/*
463 	 * This indicates whether or not the command in question allows a
464 	 * namespace to be specified at all. If this is false, a namespace minor
465 	 * cannot be used and a controller minor must leave the nsid set to
466 	 * zero.
467 	 */
468 	boolean_t nck_ns_ok;
469 	/*
470 	 * This indicates that a minor node corresponding to a namespace is
471 	 * allowed to issue this.
472 	 */
473 	boolean_t nck_ns_minor_ok;
474 	/*
475 	 * This indicates that the controller should be skipped from all of the
476 	 * following processing behavior. That is, it's allowed to specify
477 	 * whatever it wants in the nsid field, regardless if it is valid or
478 	 * not. This is required for some of the Identify Command options that
479 	 * list endpoints. This should generally not be used and the driver
480 	 * should still validate the nuance here.
481 	 */
482 	boolean_t nck_skip_ctrl;
483 	/*
484 	 * This indicates that if we're on the controller's minor and we don't
485 	 * have an explicit namespace ID (i.e. 0), should the namespace be
486 	 * rewritten to be the broadcast namespace.
487 	 */
488 	boolean_t nck_ctrl_rewrite;
489 	/*
490 	 * This indicates whether or not the broadcast NSID is acceptable for
491 	 * the controller node.
492 	 */
493 	boolean_t nck_bcast_ok;
494 
495 	/*
496 	 * This indicates to the lock checking code what kind of exclusive
497 	 * access is required. This check occurs after any namespace rewriting
498 	 * has occurred. When looking at exclusivity, a broadcast namespace or
499 	 * namespace 0 indicate that the controller is the target, otherwise the
500 	 * target namespace will be checked for a write lock.
501 	 */
502 	nvme_ioctl_excl_t nck_excl;
503 } nvme_ioctl_check_t;
504 
505 /*
506  * Constants
507  */
508 extern uint_t nvme_vendor_specific_admin_cmd_max_timeout;
509 extern uint32_t nvme_vendor_specific_admin_cmd_size;
510 
511 /*
512  * Common functions.
513  */
514 extern nvme_namespace_t *nvme_nsid2ns(nvme_t *, uint32_t);
515 extern boolean_t nvme_ioctl_error(nvme_ioctl_common_t *, nvme_ioctl_errno_t,
516     uint32_t, uint32_t);
517 extern boolean_t nvme_ctrl_atleast(nvme_t *, const nvme_version_t *);
518 extern void nvme_ioctl_success(nvme_ioctl_common_t *);
519 
520 /*
521  * Validation related functions and kernel tunable limits.
522  */
523 extern boolean_t nvme_validate_logpage(nvme_t *, nvme_ioctl_get_logpage_t *);
524 extern boolean_t nvme_validate_identify(nvme_t *, nvme_ioctl_identify_t *,
525     boolean_t);
526 extern boolean_t nvme_validate_get_feature(nvme_t *,
527     nvme_ioctl_get_feature_t *);
528 extern boolean_t nvme_validate_vuc(nvme_t *, nvme_ioctl_passthru_t *);
529 extern boolean_t nvme_validate_format(nvme_t *, nvme_ioctl_format_t *);
530 extern boolean_t nvme_validate_fw_load(nvme_t *, nvme_ioctl_fw_load_t *);
531 extern boolean_t nvme_validate_fw_commit(nvme_t *, nvme_ioctl_fw_commit_t *);
532 
533 /*
534  * Locking functions
535  */
536 extern void nvme_rwlock(nvme_minor_t *, nvme_ioctl_lock_t *);
537 extern void nvme_rwunlock(nvme_minor_lock_info_t *, nvme_lock_t *);
538 extern void nvme_rwlock_ctrl_dead(void *);
539 extern void nvme_lock_init(nvme_lock_t *);
540 extern void nvme_lock_fini(nvme_lock_t *);
541 
542 #ifdef __cplusplus
543 }
544 #endif
545 
546 #endif /* _NVME_VAR_H */
547