/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2025 Oxide Computer Company
 */

/*
 * Programmatic interface to NVMe Devices
 *
 * libnvme exists to provide a means of performing non-I/O related operations on
 * an NVMe device. This is intended to allow software, regardless of whether it
 * is part of illumos or not, to operate on NVMe devices and perform most of the
 * administrative and operator tasks that might come up. This library does not
 * provide a stable interface yet. The rest of this block comment covers the
 * library's organization and the background on why it looks the way it does.
 *
 * --------------------
 * Library Organization
 * --------------------
 *
 * There are two large classes of source files that make up this library
 * currently:
 *
 *   1. Source code that implements the library's interfaces is found alongside
 *      this file in lib/libnvme/common. This code is generally organized based
 *      around the portion of the NVMe specification that it implements. So for
 *      example, code that implements logic related to features is found in
 *      libnvme_feature.c, formatting namespaces in libnvme_format.c, log
 *      pages in libnvme_log.c, etc. All files in the library begin with
 *      'libnvme_' as a way to help namespace the file names from the second set
 *      of files.
 *
 *   2. Validation logic that is shared between libnvme and the kernel is found
 *      in common/nvme/. While the kernel must validate requests regardless, we
 *      leverage this shared information as a means for trying to ensure that we
 *      have useful errors early. That code is factored in a way to facilitate
 *      easier unit testing.
 *
 * Because of the nature of this split, all of the opaque structures that we
 * create and their relationships are maintained in the library (group 1). All
 * of the logic in group 2 is designed to be constant data tables and functions
 * that are fed information about the controller they are operating on in order
 * to answer questions about it.
 *
 * There are several general classes of interfaces and related structures that
 * we have in the library. We break them into the following general categories
 * based on their purpose:
 *
 * DISCOVERY
 *
 * One of the large responsibilities of this library is helping someone discover
 * information about something, whether that be a controller, a namespace, a log
 * page, a feature, a unique command, etc. Information about one of these items
 * is contained in a generally opaque discovery structure. For example, the
 * nvme_log_disc_t.
 *
 * The goal of these structures is to contain all of the metadata for working
 * with the object in question. Continuing on the log page discovery example, it
 * can tell us information about what fields are required, whether or not the
 * log might be supported, whether it operates on a controller, a namespace, or
 * something else, as well as more human-usable things such as names and
 * descriptions.
 *
 * Discovery objects are both for humans and for programmatic consumption. There
 * are several cases where requests can be created directly from discovery
 * objects. A well designed discovery object can allow a general implementation
 * of a consumer such as nvmeadm to build up a request without having to
 * hardcode everything about what is needed for each request (though most
 * consumers still need to have information about the actual contents, meaning,
 * and semantics of a log or feature).
 *
 * Discovery objects are obtained in two general ways. The first is using one of
 * the iterator/callback based functions to discover a given class of data. The
 * second path is that several of the functions which operate based on the name
 * of something, e.g. nvme_log_req_init_by_name(),
 * nvme_get_feat_req_init_by_name(), etc. will return a discovery object.
 *
 * When a discovery object is returned based on iteration (more below), the
 * memory is owned by the iterator. When it is returned by a request
 * initialization function, then it has its own lifetime and must be freed.
 * We try to make this distinction clear in the API based on whether or not the
 * discovery object is 'const'.
 *
 * All discovery objects should be fully filled out before they are handed back
 * to a caller. It is an explicit design goal that every function that gets data
 * from the discovery structure operates on a const version of the pointer. This
 * is the hint that you cannot perform additional I/O or related operations
 * after handing out the discovery structure. Attempts to loosen this constraint
 * should be considered carefully due to how we communicate ownership.
 *
 * ITERATORS
 *
 * A common pattern of the library is iterating over items. This includes
 * controllers and namespaces, but also as part of discovering what specific
 * logs, commands, features, etc. are actually supported by the device.
 * Iteration always follows the same general pattern:
 *
 * 1. An iterator is initialized with a call to nvme_<name>_discover_init().
 * This will generally return a structure of the form nvme_<name>_iter_t. This
 * structure contains the memory for the corresponding value that is returned
 * from the stepping function in (2).
 *
 * 2. To actually pull values out of an iterator, one must call the
 * nvme_<name>_step() function for the iterator. This will return a
 * corresponding nvme_<name>_disc_t structure that is opaque and has a suite of
 * functions that are usable for getting information out from it. This structure
 * is valid only until the next time the nvme_<name>_step() is called. The
 * return value of step indicates the state of the data and indicates whether or
 * not there is an error, the iterator has finished, or we successfully stepped
 * and the data is filled out.
 *
 * If discovery data needs to outlive a given iteration, then it can be
 * duplicated which will give it a separate lifetime, though that comes with
 * the responsibility that it must then be freed.
 *
 * 3. To finish using iterators, one finally calls the corresponding
 * nvme_<name>_discover_fini(). That will deallocate the iterator structure and
 * finish everything up.
 *
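 * As a concrete sketch (error handling elided for brevity), controller
 * discovery with the functions implemented later in this file looks roughly
 * like:
 *
 *	nvme_t *nvme = nvme_init();
 *	nvme_ctrl_iter_t *iter;
 *	const nvme_ctrl_disc_t *disc;
 *
 *	if (nvme != NULL && nvme_ctrl_discover_init(nvme, &iter)) {
 *		while (nvme_ctrl_discover_step(iter, &disc) ==
 *		    NVME_ITER_VALID) {
 *			(void) printf("found nvme%d\n",
 *			    di_instance(nvme_ctrl_disc_devi(disc)));
 *		}
 *		nvme_ctrl_discover_fini(iter);
 *	}
 *	nvme_fini(nvme);
 *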
 * REQUESTS
 *
 * One of the chief goals of this library is to be able to perform requests.
 * Each request has a structure that can be initialized, filled out, and then
 * executed. A request structure can be reused multiple times with minor
 * adjustments in-between (though changes aren't required). Request structures
 * are either initialized in a blank mode where every value must be filled out
 * or they can be initialized through their discovery object (or the common name
 * of such an object).
 *
 * When a request structure is initialized through a discovery object, it
 * automatically sets several of the fields, knows which ones are still required
 * to be set, and which fields cannot be set. For example, if you create a get
 * log page request from a log discovery object, it will not allow you to change
 * the log page you're requesting; however, in return you don't have to specify
 * the command set interface or log identifier.
 *
 * Request objects are tied to a controller. See 'Parallelism, Thread Safety,
 * and Errors' for more information.
 *
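 * To sketch the request flow (a rough sketch: the exact prototypes live in
 * libnvme.h and the names used here should be checked against it rather than
 * taken as authoritative), fetching the health log page by name might look
 * like:
 *
 *	nvme_log_disc_t *disc;
 *	nvme_log_req_t *req;
 *	uint8_t buf[4096];
 *
 *	if (nvme_log_req_init_by_name(ctrl, "health", 0, &disc, &req)) {
 *		if (nvme_log_req_set_output(req, buf, sizeof (buf)) &&
 *		    nvme_log_req_exec(req)) {
 *			... consume buf ...
 *		}
 *		nvme_log_req_fini(req);
 *		nvme_log_disc_free(disc);
 *	}
 *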
 * INFORMATION SNAPSHOTS
 *
 * To get information about a namespace or controller, one has to take an
 * information snapshot. Once an information snapshot is obtained, this snapshot
 * answers all questions about the controller with a mostly consistent set of
 * point-in-time data. The main reason for this design was to try and simplify
 * where errors can occur and to provide a straightforward serialization point
 * so that the raw underlying data could be gathered on one system and then
 * interpreted later on another.
 *
 * The only fallible operations on a snapshot involve data that is not
 * guaranteed to exist for all NVMe controllers.
 *
 * LIBRARY, CONTROLLER, NAMESPACE and SNAPSHOT HANDLES
 *
 * The last major set of types used in this library are opaque handles. As you
 * might have guessed given the request structures, all of the objects which
 * represent something are opaque. Each library handle is independent of one
 * another and each controller handle is independent of one another. In general,
 * it is expected that only a single controller handle is used at a given time
 * for a given library handle, but this is not currently enforced. Error
 * information and parallelism are tied into this, see 'Parallelism, Thread
 * Safety, and Errors' for more information.
 *
 * -----------------
 * Opaque Structures
 * -----------------
 *
 * One of the things that might stand out in libnvme is the use of opaque
 * structures everywhere with functions to access every arbitrary piece of data.
 * This and the function pattern around building up a request were done to try
 * and deal with the evolutionary nature of the NVMe specification. If you look
 * at the various requests, with the exception of firmware download, almost
 * every request has added additional features through the spec revisions. NVMe
 * 2.0 changed most things again with the requirement to specify the command set
 * interface.
 *
 * While the way that the NVMe specification has done this is quite reasonable,
 * it makes it much more difficult to use a traditional series of arguments to
 * functions or a structure without having to try to version the symbol through
 * clever games. If instead we accept that the specification will change and
 * that the specification is always taking these additional arguments out of
 * values that must be zero, then an opaque request structure where you have to
 * make an explicit function call and recompile to get slightly different
 * behavior is mostly reasonable. We may not be able to be perfect given we're
 * at the mercy of the specification, but at least this is better than the
 * alternative.
 *
 * This is ultimately why all the request structures are opaque and use a
 * pseudo-builder pattern to fill out the request information. Further evidence
 * to this point is that there was no way to avoid changing every kernel
 * structure here while retaining semantic operations. No one wants to manually
 * assemble cdw12-15 here. That's not where the library can add value.
 *
 * Similarly, for all discovery objects we ended up utilizing opaque objects.
 * The main reason here is that we want to be able to embed this library as a
 * committed interface in other languages, and having the discovery structures
 * be something that everyone can see would make them harder to extend. While
 * this concern is somewhat more theoretical given the iterator pattern, given
 * the other bits in the request structure we decided to lean into the
 * opaqueness.
 *
 * --------------------------------------
 * Parallelism, Thread Safety, and Errors
 * --------------------------------------
 *
 * Major design points for the library are how we achieve thread-safety, how
 * ownership works, where errors appear, and what degree of parallelism is
 * achievable. To work through this we look at a few different things:
 *
 * 1. The degree to which the hardware allows for parallelism
 * 2. The degree to which users might desire parallelism
 * 3. The ergonomics of getting and storing errors
 *
 * The NVMe specification allows for different degrees of admin command
 * parallelism on a per-command basis. This is discoverable, but the main point
 * is that there is a class of commands where only one can be outstanding at a
 * time, which covers most of the destructive commands like Format NVM,
 * Activate Firmware, etc. Our expectation to some extent is that most admin
 * queue commands don't need to be issued in parallel; however, beyond how we
 * structure the library and error handling, we don't try to enforce that here.
 * The kernel does do some enforcement through requiring mandatory write locks
 * to perform some operations.
 *
 * When we get to how folks want to use this, during the initial design phase
 * we mostly theorized based on how nvmeadm is using it today and how various
 * daemons like a FRU monitor or an appliance kit's software might want to
 * interact with it. Our general starting assumption is that it's very
 * reasonable for each discovered controller to be handled in parallel, but that
 * operations on a controller itself are likely serial given that we're not
 * issuing I/O through this mechanism. If we were, then that'd be an entirely
 * different set of constraints.
 *
 * To discuss the perceived ergonomics, we need to first discuss what error
 * information we want to be able to have. It's an important goal of both the
 * NVMe driver and this library to give useful semantic errors. In particular,
 * for any operation we want to make sure that we include the following
 * information:
 *
 *   o A hopefully distinguishable semantic error
 *   o Saving errno as a system error if relevant (e.g. if open(2) failed)
 *   o A message for humans that gives more specifics about what happened and is
 *     intended to be passed along to the output of a command or another error
 *     message.
 *   o If a controller error occurs, we want to be able to provide the
 *     controller's sc (status code) and sct (status code type).
 *
 * With this we get to the questions around ergonomics and related, which are
 * entirely subjective. Given that we want to capture that information, how do
 * we best do so with the tooling that we have? When the library was first being
 * prototyped, all errors were on the nvme_t, basically the top-level handle.
 * This meant that each operation on a controller had to be done serially or you
 * would have to use different handles. However, the simplicity was that there
 * was one thing to check.
 *
 * This evolution changed slightly when we introduced information snapshots.
 * Because the information snapshots are meant to be separate entities whose
 * lifetime can extend beyond the nvme_t library handle, they ended up
 * developing their own error codes and functions. This has been okay because
 * there aren't too many use cases there, though the need to duplicate error
 * handling functions is a bit painful.
 *
 * From there, we did consider what if each request had its own error
 * information that could be extracted. That would turn into a lot of functions
 * to get at that data. The controller's allowed parallelism for admin commands
 * varies based on each command. Some commands must occur when there are no
 * other admin commands on the controller and others when there is nothing
 * on the namespace. However, due to that nuance, it would lead to forcing the
 * consumer to understand the controller's specifics more than is often
 * necessary for a given request. To add to that, it'd also just be a pain to
 * try to get all the error information out in a different way and the consumers
 * we started writing in this fashion were not looking good.
 *
 * We also considered whether we could consolidate all the error functions on
 * each request into one structure that we get, but that didn't move the needle
 * too much. It also raised some more concerns around how we minimize races and
 * how data changes around that.
 *
 * So all of this led us to our current compromise position: we allow for
 * parallelism at the controller level. More specifically:
 *
 * 1. Operations which take the nvme_t handle set errors on it and must operate
 *    serially. That is, the nvme_t should only be used from one thread at any
 *    time, but may move between threads. Errors are set on it.
 *
 * 2. The nvme_ctrl_t has its own error information. A given nvme_ctrl_t should
 *    only be used serially; however, different ones can be used in parallel. A
 *    controller doesn't guarantee exclusivity. That requires an explicit
 *    locking operation.
 *
 * 3. Both request structures and namespaces place their errors on the
 *    corresponding controller that they were created from. Therefore the
 *    per-controller serialization in (2) applies here as well. If two requests
 *    are tied to different controllers, they can proceed in parallel.
 *
 * 4. Once a controller or namespace snapshot is obtained, they fall into a
 *    similar pattern: each one can be operated on in parallel, but generally
 *    one should only operate on a single one serially.
 *
 * Other than the constraints defined above, the library does not care which
 * thread an operation occurs on; handles can be moved to wherever they need to
 * be. Locking and related enforcement in the kernel is based on the open file
 * descriptor to the controller.
 *
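 * As a short illustration of (2) and (3): a consumer checks the boolean
 * return of an operation and then asks the controller handle for the error
 * detail. This sketch assumes the public error accessors nvme_ctrl_err() and
 * nvme_ctrl_errmsg() from libnvme.h:
 *
 *	if (!nvme_ns_bd_attach(ns)) {
 *		(void) fprintf(stderr, "attach failed: %s (0x%x)\n",
 *		    nvme_ctrl_errmsg(ctrl), nvme_ctrl_err(ctrl));
 *	}
 *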
 * ----------------
 * Field Validation
 * ----------------
 *
 * Every request is made up of fields that correspond to parts of the NVMe
 * specification. Our requests operate in terms of the logical fields that we
 * opt to expose and that the kernel knows how to consume. In general, we don't
 * expose the raw cdw values that make up the commands (except for the vendor
 * unique commands or arguments that are explicitly that way ala get features).
 * While operating on raw cdw arguments would be a simple way to create ABI
 * stability, it would leave everyone having to break up all the fields
 * themselves and, we believe, end up somewhat more error prone than the
 * interfaces we expose today.
 *
 * Requests are created in one of two ways today: they are either initialized
 * from corresponding discovery data, e.g. nvme_log_req_init_by_disc() and
 * nvme_get_feat_req_init_by_name(), or one creates a raw request ala
 * nvme_get_feat_req_init(). In the former cases, we fill out a bunch of the
 * fields that would normally need to be set such as the log or feature ID. We
 * also will note which fields are allowed and expected. For example, the health
 * log page does not take or expect a lsp (log specific parameter) or related
 * and therefore we can flag that with an _UNUSE class error. Conversely,
 * requests that are created from their raw form will not have any such error
 * checking performed until they are finalized and checked by the kernel. The
 * set of fields that can be set in a request is usually tracked in the
 * structure with a member of the form <prefix>_allow.
 *
 * One set of library error checking that is uniform between both types is that
 * of missing fields. There are minimum fields that must be set for different
 * types of requests. That check will always be performed regardless of the path
 * that is taken through the system. Tracking which members must still be set is
 * done by a member of the form <prefix>_need.
 *
 * When we perform validation, we try to push the vast majority of it into the
 * common validation code that is shared between the kernel and userland. This
 * is wrapped up through the nvme_field_check_one() logic. The common code will
 * check if the field is supported by the controller (generating an _UNSUP class
 * error if not) and if the value of the field is within a valid range
 * (generating a _RANGE class error if not).
 *
 * While we try to fold as many of these checks into the common code as
 * possible, it isn't perfect and some things have to be checked outside of
 * that. Those consist of the following general cases:
 *
 * 1) Items that are not semantically fields in the actual command but are
 * things that we are tracking ourselves in the library. An example of this
 * would be fields in the vuc request structure that we are synthesizing
 * ourselves.
 *
 * 2) While the field logic has the specifics of what controller is being
 * operated upon, it doesn't have all the knowledge of what things can be
 * combined or not. It can answer the specifics about its field, but cannot look
 * at the broader request.
 *
 * As a result, there are some duplicated checks in the library and the kernel,
 * though several are left just to the kernel. However, the vast majority of
 * validation does happen through these common routines, which leaves the
 * library's nvme_<type>_req_set_<field> functions as, generally, wrappers
 * around the common checking code that also update our tracking of which
 * fields are set so that we can issue an ioctl.
 */

#include <stdlib.h>
#include <stdarg.h>
#include <libdevinfo.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <upanic.h>

#include "libnvme_impl.h"

bool
nvme_vers_ctrl_atleast(const nvme_ctrl_t *ctrl, const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&ctrl->nc_vers, targ));
}

bool
nvme_vers_ctrl_info_atleast(const nvme_ctrl_info_t *ci,
    const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&ci->nci_vers, targ));
}

bool
nvme_vers_ns_info_atleast(const nvme_ns_info_t *info,
    const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&info->nni_vers, targ));
}
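
/*
 * Example: a caller that wants to rely on a field introduced in a later NVMe
 * revision (e.g. the namespace NGUID, added in NVMe 1.2) can gate its use on
 * the controller's reported version, as nvme_guid_valid() below does:
 *
 *	if (nvme_vers_ctrl_atleast(ctrl, &nvme_vers_1v2)) {
 *		... the NGUID field is meaningful here ...
 *	}
 */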

bool
nvme_guid_valid(const nvme_ctrl_t *ctrl, const uint8_t guid[16])
{
	const uint8_t zero_guid[16] = { 0 };

	return (nvme_vers_ctrl_atleast(ctrl, &nvme_vers_1v2) &&
	    memcmp(zero_guid, guid, sizeof (zero_guid)) != 0);
}

bool
nvme_eui64_valid(const nvme_ctrl_t *ctrl, const uint8_t eui64[8])
{
	const uint8_t zero_eui[8] = { 0 };

	return (nvme_vers_ctrl_atleast(ctrl, &nvme_vers_1v1) &&
	    memcmp(zero_eui, eui64, sizeof (zero_eui)) != 0);
}

int
nvme_format_nguid(const uint8_t nguid[16], char *buf, size_t len)
{
	return (snprintf(buf, len, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X"
	    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
	    nguid[0], nguid[1], nguid[2], nguid[3], nguid[4], nguid[5],
	    nguid[6], nguid[7], nguid[8], nguid[9], nguid[10], nguid[11],
	    nguid[12], nguid[13], nguid[14], nguid[15]));
}

int
nvme_format_eui64(const uint8_t eui64[8], char *buf, size_t len)
{
	return (snprintf(buf, len, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
	    eui64[0], eui64[1], eui64[2], eui64[3], eui64[4], eui64[5],
	    eui64[6], eui64[7]));
}
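
/*
 * Example: formatting a namespace's NGUID for display. NVME_NGUID_NAMELEN
 * (used by nvme_ns_init_by_name_cb() below) is large enough for either
 * identifier. A sketch using the discovery accessors defined later in this
 * file:
 *
 *	const uint8_t *nguid = nvme_ns_disc_nguid(disc);
 *	char buf[NVME_NGUID_NAMELEN];
 *
 *	if (nguid != NULL) {
 *		(void) nvme_format_nguid(nguid, buf, sizeof (buf));
 *	}
 */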

void
nvme_fini(nvme_t *nvme)
{
	if (nvme == NULL)
		return;

	if (nvme->nh_devinfo != DI_NODE_NIL) {
		di_fini(nvme->nh_devinfo);
	}

	free(nvme);
}

nvme_t *
nvme_init(void)
{
	nvme_t *nvme;

	nvme = calloc(1, sizeof (nvme_t));
	if (nvme == NULL) {
		return (NULL);
	}

	nvme->nh_devinfo = di_init("/", DINFOCPYALL);
	if (nvme->nh_devinfo == DI_NODE_NIL) {
		nvme_fini(nvme);
		return (NULL);
	}

	return (nvme);
}

void
nvme_ctrl_discover_fini(nvme_ctrl_iter_t *iter)
{
	free(iter);
}

nvme_iter_t
nvme_ctrl_discover_step(nvme_ctrl_iter_t *iter, const nvme_ctrl_disc_t **discp)
{
	di_minor_t m;

	*discp = NULL;
	if (iter->ni_done) {
		return (NVME_ITER_DONE);
	}

	for (;;) {
		if (iter->ni_cur == NULL) {
			iter->ni_cur = di_drv_first_node("nvme",
			    iter->ni_nvme->nh_devinfo);
		} else {
			iter->ni_cur = di_drv_next_node(iter->ni_cur);
		}

		if (iter->ni_cur == NULL) {
			iter->ni_done = true;
			return (NVME_ITER_DONE);
		}

		for (m = di_minor_next(iter->ni_cur, DI_MINOR_NIL);
		    m != DI_MINOR_NIL; m = di_minor_next(iter->ni_cur, m)) {
			if (strcmp(di_minor_nodetype(m),
			    DDI_NT_NVME_NEXUS) == 0) {
				break;
			}
		}

		if (m == DI_MINOR_NIL) {
			continue;
		}

		iter->ni_disc.ncd_devi = iter->ni_cur;
		iter->ni_disc.ncd_minor = m;
		*discp = &iter->ni_disc;
		return (NVME_ITER_VALID);
	}

	return (NVME_ITER_DONE);
}

bool
nvme_ctrl_discover_init(nvme_t *nvme, nvme_ctrl_iter_t **iterp)
{
	nvme_ctrl_iter_t *iter;

	if (iterp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_iter_t output pointer: %p", iterp));
	}

	iter = calloc(1, sizeof (nvme_ctrl_iter_t));
	if (iter == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ctrl_iter_t: %s",
		    strerror(e)));
	}
	iter->ni_nvme = nvme;
	*iterp = iter;
	return (nvme_success(nvme));
}

bool
nvme_ctrl_discover(nvme_t *nvme, nvme_ctrl_disc_f func, void *arg)
{
	nvme_ctrl_iter_t *iter;
	const nvme_ctrl_disc_t *disc;
	nvme_iter_t ret;

	if (func == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_disc_f function pointer: %p", func));
	}

	if (!nvme_ctrl_discover_init(nvme, &iter)) {
		return (false);
	}

	while ((ret = nvme_ctrl_discover_step(iter, &disc)) ==
	    NVME_ITER_VALID) {
		if (!func(nvme, disc, arg))
			break;
	}

	nvme_ctrl_discover_fini(iter);
	if (ret == NVME_ITER_ERROR) {
		return (false);
	}

	return (nvme_success(nvme));
}
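
/*
 * Example: the callback flavor wraps the iterator above; returning false from
 * the callback stops iteration. A sketch that grabs the first controller's
 * devinfo node:
 *
 *	static bool
 *	first_cb(nvme_t *nvme, const nvme_ctrl_disc_t *disc, void *arg)
 *	{
 *		*(di_node_t *)arg = nvme_ctrl_disc_devi(disc);
 *		return (false);
 *	}
 *
 *	di_node_t devi = DI_NODE_NIL;
 *	if (!nvme_ctrl_discover(nvme, first_cb, &devi)) {
 *		... handle the error set on nvme ...
 *	}
 */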

di_node_t
nvme_ctrl_disc_devi(const nvme_ctrl_disc_t *discp)
{
	return (discp->ncd_devi);
}

di_minor_t
nvme_ctrl_disc_minor(const nvme_ctrl_disc_t *discp)
{
	return (discp->ncd_minor);
}

void
nvme_ctrl_fini(nvme_ctrl_t *ctrl)
{
	if (ctrl == NULL) {
		return;
	}

	if (ctrl->nc_sup_logs != NULL) {
		free(ctrl->nc_sup_logs);
	}

	if (ctrl->nc_sup_logs_err != NULL) {
		free(ctrl->nc_sup_logs_err);
	}

	if (ctrl->nc_devi_path != NULL) {
		di_devfs_path_free(ctrl->nc_devi_path);
	}

	if (ctrl->nc_fd >= 0) {
		(void) close(ctrl->nc_fd);
		ctrl->nc_fd = -1;
	}

	free(ctrl);
}

bool
nvme_ctrl_init(nvme_t *nvme, di_node_t di, nvme_ctrl_t **outp)
{
	const char *drv;
	int32_t inst;
	di_minor_t minor;
	char *path, buf[PATH_MAX];
	nvme_ctrl_t *ctrl;
	nvme_ioctl_ctrl_info_t ctrl_info;

	if (di == DI_NODE_NIL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid di_node_t: %p", di));
	}

	if (outp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", outp));
	}
	*outp = NULL;

	drv = di_driver_name(di);
	inst = di_instance(di);
	if (drv == NULL || inst < 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s has "
		    "no driver attached", di_node_name(di)));
	}

	if (strcmp(drv, "nvme") != 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s isn't "
		    "attached to nvme, found %s", di_node_name(di), drv));
	}

	/*
	 * We have an NVMe node. Find the right minor that corresponds to the
	 * attachment point. Once we find that then we can go ahead and open a
	 * path to that and construct the device.
	 */
	minor = DI_MINOR_NIL;
	while ((minor = di_minor_next(di, minor)) != DI_MINOR_NIL) {
		if (strcmp(di_minor_nodetype(minor), DDI_NT_NVME_NEXUS) == 0) {
			break;
		}
	}

	if (minor == DI_MINOR_NIL) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s has "
		    "no NVMe nexus minor node", di_node_name(di)));
	}

	path = di_devfs_minor_path(minor);
	if (path == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_LIBDEVINFO, e, "failed to "
		    "obtain /devices path for the requested minor: %s",
		    strerror(e)));
	}

	if (snprintf(buf, sizeof (buf), "/devices%s", path) >= sizeof (buf)) {
		di_devfs_path_free(path);
		return (nvme_error(nvme, NVME_ERR_INTERNAL, 0, "failed to "
		    "construct full /devices minor path, would have overflown "
		    "internal buffer"));
	}
	di_devfs_path_free(path);

	ctrl = calloc(1, sizeof (*ctrl));
	if (ctrl == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ctrl_t: %s", strerror(e)));
	}

	ctrl->nc_nvme = nvme;
	ctrl->nc_devi = di;
	ctrl->nc_minor = minor;
	ctrl->nc_inst = inst;
	ctrl->nc_fd = open(buf, O_RDWR | O_CLOEXEC);
	if (ctrl->nc_fd < 0) {
		int e = errno;
		nvme_ctrl_fini(ctrl);
		return (nvme_error(nvme, NVME_ERR_OPEN_DEV, e, "failed to open "
		    "device path %s: %s", buf, strerror(e)));
	}

	ctrl->nc_devi_path = di_devfs_path(di);
	if (ctrl->nc_devi_path == NULL) {
		int e = errno;
		nvme_ctrl_fini(ctrl);
		return (nvme_error(nvme, NVME_ERR_LIBDEVINFO, e, "failed to "
		    "obtain /devices path for the controller: %s",
		    strerror(e)));
	}

	if (!nvme_ioc_ctrl_info(ctrl, &ctrl_info)) {
		nvme_err_data_t err;

		nvme_ctrl_err_save(ctrl, &err);
		nvme_err_set(nvme, &err);
		nvme_ctrl_fini(ctrl);
		return (false);
	}

	ctrl->nc_vers = ctrl_info.nci_vers;
	ctrl->nc_info = ctrl_info.nci_ctrl_id;

	nvme_vendor_map_ctrl(ctrl);

	*outp = ctrl;
	return (nvme_success(nvme));
}

typedef struct {
	bool ncia_found;
	int32_t ncia_inst;
	nvme_ctrl_t *ncia_ctrl;
	nvme_err_data_t ncia_err;
} nvme_ctrl_init_arg_t;

bool
nvme_ctrl_init_by_instance_cb(nvme_t *nvme, const nvme_ctrl_disc_t *disc,
    void *arg)
{
	nvme_ctrl_init_arg_t *init = arg;

	if (di_instance(disc->ncd_devi) != init->ncia_inst) {
		return (true);
	}

	/*
	 * If we fail to open the controller, we need to save the error
	 * information because it will be clobbered: this is a callback
	 * function surrounded by other libnvme callers.
	 */
	init->ncia_found = true;
	if (!nvme_ctrl_init(nvme, disc->ncd_devi, &init->ncia_ctrl)) {
		nvme_err_save(nvme, &init->ncia_err);
	}

	return (false);
}

bool
nvme_ctrl_init_by_instance(nvme_t *nvme, int32_t inst, nvme_ctrl_t **outp)
{
	nvme_ctrl_init_arg_t init;

	if (inst < 0) {
		return (nvme_error(nvme, NVME_ERR_ILLEGAL_INSTANCE, 0,
		    "encountered illegal negative instance number: %d", inst));
	}

	if (outp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", outp));
	}

	init.ncia_found = false;
	init.ncia_inst = inst;
	init.ncia_ctrl = NULL;

	if (!nvme_ctrl_discover(nvme, nvme_ctrl_init_by_instance_cb, &init)) {
		return (false);
	}

	if (!init.ncia_found) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "failed to find NVMe controller nvme%d", inst));
	}

	/*
	 * If we don't have an NVMe controller structure but we did find the
	 * instance, then we must have had an error constructing it, which will
	 * be on our handle. We have to reconstruct the error from saved
	 * information as nvme_ctrl_discover will have clobbered it.
	 */
	if (init.ncia_ctrl == NULL) {
		nvme_err_set(nvme, &init.ncia_err);
		return (false);
	}

	*outp = init.ncia_ctrl;
	return (nvme_success(nvme));
}
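
/*
 * Example: opening a handle to controller instance 3, i.e. "nvme3". This is a
 * minimal sketch; a real consumer would inspect the error on the nvme_t on
 * failure:
 *
 *	nvme_ctrl_t *ctrl;
 *
 *	if (nvme_ctrl_init_by_instance(nvme, 3, &ctrl)) {
 *		... use ctrl ...
 *		nvme_ctrl_fini(ctrl);
 *	}
 */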

bool
nvme_ctrl_devi(nvme_ctrl_t *ctrl, di_node_t *devip)
{
	*devip = ctrl->nc_devi;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ioc_ctrl_info(nvme_ctrl_t *ctrl, nvme_ioctl_ctrl_info_t *info)
{
	(void) memset(info, 0, sizeof (nvme_ioctl_ctrl_info_t));

	if (ioctl(ctrl->nc_fd, NVME_IOC_CTRL_INFO, info) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "controller info"));
	}

	if (info->nci_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &info->nci_common,
		    "controller info"));
	}

	return (true);
}

bool
nvme_ioc_ns_info(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_ioctl_ns_info_t *info)
{
	(void) memset(info, 0, sizeof (nvme_ioctl_ns_info_t));
	info->nni_common.nioc_nsid = nsid;

	if (ioctl(ctrl->nc_fd, NVME_IOC_NS_INFO, info) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace info"));
	}

	if (info->nni_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &info->nni_common,
		    "namespace info"));
	}

	return (true);
}

const char *
nvme_tporttostr(nvme_ctrl_transport_t tport)
{
	switch (tport) {
	case NVME_CTRL_TRANSPORT_PCI:
		return ("PCI");
	case NVME_CTRL_TRANSPORT_TCP:
		return ("TCP");
	case NVME_CTRL_TRANSPORT_RDMA:
		return ("RDMA");
	default:
		return ("unknown transport");
	}
}

static bool
nvme_ns_discover_validate(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level)
{
	switch (level) {
	case NVME_NS_DISC_F_ALL:
	case NVME_NS_DISC_F_ALLOCATED:
	case NVME_NS_DISC_F_ACTIVE:
	case NVME_NS_DISC_F_NOT_IGNORED:
	case NVME_NS_DISC_F_BLKDEV:
		return (true);
	default:
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "invalid "
		    "namespace discovery level specified: 0x%x", level));
	}
}

void
nvme_ns_discover_fini(nvme_ns_iter_t *iter)
{
	free(iter);
}

const char *
nvme_nsleveltostr(nvme_ns_disc_level_t level)
{
	switch (level) {
	case NVME_NS_DISC_F_ALL:
		return ("unallocated");
	case NVME_NS_DISC_F_ALLOCATED:
		return ("allocated");
	case NVME_NS_DISC_F_ACTIVE:
		return ("active");
	case NVME_NS_DISC_F_NOT_IGNORED:
		return ("not ignored");
	case NVME_NS_DISC_F_BLKDEV:
		return ("blkdev");
	default:
		return ("unknown level");
	}
}

nvme_ns_disc_level_t
nvme_ns_state_to_disc_level(nvme_ns_state_t state)
{
	if ((state & NVME_NS_STATE_ALLOCATED) == 0) {
		return (NVME_NS_DISC_F_ALL);
	}

	if ((state & NVME_NS_STATE_ACTIVE) == 0) {
		return (NVME_NS_DISC_F_ALLOCATED);
	}

	if ((state & NVME_NS_STATE_IGNORED) != 0) {
		return (NVME_NS_DISC_F_ACTIVE);
	}

	if ((state & NVME_NS_STATE_ATTACHED) == 0) {
		return (NVME_NS_DISC_F_NOT_IGNORED);
	} else {
		return (NVME_NS_DISC_F_BLKDEV);
	}
}

nvme_iter_t
nvme_ns_discover_step(nvme_ns_iter_t *iter, const nvme_ns_disc_t **discp)
{
	nvme_ctrl_t *ctrl = iter->nni_ctrl;

	if (iter->nni_err) {
		return (NVME_ITER_ERROR);
	}

	if (iter->nni_done) {
		return (NVME_ITER_DONE);
	}

	while (iter->nni_cur_idx <= ctrl->nc_info.id_nn) {
		uint32_t nsid = iter->nni_cur_idx;
		nvme_ioctl_ns_info_t ns_info = { 0 };
		nvme_ns_disc_level_t level;

		if (!nvme_ioc_ns_info(ctrl, nsid, &ns_info)) {
			iter->nni_err = true;
			return (NVME_ITER_ERROR);
		}

		iter->nni_cur_idx++;
		level = nvme_ns_state_to_disc_level(ns_info.nni_state);
		if (iter->nni_level > level) {
			continue;
		}

		(void) memset(&iter->nni_disc, 0, sizeof (nvme_ns_disc_t));
		iter->nni_disc.nnd_nsid = nsid;
		iter->nni_disc.nnd_level = level;

		if (nvme_guid_valid(ctrl, ns_info.nni_id.id_nguid)) {
			iter->nni_disc.nnd_flags |= NVME_NS_DISC_F_NGUID_VALID;
			(void) memcpy(iter->nni_disc.nnd_nguid,
			    ns_info.nni_id.id_nguid,
			    sizeof (ns_info.nni_id.id_nguid));
		}

		if (nvme_eui64_valid(ctrl, ns_info.nni_id.id_eui64)) {
			iter->nni_disc.nnd_flags |= NVME_NS_DISC_F_EUI64_VALID;
			(void) memcpy(iter->nni_disc.nnd_eui64,
			    ns_info.nni_id.id_eui64,
			    sizeof (ns_info.nni_id.id_eui64));
		}

		*discp = &iter->nni_disc;
		return (NVME_ITER_VALID);
	}

	iter->nni_done = true;
	return (NVME_ITER_DONE);
}

bool
nvme_ns_discover_init(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level,
    nvme_ns_iter_t **iterp)
{
	nvme_ns_iter_t *iter;

	if (!nvme_ns_discover_validate(ctrl, level)) {
		return (false);
	}

	if (iterp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_iter_t output pointer: %p",
		    iterp));
	}

	iter = calloc(1, sizeof (nvme_ns_iter_t));
	if (iter == NULL) {
		int e = errno;
		return (nvme_ctrl_error(ctrl, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ns_iter_t: %s",
		    strerror(e)));
	}

	iter->nni_ctrl = ctrl;
	iter->nni_level = level;
	iter->nni_cur_idx = 1;

	*iterp = iter;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ns_discover(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level,
    nvme_ns_disc_f func, void *arg)
{
	nvme_ns_iter_t *iter;
	nvme_iter_t ret;
	const nvme_ns_disc_t *disc;

	if (!nvme_ns_discover_validate(ctrl, level)) {
		return (false);
	}

	if (func == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_disc_f function pointer: %p",
		    func));
	}

	if (!nvme_ns_discover_init(ctrl, level, &iter)) {
		return (false);
	}

	while ((ret = nvme_ns_discover_step(iter, &disc)) == NVME_ITER_VALID) {
		if (!func(ctrl, disc, arg))
			break;
	}

	nvme_ns_discover_fini(iter);
	if (ret == NVME_ITER_ERROR) {
		return (false);
	}

	return (nvme_ctrl_success(ctrl));
}
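
/*
 * Example: counting the namespaces that blkdev has attached. The discovery
 * level filters out anything below NVME_NS_DISC_F_BLKDEV:
 *
 *	static bool
 *	count_cb(nvme_ctrl_t *ctrl, const nvme_ns_disc_t *disc, void *arg)
 *	{
 *		(*(uint32_t *)arg)++;
 *		return (true);
 *	}
 *
 *	uint32_t count = 0;
 *	if (!nvme_ns_discover(ctrl, NVME_NS_DISC_F_BLKDEV, count_cb,
 *	    &count)) {
 *		... handle the error set on ctrl ...
 *	}
 */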

uint32_t
nvme_ns_disc_nsid(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_nsid);
}

nvme_ns_disc_level_t
nvme_ns_disc_level(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_level);
}

nvme_ns_disc_flags_t
nvme_ns_disc_flags(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_flags);
}

const uint8_t *
nvme_ns_disc_eui64(const nvme_ns_disc_t *discp)
{
	if ((discp->nnd_flags & NVME_NS_DISC_F_EUI64_VALID) == 0) {
		return (NULL);
	}

	return (discp->nnd_eui64);
}

const uint8_t *
nvme_ns_disc_nguid(const nvme_ns_disc_t *discp)
{
	if ((discp->nnd_flags & NVME_NS_DISC_F_NGUID_VALID) == 0) {
		return (NULL);
	}

	return (discp->nnd_nguid);
}

void
nvme_ns_fini(nvme_ns_t *ns)
{
	free(ns);
}

bool
nvme_ns_init(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_ns_t **nsp)
{
	nvme_ns_t *ns;

	if (nsp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_t output pointer: %p", nsp));
	}

	if (nsid < NVME_NSID_MIN || nsid > ctrl->nc_info.id_nn) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_NS_RANGE, 0, "requested "
		    "namespace 0x%x is invalid, valid namespaces are [0x%x, "
		    "0x%x]", nsid, NVME_NSID_MIN, ctrl->nc_info.id_nn));
	}

	ns = calloc(1, sizeof (nvme_ns_t));
	if (ns == NULL) {
		int e = errno;
		return (nvme_ctrl_error(ctrl, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ns_t: %s", strerror(e)));
	}

	ns->nn_ctrl = ctrl;
	ns->nn_nsid = nsid;

	*nsp = ns;
	return (nvme_ctrl_success(ctrl));
}

typedef struct {
	nvme_ctrl_t *nnia_ctrl;
	const char *nnia_name;
	bool nnia_found;
	nvme_ns_t *nnia_ns;
	nvme_err_data_t nnia_err;
} nvme_ns_init_arg_t;

static bool
nvme_ns_init_by_name_cb(nvme_ctrl_t *ctrl, const nvme_ns_disc_t *disc,
    void *arg)
{
	nvme_ns_init_arg_t *init = arg;
	char buf[NVME_NGUID_NAMELEN];
	CTASSERT(NVME_NGUID_NAMELEN > NVME_EUI64_NAMELEN);

	if ((disc->nnd_flags & NVME_NS_DISC_F_NGUID_VALID) != 0) {
		(void) nvme_format_nguid(disc->nnd_nguid, buf, sizeof (buf));
		if (strcasecmp(init->nnia_name, buf) == 0)
			goto match;
	}

	if ((disc->nnd_flags & NVME_NS_DISC_F_EUI64_VALID) != 0) {
		(void) nvme_format_eui64(disc->nnd_eui64, buf, sizeof (buf));
		if (strcasecmp(init->nnia_name, buf) == 0)
			goto match;
	}

	(void) snprintf(buf, sizeof (buf), "%u", disc->nnd_nsid);
	if (strcasecmp(init->nnia_name, buf) == 0)
		goto match;

	return (true);

match:
	init->nnia_found = true;
	if (!nvme_ns_init(ctrl, disc->nnd_nsid, &init->nnia_ns)) {
		nvme_ctrl_err_save(ctrl, &init->nnia_err);
	}

	return (false);
}

/*
 * Attempt to find a namespace by 'name'. A name could be the NGUID, EUI64, or
 * just the plain old namespace ID.
 */
bool
nvme_ns_init_by_name(nvme_ctrl_t *ctrl, const char *ns_name, nvme_ns_t **nsp)
{
	nvme_ns_init_arg_t init;

	if (ns_name == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid namespace name: %p", ns_name));
	}

	if (nsp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_t output pointer: %p", nsp));
	}

	init.nnia_ctrl = ctrl;
	init.nnia_name = ns_name;
	init.nnia_found = false;
	init.nnia_ns = NULL;

	if (!nvme_ns_discover(ctrl, NVME_NS_DISC_F_ALL, nvme_ns_init_by_name_cb,
	    &init)) {
		return (false);
	}

	if (!init.nnia_found) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_NS_RANGE, 0, "failed to "
		    "find NVMe namespace %s on nvme%d", ns_name,
		    ctrl->nc_inst));
	}

	if (init.nnia_ns == NULL) {
		nvme_ctrl_err_set(ctrl, &init.nnia_err);
		return (false);
	}

	*nsp = init.nnia_ns;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ctrl_ns_init(nvme_t *nvme, const char *name, nvme_ctrl_t **ctrlp,
    nvme_ns_t **nsp)
{
	const char *slash, *ns_name;
	char *eptr;
	nvme_ctrl_t *ctrl;
	nvme_ns_t *ns;
	unsigned long inst;
	size_t ctrl_namelen;

	if (name == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid name to search for: %p", name));
	}

	/*
	 * We require a controller, but the namespace output pointer is only
	 * required if we end up having a namespace present.
	 */
	if (ctrlp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", ctrlp));
	}

	slash = strchr(name, '/');
	if (slash != NULL) {
		ctrl_namelen = (uintptr_t)slash - (uintptr_t)name;
		ns_name = slash + 1;

		if (nsp == NULL) {
			return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0,
			    "encountered invalid nvme_ns_t output pointer: %p",
			    nsp));
		}

	} else {
		ctrl_namelen = strlen(name);
		ns_name = NULL;
	}

	*ctrlp = NULL;
	if (nsp != NULL) {
		*nsp = NULL;
	}

	if (strncmp(name, "nvme", 4) != 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0, "unable "
		    "to map controller '%.*s' to a known device class, "
		    "expected the controller to start with 'nvme'",
		    (int)ctrl_namelen, name));
	}

	/*
	 * Before we go ahead and try to parse this with strtoul we need to
	 * manually check two things that strtoul will not:
	 *
	 * 1) If the instance portion is empty, strtoul will just return 0.
	 * 2) If there are multiple leading zeros in a row then that's an error.
	 * We don't want to conflate 001 and 1 as the same here. The only valid
	 * case with a leading zero is 'nvme0' itself, which is 5 characters
	 * long, hence the check below.
	 */
	if (ctrl_namelen == 4) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "no controller instance specified in %.*s",
		    (int)ctrl_namelen, name));
	}

	if (name[4] == '0' && ctrl_namelen > 5) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "leading zeros aren't allowed for the instance specified "
		    "in %.*s", (int)ctrl_namelen, name));
	}

	errno = 0;
	inst = strtoul(name + 4, &eptr, 10);
	if (errno != 0 || (*eptr != '\0' && eptr != slash)) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "failed to parse controller instance from %.*s",
		    (int)ctrl_namelen, name));
	}

	if (inst > INT32_MAX) {
		return (nvme_error(nvme, NVME_ERR_ILLEGAL_INSTANCE, 0,
		    "parsed controller instance %lu is outside the valid "
		    "range [0, %d]", inst, INT32_MAX));
	}

	if (!nvme_ctrl_init_by_instance(nvme, (int32_t)inst, &ctrl)) {
		return (false);
	}

	if (ns_name == NULL) {
		*ctrlp = ctrl;
		return (nvme_success(nvme));
	}

	if (!nvme_ns_init_by_name(ctrl, ns_name, &ns)) {
		nvme_err_data_t err;

		nvme_ctrl_err_save(ctrl, &err);
		nvme_err_set(nvme, &err);
		nvme_ctrl_fini(ctrl);
		return (false);
	}

	*ctrlp = ctrl;
	*nsp = ns;

	return (nvme_success(nvme));
}
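
/*
 * Example: "nvme0" initializes just the controller, while "nvme0/1" (or
 * "nvme0/<NGUID>" or "nvme0/<EUI64>") also initializes the namespace. A
 * minimal sketch:
 *
 *	nvme_ctrl_t *ctrl;
 *	nvme_ns_t *ns;
 *
 *	if (nvme_ctrl_ns_init(nvme, "nvme0/1", &ctrl, &ns)) {
 *		... use ctrl and ns ...
 *		nvme_ns_fini(ns);
 *		nvme_ctrl_fini(ctrl);
 *	}
 */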

bool
nvme_ns_bd_attach(nvme_ns_t *ns)
{
	nvme_ctrl_t *ctrl = ns->nn_ctrl;
	nvme_ioctl_common_t com;

	(void) memset(&com, 0, sizeof (com));
	com.nioc_nsid = ns->nn_nsid;

	if (ioctl(ns->nn_ctrl->nc_fd, NVME_IOC_ATTACH, &com) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace attach"));
	}

	if (com.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &com, "namespace attach"));
	}

	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ns_bd_detach(nvme_ns_t *ns)
{
	nvme_ctrl_t *ctrl = ns->nn_ctrl;
	nvme_ioctl_common_t com;

	(void) memset(&com, 0, sizeof (com));
	com.nioc_nsid = ns->nn_nsid;

	if (ioctl(ns->nn_ctrl->nc_fd, NVME_IOC_DETACH, &com) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace detach"));
	}

	if (com.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &com, "namespace detach"));
	}

	return (nvme_ctrl_success(ctrl));
}

/*
 * Check for a lock programming error and upanic() if so.
 */
static void
nvme_lock_check(nvme_ctrl_t *ctrl)
{
	char msg[1024];
	int ret;
	const char *up;
	size_t ulen;
	const char *base = "fatal libnvme locking error detected";

	if (ctrl->nc_err.ne_err != NVME_ERR_LOCK_PROG) {
		return;
	}

	ret = snprintf(msg, sizeof (msg), "%s: %s (controller %p)", base,
	    ctrl->nc_err.ne_errmsg, ctrl);
	if (ret >= sizeof (msg)) {
		ulen = sizeof (msg);
		up = msg;
	} else if (ret <= 0) {
		ulen = strlen(base) + 1;
		up = base;
	} else {
		ulen = (size_t)ret + 1;
		up = msg;
	}

	upanic(up, ulen);
}

static bool
nvme_lock_common(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	nvme_ioctl_lock_t lock;
	const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK;

	if (level != NVME_LOCK_L_READ && level != NVME_LOCK_L_WRITE) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "unknown "
		    "lock level: 0x%x", level));
	}

	if ((flags & ~all_flags) != 0) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "unknown "
		    "lock flags: 0x%x", flags & ~all_flags));
	}

	(void) memset(&lock, 0, sizeof (lock));
	lock.nil_common.nioc_nsid = nsid;
	if (nsid != 0) {
		lock.nil_ent = NVME_LOCK_E_NS;
	} else {
		lock.nil_ent = NVME_LOCK_E_CTRL;
	}
	lock.nil_level = level;
	lock.nil_flags = flags;

	if (ioctl(ctrl->nc_fd, NVME_IOC_LOCK, &lock) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "lock"));
	}

	if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		(void) nvme_ioctl_error(ctrl, &lock.nil_common, "lock");
		nvme_lock_check(ctrl);
		return (false);
	}

	return (nvme_ctrl_success(ctrl));
}

/*
 * You may reasonably be wondering why this returns void and why we basically
 * panic everywhere. The reality is twofold. The first part of this is that we
 * know from experience in libc that error checking mutexes are not the most
 * common and the kernel simplicity of mutex_enter() and mutex_exit() is really
 * a boon. The second piece here is that, the way the ioctl path works here,
 * only programming errors or mischief in the library could cause this to fail
 * at the raw ioctl / errno level. That is, EBADF/EFAULT, etc. are our fault and
 * if you cannot unlock because of that you're not going to get much further.
 */
void
nvme_unlock_common(nvme_ctrl_t *ctrl, uint32_t nsid)
{
	nvme_ioctl_unlock_t unlock;

	(void) memset(&unlock, 0, sizeof (unlock));
	unlock.niu_common.nioc_nsid = nsid;
	if (nsid != 0) {
		unlock.niu_ent = NVME_LOCK_E_NS;
	} else {
		unlock.niu_ent = NVME_LOCK_E_CTRL;
	}

	/*
	 * Because all unlock ioctl errors are promoted to a fatal error below,
	 * we don't bother calling nvme_ioctl_syserror() here.
	 */
	if (ioctl(ctrl->nc_fd, NVME_IOC_UNLOCK, &unlock) != 0) {
		int e = errno;
		(void) nvme_ctrl_error(ctrl, NVME_ERR_LOCK_PROG, e, "internal "
		    "programming error: failed to issue unlock ioctl: %s",
		    strerror(e));
		nvme_lock_check(ctrl);
		return;
	}

	if (unlock.niu_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		(void) nvme_ioctl_error(ctrl, &unlock.niu_common, "unlock");
		/*
		 * Promote any other failure to a new fatal failure. Consumers
		 * expect this to have worked.
		 */
		if (ctrl->nc_err.ne_err != NVME_ERR_LOCK_PROG) {
			nvme_err_data_t err;
			nvme_ctrl_err_save(ctrl, &err);
			(void) nvme_ctrl_error(ctrl, NVME_ERR_LOCK_PROG, 0,
			    "internal programming error: received unexpected "
			    "libnvme error 0x%x: %s", err.ne_err,
			    err.ne_errmsg);
		}
		nvme_lock_check(ctrl);
		return;
	}

	(void) nvme_ctrl_success(ctrl);
}

bool
nvme_ctrl_lock(nvme_ctrl_t *ctrl, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	return (nvme_lock_common(ctrl, 0, level, flags));
}

bool
nvme_ns_lock(nvme_ns_t *ns, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	return (nvme_lock_common(ns->nn_ctrl, ns->nn_nsid, level, flags));
}

void
nvme_ctrl_unlock(nvme_ctrl_t *ctrl)
{
	nvme_unlock_common(ctrl, 0);
}

void
nvme_ns_unlock(nvme_ns_t *ns)
{
	nvme_unlock_common(ns->nn_ctrl, ns->nn_nsid);
}
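
/*
 * Example: taking the controller write lock around a destructive operation. A
 * zero flags value means the caller is willing to block; passing
 * NVME_LOCK_F_DONT_BLOCK instead turns contention into an error:
 *
 *	if (nvme_ctrl_lock(ctrl, NVME_LOCK_L_WRITE, 0)) {
 *		... perform the destructive operation ...
 *		nvme_ctrl_unlock(ctrl);
 *	}
 */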