/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Programmatic interface to NVMe Devices
 *
 * libnvme exists to provide a means of performing non-I/O related operations on
 * an NVMe device. This is intended to allow software, regardless of whether it
 * is part of illumos or not, to operate on NVMe devices and perform most of the
 * administrative and operator tasks that might come up. This library does not
 * provide a stable interface yet. The rest of this block comment goes into the
 * organization of the library and the background on why it looks the way it
 * does.
 *
 * --------------------
 * Library Organization
 * --------------------
 *
 * There are currently two large classes of source files that make up this
 * library:
 *
 * 1. Source code that implements the library's interfaces is found alongside
 * this file in lib/libnvme/common. This code is generally organized based
 * around the portion of the NVMe specification that it implements. So for
 * example, code that implements logic related to features is found in
 * libnvme_feature.c, formatting namespaces in libnvme_format.c, log
 * pages in libnvme_log.c, etc. All files in the library begin with
 * 'libnvme_' as a way to help namespace the file names from the second set
 * of files.
 *
 * 2. Validation logic that is shared between libnvme and the kernel is found
 * in common/nvme/. While the kernel must validate requests regardless, we
 * leverage this shared information as a means for trying to ensure that we
 * have useful errors early. That code is factored in a way to facilitate
 * easier unit testing.
 *
 * Because of the nature of this split, all of the opaque structures that we
 * create and their relationships are maintained in the library (group 1).
 * All of the logic in group 2 is designed to be constant data tables and
 * functions that are fed information about the controller they are operating
 * on in order to answer questions about it.
 *
 * There are several general classes of interfaces and related structures that
 * we have in the library. We break them into the following general categories
 * based on their purpose:
 *
 * DISCOVERY
 *
 * One of the large responsibilities of this library is helping someone discover
 * information about something, whether that be a controller, a namespace, a log
 * page, a feature, a unique command, etc. Information about one of these items
 * is contained in a generally opaque discovery structure, for example, the
 * nvme_log_disc_t.
 *
 * The goal of these structures is to contain all of the metadata for working
 * with the object in question. Continuing with the log page discovery example,
 * it can tell us information about what fields are required, whether or not the
 * log might be supported, whether it operates on a controller, a namespace, or
 * something else, as well as more human-usable things such as names and
 * descriptions.
 *
 * Discovery objects are for both human and programmatic consumption. There
 * are several cases where requests can be created directly from discovery
 * objects. A well designed discovery object can allow a general implementation
 * of a consumer such as nvmeadm to build up a request without having to
 * hardcode everything about what is needed for each request (though most
 * consumers still need to have information about the actual contents, meaning,
 * and semantics of a log or feature).
 *
 * Discovery objects are obtained in two general ways. The first is using one of
 * the iterator/callback based functions to discover a given class of data. The
 * second path is that several of the functions which operate based on the name
 * of something, e.g. nvme_log_req_init_by_name(),
 * nvme_get_feat_req_init_by_name(), etc., will return a discovery object.
 *
 * When a discovery object is returned based on iteration (more below), the
 * memory is owned by the iterator. When it is returned by a request
 * initialization function, then it has its own lifetime and must be freed.
 * We try to make this distinction clear in the API based on whether or not the
 * discovery object is 'const'.
 *
 * All discovery objects should be fully filled out before they are handed back
 * to a caller. It is an explicit design goal that every function that gets data
 * from the discovery structure operates on a const version of the pointer. This
 * is the hint that you cannot perform additional I/O or related operations
 * after handing out the discovery structure. Attempts to loosen this constraint
 * should be considered carefully due to how we communicate ownership.
 *
 * ITERATORS
 *
 * A common pattern in the library is iterating over items. This includes
 * controllers and namespaces, but also discovering what specific
 * logs, commands, features, etc. are actually supported by the device.
 * Iteration always follows the same general pattern, illustrated in the
 * sketch after this list:
 *
 * 1. An iterator is initialized with a call to nvme_<name>_discover_init().
 * This will generally return a structure of the form nvme_<name>_iter_t. This
 * structure contains the memory for the corresponding value that is returned
 * from stepping in (2).
 *
 * 2. To actually pull values out of an iterator, one must call the
 * nvme_<name>_step() function for the iterator. This will return a
 * corresponding nvme_<name>_disc_t structure that is opaque and has a suite of
 * functions that are usable for getting information out of it. This structure
 * is valid only until the next time nvme_<name>_step() is called. The
 * return value of the step function indicates the state of the data: whether
 * an error occurred, the iterator has finished, or we successfully stepped and
 * the data is filled out.
 *
 * If discovery data needs to outlive a given iteration, then it can be
 * duplicated, which gives it a separate lifetime, though that comes with
 * the responsibility that it must then be freed.
 *
 * 3. To finish using iterators, one finally calls the corresponding
 * nvme_<name>_discover_fini(). That will deallocate the iterator structure and
 * finish everything up.
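 *
 * As a concrete illustration, here is a minimal sketch of the pattern using
 * the controller iterator implemented later in this file (error handling
 * elided for brevity):
 *
 *	nvme_ctrl_iter_t *iter;
 *	const nvme_ctrl_disc_t *disc;
 *	nvme_iter_t ret;
 *
 *	if (!nvme_ctrl_discover_init(nvme, &iter)) {
 *		... check the error set on the nvme_t ...
 *	}
 *	while ((ret = nvme_ctrl_discover_step(iter, &disc)) ==
 *	    NVME_ITER_VALID) {
 *		... use disc, valid only until the next step or fini ...
 *	}
 *	nvme_ctrl_discover_fini(iter);
 *	if (ret == NVME_ITER_ERROR) {
 *		... check the error set on the nvme_t ...
 *	}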
 *
 * REQUESTS
 *
 * One of the chief goals of this library is to be able to perform requests.
 * Each request has a structure that can be initialized, filled out, and then
 * executed. A request structure can be reused multiple times with minor
 * adjustments in-between (though changes aren't required). Request structures
 * are either initialized in a blank mode where every value must be filled out
 * or they can be initialized through their discovery object (or the common name
 * of such an object).
 *
 * When a request structure is initialized through a discovery object, it
 * automatically sets several of the fields, knows which ones are still required
 * to be set, and which fields cannot be set. For example, if you create a get
 * log page request from a log discovery object, it will not allow you to change
 * the log page you're requesting; however, in return you don't have to specify
 * the command set interface or log identifier.
 *
 * Request objects are tied to a controller. See 'Parallelism, Thread Safety,
 * and Errors' for more information.
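 *
 * As a sketch of how these pieces fit together for a get log page request
 * (abbreviated here; the library's public header, libnvme.h, is the
 * authoritative reference for the exact interfaces):
 *
 *	nvme_log_disc_t *disc;
 *	nvme_log_req_t *req;
 *	uint8_t buf[4096];
 *
 *	... initializing by name fills in the log ID, CSI, etc. from the
 *	    discovery data, leaving only the output buffer to be set ...
 *	if (!nvme_log_req_init_by_name(ctrl, "health", 0, &disc, &req)) {
 *		... check the error set on the nvme_ctrl_t ...
 *	}
 *	if (!nvme_log_req_set_output(req, buf, sizeof (buf)) ||
 *	    !nvme_log_req_exec(req)) {
 *		... check the error set on the nvme_ctrl_t ...
 *	}
 *	nvme_log_req_fini(req);
 *	nvme_log_disc_free(disc);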
 *
 * INFORMATION SNAPSHOTS
 *
 * To get information about a namespace or controller, one has to take an
 * information snapshot. Once an information snapshot is obtained, this snapshot
 * answers all questions about the controller with a mostly consistent set of
 * point-in-time data. The main reason for this design was to try and simplify
 * where errors can occur, and to provide a straightforward serialization point
 * so that the raw underlying data can be gathered on one system and then
 * interpreted later on another.
 *
 * The only fallible operations on a snapshot are those that ask about things
 * which are not guaranteed to exist for all NVMe controllers.
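 *
 * A rough sketch of taking and using a controller snapshot follows. The
 * accessor names here follow the library's usual naming pattern and are
 * illustrative; libnvme.h is the authoritative reference:
 *
 *	nvme_ctrl_info_t *info;
 *
 *	if (!nvme_ctrl_info_snap(ctrl, &info)) {
 *		... check the error set on the nvme_ctrl_t ...
 *	}
 *	... every read below sees the same point-in-time data, e.g.
 *	    nvme_ctrl_info_model(info) ...
 *	nvme_ctrl_info_free(info);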
 *
 * LIBRARY, CONTROLLER, NAMESPACE, AND SNAPSHOT HANDLES
 *
 * The last major set of types used in this library are opaque handles. As you
 * might have guessed given the request structures, all of the objects which
 * represent something are opaque. Library handles are independent of one
 * another, as are controller handles. In general, it is expected that only a
 * single controller handle is used at a given time for a given library handle,
 * but this is not currently enforced. Error information and parallelism are
 * tied into this; see 'Parallelism, Thread Safety, and Errors' for more
 * information.
 *
 * -----------------
 * Opaque Structures
 * -----------------
 *
 * One of the things that might stand out in libnvme is the use of opaque
 * structures everywhere, with functions to access every arbitrary piece of
 * data. This, and the function pattern around building up a request, were done
 * to try and deal with the evolutionary nature of the NVMe specification. If
 * you look at the various requests, with the exception of firmware download,
 * almost every request has gained additional features through the spec
 * revisions. NVMe 2.0 changed most things again with the requirement to
 * specify the command set interface.
 *
 * While the way that the NVMe specification has done this is quite reasonable,
 * it makes it much more difficult to use a traditional series of arguments to
 * functions or a structure without having to try to version the symbol through
 * clever games. If instead we accept that the specification will change and
 * that the specification is always taking these additional arguments out of
 * values that must be zero, then an opaque request structure where you have to
 * make an explicit function call and recompile to get slightly different
 * behavior is mostly reasonable. We may not be able to be perfect given we're
 * at the mercy of the specification, but at least this is better than the
 * alternative.
 *
 * This is ultimately why all the request structures are opaque and use a
 * pseudo-builder pattern to fill out the request information. Further evidence
 * for this point is that there was no way to avoid changing every kernel
 * structure here while retaining semantic operations. No one wants to manually
 * assemble cdw12-15 here. That's not how the library can add value.
 *
 * Similarly, for all discovery objects we ended up utilizing opaque objects.
 * The main reason here is that we want to be able to embed this library as a
 * committed interface in other languages, and having the discovery structures
 * be something that everyone can see would make them harder to extend. While
 * this concern is somewhat more theoretical given the iterator pattern, given
 * the other bits in the request structure we decided to lean into the
 * opaqueness.
 *
 * --------------------------------------
 * Parallelism, Thread Safety, and Errors
 * --------------------------------------
 *
 * Among the library's major design points are how we achieve thread safety,
 * how ownership works, where errors appear, and what degree of parallelism
 * is achievable. To work through this we look at a few different things:
 *
 * 1. The degree to which the hardware allows for parallelism
 * 2. The degree to which users might desire parallelism
 * 3. The ergonomics of getting and storing errors
 *
 * The NVMe specification allows for different degrees of admin command
 * parallelism on a per-command basis. This is discoverable, but the main point
 * is that there is a class of commands where only one can be outstanding at a
 * time, which likely covers most of the destructive commands like Format NVM,
 * Activate Firmware, etc. Our expectation to some extent is that most admin
 * queue commands don't need to be issued in parallel; however, beyond how we
 * structure the library and error handling, we don't try to enforce that here.
 * The kernel does do some enforcement by requiring mandatory write locks to
 * perform some operations.
 *
 * When we get to how folks might want to use this, during the initial design
 * phase we mostly theorized based on how nvmeadm is using it today and how
 * various daemons like a FRU monitor or an appliance kit's software might want
 * to interact with it. Our general starting assumption is that it's very
 * reasonable for each discovered controller to be handled in parallel, but that
 * operations on a controller itself are likely serial given that we're not
 * issuing I/O through this mechanism. If we were, then that'd be an entirely
 * different set of constraints.
 *
 * To discuss the perceived ergonomics, we need to first discuss what error
 * information we want to be able to have. It's an important goal of both the
 * NVMe driver and this library to give useful semantic errors. In particular,
 * for any operation we want to make sure that we include the following
 * information:
 *
 * o A hopefully distinguishable semantic error
 * o The saved errno as a system error if relevant (e.g. if open(2) failed)
 * o A message for humans that gives more specifics about what happened and is
 *   intended to be passed along to the output of a command or another error
 *   message.
 * o If a controller error occurs, the controller's sc (status code) and sct
 *   (status code type).
 *
 * With this we get to the questions around ergonomics, which are entirely
 * subjective: given that we want to capture that information, how do we best
 * do so with the tooling that we have? When the library was first being
 * prototyped, all errors were on the nvme_t, basically the top-level handle.
 * This meant that each operation on a controller had to be done serially or you
 * would have to use different handles. However, the simplicity was that there
 * was one thing to check.
 *
 * This evolved slightly when we introduced information snapshots.
 * Because the information snapshots are meant to be separate entities whose
 * lifetime can extend beyond the nvme_t library handle, they ended up
 * developing their own error codes and functions. This has been okay because
 * there aren't too many use cases there, though the need to duplicate error
 * handling functions is a bit painful.
 *
 * From there, we did consider what if each request had its own error
 * information that could be extracted. That would turn into a lot of functions
 * to get at that data. The controller's allowed parallelism for admin commands
 * varies based on each command. Some commands must occur when there are no
 * other admin commands on the controller and others when there is nothing
 * outstanding on the namespace. However, due to that nuance, it would force the
 * consumer to understand the controller's specifics more than is often
 * necessary for a given request. To add to that, it'd also just be a pain to
 * try to get all the error information out in a different way, and the
 * consumers we started writing in this fashion were not looking good.
 *
 * We also considered whether we could consolidate all the error functions on
 * each request into one structure that we get, but that didn't move the needle
 * too much. It also raised some more concerns around how we minimize races and
 * how data changes around that.
 *
 * So all of this led us to our current compromise position: we allow for
 * parallelism at the controller level. More specifically:
 *
 * 1. Operations which take the nvme_t handle set errors on it and must operate
 *    serially. That is, the nvme_t should only be used from one thread at any
 *    given time, but it may move between threads.
 *
 * 2. The nvme_ctrl_t has its own error information. A given nvme_ctrl_t should
 *    only be used serially; however, different ones can be used in parallel. A
 *    controller handle doesn't guarantee exclusive access to the controller;
 *    that requires an explicit locking operation.
 *
 * 3. Both request structures and namespaces place their errors on the
 *    corresponding controller that they were created from. Therefore the
 *    per-controller serialization in (2) applies here as well. If two requests
 *    are tied to different controllers, they can proceed in parallel.
 *
 * 4. Once a controller or namespace snapshot is obtained, they fall into a
 *    similar pattern: each one can be operated on in parallel, but generally
 *    one should only operate on a single one serially.
 *
 * Other than the constraints defined above, the library does not care which
 * thread an operation occurs on; operations can move to wherever they need to
 * be. Locking and related enforcement in the kernel is based on the open file
 * descriptor to the controller.
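 *
 * In practice this means that error checking hangs off whichever handle an
 * operation took. A rough sketch (the error accessor here follows the
 * library's public naming pattern; libnvme.h is the authoritative reference):
 *
 *	if (!nvme_ctrl_lock(ctrl, NVME_LOCK_L_WRITE, 0)) {
 *		(void) fprintf(stderr, "failed to lock controller: %s\n",
 *		    nvme_ctrl_errmsg(ctrl));
 *	}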
 *
 * ----------------
 * Field Validation
 * ----------------
 *
 * Every request is made up of fields that correspond to parts of the NVMe
 * specification. Our requests operate in terms of the logical fields that we
 * opt to expose and that the kernel knows how to consume. In general, we don't
 * expose the raw cdw values that make up the commands (except for the vendor
 * unique commands or arguments that are explicitly that way a la get features).
 * While operating on raw cdw arguments would be a simple way to create ABI
 * stability, it would leave everyone having to break up all the fields
 * themselves, and we believe it would end up somewhat more error prone than
 * the interfaces we expose today.
 *
 * Requests are created in one of two ways today: they are either initialized
 * from corresponding discovery data, e.g. nvme_log_req_init_by_disc() and
 * nvme_get_feat_req_init_by_name(), or one creates a raw request a la
 * nvme_get_feat_req_init(). In the former cases, we fill out a bunch of the
 * fields that would normally need to be set, such as the log or feature ID. We
 * also will note which fields are allowed and expected. For example, the health
 * log page does not take or expect an lsp (log specific parameter) or related,
 * and therefore we can flag an attempt to set one with an _UNUSE class error.
 * Conversely, requests that are created from their raw form will not have any
 * such error checking performed until they are finalized and checked by the
 * kernel. The set of fields that can be set in a request is usually tracked in
 * the structure with a member of the form <prefix>_allow.
 *
 * One set of library error checking that is uniform between both types is that
 * of missing fields. There are minimum fields that must be set for different
 * types of requests. That check will always be performed regardless of the path
 * that is taken through the system. Tracking which members must still be set is
 * done by a member of the form <prefix>_need.
 *
 * When we perform validation, we try to push the vast majority of it into the
 * common validation code that is shared between the kernel and userland. This
 * is wrapped up through the nvme_field_check_one() logic. The common code will
 * check if the field is supported by the controller (generating an _UNSUP class
 * error if not) and if the value of the field is within a valid range
 * (generating a _RANGE class error if not).
 *
 * While we try to fold as much of this checking into the common code as
 * possible, it isn't perfect and some things have to be checked outside of
 * it. Those consist of the following general cases:
 *
 * 1) Items that are not semantically fields in the actual command but are
 * things that we are tracking ourselves in the library. An example of this
 * would be fields in the vuc request structure that we are synthesizing
 * ourselves.
 *
 * 2) While the field logic has the specifics of what controller is being
 * operated upon, it doesn't have all the knowledge of what things can be
 * combined or not. It can answer the specifics about its field, but cannot look
 * at the broader request.
 *
 * As a result, there are some duplicated checks in the library and the kernel,
 * though several are left just to the kernel. However, the vast majority of
 * validation does happen through these common routines, which leaves the
 * library's nvme_<type>_req_set_<field> functions as, generally, wrappers
 * around the common checking code that then update our tracking of which
 * fields are set so that we can issue an ioctl.
 */

#include <stdlib.h>
#include <stdarg.h>
#include <libdevinfo.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <upanic.h>

#include "libnvme_impl.h"

bool
nvme_vers_ctrl_atleast(const nvme_ctrl_t *ctrl, const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&ctrl->nc_vers, targ));
}

bool
nvme_vers_ctrl_info_atleast(const nvme_ctrl_info_t *ci,
    const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&ci->nci_vers, targ));
}

bool
nvme_vers_ns_info_atleast(const nvme_ns_info_t *info,
    const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&info->nni_vers, targ));
}

bool
nvme_guid_valid(const nvme_ctrl_t *ctrl, const uint8_t guid[16])
{
	const uint8_t zero_guid[16] = { 0 };

	return (nvme_vers_ctrl_atleast(ctrl, &nvme_vers_1v2) &&
	    memcmp(zero_guid, guid, sizeof (zero_guid)) != 0);
}

bool
nvme_eui64_valid(const nvme_ctrl_t *ctrl, const uint8_t eui64[8])
{
	const uint8_t zero_eui[8] = { 0 };

	return (nvme_vers_ctrl_atleast(ctrl, &nvme_vers_1v1) &&
	    memcmp(zero_eui, eui64, sizeof (zero_eui)) != 0);
}

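/*
 * Format a 16-byte NGUID as 32 uppercase hex characters. Like snprintf(),
 * this returns the number of characters that would have been written, so
 * callers can detect truncation; a buffer of at least NVME_NGUID_NAMELEN
 * bytes (as used by nvme_ns_init_by_name_cb() below) always suffices. The
 * same holds for nvme_format_eui64() and NVME_EUI64_NAMELEN.
 */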
int
nvme_format_nguid(const uint8_t nguid[16], char *buf, size_t len)
{
	return (snprintf(buf, len, "%02X%02X%02X%02X%02X%02X"
	    "%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X",
	    nguid[0], nguid[1], nguid[2], nguid[3], nguid[4], nguid[5],
	    nguid[6], nguid[7], nguid[8], nguid[9], nguid[10], nguid[11],
	    nguid[12], nguid[13], nguid[14], nguid[15]));
}

int
nvme_format_eui64(const uint8_t eui64[8], char *buf, size_t len)
{
	return (snprintf(buf, len, "%02X%02X%02X%02X%02X%02X%02X%02X",
	    eui64[0], eui64[1], eui64[2], eui64[3], eui64[4], eui64[5],
	    eui64[6], eui64[7]));
}

void
nvme_fini(nvme_t *nvme)
{
	if (nvme == NULL)
		return;

	if (nvme->nh_devinfo != DI_NODE_NIL) {
		di_fini(nvme->nh_devinfo);
	}

	free(nvme);
}

nvme_t *
nvme_init(void)
{
	nvme_t *nvme;

	nvme = calloc(1, sizeof (nvme_t));
	if (nvme == NULL) {
		return (NULL);
	}

	nvme->nh_devinfo = di_init("/", DINFOCPYALL);
	if (nvme->nh_devinfo == DI_NODE_NIL) {
		nvme_fini(nvme);
		return (NULL);
	}

	return (nvme);
}

void
nvme_ctrl_discover_fini(nvme_ctrl_iter_t *iter)
{
	free(iter);
}

nvme_iter_t
nvme_ctrl_discover_step(nvme_ctrl_iter_t *iter, const nvme_ctrl_disc_t **discp)
{
	di_minor_t m;

	*discp = NULL;
	if (iter->ni_done) {
		return (NVME_ITER_DONE);
	}

	for (;;) {
		if (iter->ni_cur == NULL) {
			iter->ni_cur = di_drv_first_node("nvme",
			    iter->ni_nvme->nh_devinfo);
		} else {
			iter->ni_cur = di_drv_next_node(iter->ni_cur);
		}

		if (iter->ni_cur == NULL) {
			iter->ni_done = true;
			return (NVME_ITER_DONE);
		}

		for (m = di_minor_next(iter->ni_cur, DI_MINOR_NIL);
		    m != DI_MINOR_NIL; m = di_minor_next(iter->ni_cur, m)) {
			if (strcmp(di_minor_nodetype(m),
			    DDI_NT_NVME_NEXUS) == 0) {
				break;
			}
		}

		if (m == DI_MINOR_NIL) {
			continue;
		}

		iter->ni_disc.ncd_devi = iter->ni_cur;
		iter->ni_disc.ncd_minor = m;
		*discp = &iter->ni_disc;
		return (NVME_ITER_VALID);
	}

	return (NVME_ITER_DONE);
}

bool
nvme_ctrl_discover_init(nvme_t *nvme, nvme_ctrl_iter_t **iterp)
{
	nvme_ctrl_iter_t *iter;

	if (iterp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_iter_t output pointer: %p", iterp));
	}

	iter = calloc(1, sizeof (nvme_ctrl_iter_t));
	if (iter == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ctrl_iter_t: %s",
		    strerror(e)));
	}
	iter->ni_nvme = nvme;
	*iterp = iter;
	return (nvme_success(nvme));
}

bool
nvme_ctrl_discover(nvme_t *nvme, nvme_ctrl_disc_f func, void *arg)
{
	nvme_ctrl_iter_t *iter;
	const nvme_ctrl_disc_t *disc;
	nvme_iter_t ret;

	if (func == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_disc_f function pointer: %p", func));
	}

	if (!nvme_ctrl_discover_init(nvme, &iter)) {
		return (false);
	}

	while ((ret = nvme_ctrl_discover_step(iter, &disc)) ==
	    NVME_ITER_VALID) {
		if (!func(nvme, disc, arg))
			break;
	}

	nvme_ctrl_discover_fini(iter);
	if (ret == NVME_ITER_ERROR) {
		return (false);
	}

	return (nvme_success(nvme));
}

di_node_t
nvme_ctrl_disc_devi(const nvme_ctrl_disc_t *discp)
{
	return (discp->ncd_devi);
}

di_minor_t
nvme_ctrl_disc_minor(const nvme_ctrl_disc_t *discp)
{
	return (discp->ncd_minor);
}

void
nvme_ctrl_fini(nvme_ctrl_t *ctrl)
{
	if (ctrl == NULL) {
		return;
	}

	if (ctrl->nc_devi_path != NULL) {
		di_devfs_path_free(ctrl->nc_devi_path);
	}

	if (ctrl->nc_fd >= 0) {
		(void) close(ctrl->nc_fd);
		ctrl->nc_fd = -1;
	}

	free(ctrl);
}

bool
nvme_ctrl_init(nvme_t *nvme, di_node_t di, nvme_ctrl_t **outp)
{
	const char *drv;
	int32_t inst;
	di_minor_t minor;
	char *path, buf[PATH_MAX];
	nvme_ctrl_t *ctrl;
	nvme_ioctl_ctrl_info_t ctrl_info;

	if (di == DI_NODE_NIL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid di_node_t: %p", di));
	}

	if (outp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", outp));
	}
	*outp = NULL;

	drv = di_driver_name(di);
	inst = di_instance(di);
	if (drv == NULL || inst < 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s has "
		    "no driver attached", di_node_name(di)));
	}

	if (strcmp(drv, "nvme") != 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s isn't "
		    "attached to nvme, found %s", di_node_name(di), drv));
	}

	/*
	 * We have an NVMe node. Find the right minor that corresponds to the
	 * attachment point. Once we find that, then we can go ahead and open a
	 * path to it and construct the device.
	 */
	minor = DI_MINOR_NIL;
	while ((minor = di_minor_next(di, minor)) != DI_MINOR_NIL) {
		if (strcmp(di_minor_nodetype(minor), DDI_NT_NVME_NEXUS) == 0) {
			break;
		}
	}

	if (minor == DI_MINOR_NIL) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s has "
		    "no NVMe nexus minor node", di_node_name(di)));
	}

	path = di_devfs_minor_path(minor);
	if (path == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_LIBDEVINFO, e, "failed to "
		    "obtain /devices path for the requested minor: %s",
		    strerror(e)));
	}

	if (snprintf(buf, sizeof (buf), "/devices%s", path) >= sizeof (buf)) {
		di_devfs_path_free(path);
		return (nvme_error(nvme, NVME_ERR_INTERNAL, 0, "failed to "
		    "construct full /devices minor path, would have overflown "
		    "internal buffer"));
	}
	di_devfs_path_free(path);

	ctrl = calloc(1, sizeof (*ctrl));
	if (ctrl == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ctrl_t: %s", strerror(e)));
	}

	ctrl->nc_nvme = nvme;
	ctrl->nc_devi = di;
	ctrl->nc_minor = minor;
	ctrl->nc_inst = inst;
	ctrl->nc_fd = open(buf, O_RDWR | O_CLOEXEC);
	if (ctrl->nc_fd < 0) {
		int e = errno;
		nvme_ctrl_fini(ctrl);
		return (nvme_error(nvme, NVME_ERR_OPEN_DEV, e, "failed to open "
		    "device path %s: %s", buf, strerror(e)));
	}

	ctrl->nc_devi_path = di_devfs_path(di);
	if (ctrl->nc_devi_path == NULL) {
		int e = errno;
		nvme_ctrl_fini(ctrl);
		return (nvme_error(nvme, NVME_ERR_LIBDEVINFO, e, "failed to "
		    "obtain /devices path for the controller: %s",
		    strerror(e)));
	}

	if (!nvme_ioc_ctrl_info(ctrl, &ctrl_info)) {
		nvme_err_data_t err;

		nvme_ctrl_err_save(ctrl, &err);
		nvme_err_set(nvme, &err);
		nvme_ctrl_fini(ctrl);
		return (false);
	}

	ctrl->nc_vers = ctrl_info.nci_vers;
	ctrl->nc_info = ctrl_info.nci_ctrl_id;

	nvme_vendor_map_ctrl(ctrl);

	*outp = ctrl;
	return (nvme_success(nvme));
}

typedef struct {
	bool ncia_found;
	int32_t ncia_inst;
	nvme_ctrl_t *ncia_ctrl;
	nvme_err_data_t ncia_err;
} nvme_ctrl_init_arg_t;

bool
nvme_ctrl_init_by_instance_cb(nvme_t *nvme, const nvme_ctrl_disc_t *disc,
    void *arg)
{
	nvme_ctrl_init_arg_t *init = arg;

	if (di_instance(disc->ncd_devi) != init->ncia_inst) {
		return (true);
	}

	/*
	 * If we fail to open the controller, we need to save the error
	 * information because it's going to end up being clobbered: this is a
	 * callback function surrounded by other libnvme callers.
	 */
	init->ncia_found = true;
	if (!nvme_ctrl_init(nvme, disc->ncd_devi, &init->ncia_ctrl)) {
		nvme_err_save(nvme, &init->ncia_err);
	}

	return (false);
}

bool
nvme_ctrl_init_by_instance(nvme_t *nvme, int32_t inst, nvme_ctrl_t **outp)
{
	nvme_ctrl_init_arg_t init;

	if (inst < 0) {
		return (nvme_error(nvme, NVME_ERR_ILLEGAL_INSTANCE, 0,
		    "encountered illegal negative instance number: %d", inst));
	}

	if (outp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", outp));
	}

	init.ncia_found = false;
	init.ncia_inst = inst;
	init.ncia_ctrl = NULL;

	if (!nvme_ctrl_discover(nvme, nvme_ctrl_init_by_instance_cb, &init)) {
		return (false);
	}

	if (!init.ncia_found) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "failed to find NVMe controller nvme%d", inst));
	}

	/*
	 * If we don't have an NVMe controller structure but we did find the
	 * instance, then we must have had an error constructing it, which will
	 * be on our handle. We have to reconstruct the error from saved
	 * information as nvme_ctrl_discover will have clobbered it.
	 */
	if (init.ncia_ctrl == NULL) {
		nvme_err_set(nvme, &init.ncia_err);
		return (false);
	}

	*outp = init.ncia_ctrl;
	return (nvme_success(nvme));
}

bool
nvme_ctrl_devi(nvme_ctrl_t *ctrl, di_node_t *devip)
{
	*devip = ctrl->nc_devi;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ioc_ctrl_info(nvme_ctrl_t *ctrl, nvme_ioctl_ctrl_info_t *info)
{
	(void) memset(info, 0, sizeof (nvme_ioctl_ctrl_info_t));

	if (ioctl(ctrl->nc_fd, NVME_IOC_CTRL_INFO, info) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "controller info"));
	}

	if (info->nci_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &info->nci_common,
		    "controller info"));
	}

	return (true);
}

bool
nvme_ioc_ns_info(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_ioctl_ns_info_t *info)
{
	(void) memset(info, 0, sizeof (nvme_ioctl_ns_info_t));
	info->nni_common.nioc_nsid = nsid;

	if (ioctl(ctrl->nc_fd, NVME_IOC_NS_INFO, info) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace info"));
	}

	if (info->nni_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &info->nni_common,
		    "namespace info"));
	}

	return (true);
}

const char *
nvme_tporttostr(nvme_ctrl_transport_t tport)
{
	switch (tport) {
	case NVME_CTRL_TRANSPORT_PCI:
		return ("PCI");
	case NVME_CTRL_TRANSPORT_TCP:
		return ("TCP");
	case NVME_CTRL_TRANSPORT_RDMA:
		return ("RDMA");
	default:
		return ("unknown transport");
	}
}

static bool
nvme_ns_discover_validate(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level)
{
	switch (level) {
	case NVME_NS_DISC_F_ALL:
	case NVME_NS_DISC_F_ALLOCATED:
	case NVME_NS_DISC_F_ACTIVE:
	case NVME_NS_DISC_F_NOT_IGNORED:
	case NVME_NS_DISC_F_BLKDEV:
		return (true);
	default:
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "invalid "
		    "namespace discovery level specified: 0x%x", level));
	}
}

void
nvme_ns_discover_fini(nvme_ns_iter_t *iter)
{
	free(iter);
}

const char *
nvme_nsleveltostr(nvme_ns_disc_level_t level)
{
	switch (level) {
	case NVME_NS_DISC_F_ALL:
		return ("unallocated");
	case NVME_NS_DISC_F_ALLOCATED:
		return ("allocated");
	case NVME_NS_DISC_F_ACTIVE:
		return ("active");
	case NVME_NS_DISC_F_NOT_IGNORED:
		return ("not ignored");
	case NVME_NS_DISC_F_BLKDEV:
		return ("blkdev");
	default:
		return ("unknown level");
	}
}

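/*
 * Map the namespace state flags that the kernel gives us to the most specific
 * discovery level that the namespace satisfies. The levels are ordered such
 * that each one implies all of the prior ones: a blkdev-attached namespace is
 * also not ignored, active, and allocated.
 */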
nvme_ns_disc_level_t
nvme_ns_state_to_disc_level(nvme_ns_state_t state)
{
	if ((state & NVME_NS_STATE_ALLOCATED) == 0) {
		return (NVME_NS_DISC_F_ALL);
	}

	if ((state & NVME_NS_STATE_ACTIVE) == 0) {
		return (NVME_NS_DISC_F_ALLOCATED);
	}

	if ((state & NVME_NS_STATE_IGNORED) != 0) {
		return (NVME_NS_DISC_F_ACTIVE);
	}

	if ((state & NVME_NS_STATE_ATTACHED) == 0) {
		return (NVME_NS_DISC_F_NOT_IGNORED);
	} else {
		return (NVME_NS_DISC_F_BLKDEV);
	}
}

nvme_iter_t
nvme_ns_discover_step(nvme_ns_iter_t *iter, const nvme_ns_disc_t **discp)
{
	nvme_ctrl_t *ctrl = iter->nni_ctrl;

	if (iter->nni_err) {
		return (NVME_ITER_ERROR);
	}

	if (iter->nni_done) {
		return (NVME_ITER_DONE);
	}

	while (iter->nni_cur_idx <= ctrl->nc_info.id_nn) {
		uint32_t nsid = iter->nni_cur_idx;
		nvme_ioctl_ns_info_t ns_info = { 0 };
		nvme_ns_disc_level_t level;

		if (!nvme_ioc_ns_info(ctrl, nsid, &ns_info)) {
			iter->nni_err = true;
			return (NVME_ITER_ERROR);
		}

		iter->nni_cur_idx++;
		level = nvme_ns_state_to_disc_level(ns_info.nni_state);
		if (iter->nni_level > level) {
			continue;
		}

		(void) memset(&iter->nni_disc, 0, sizeof (nvme_ns_disc_t));
		iter->nni_disc.nnd_nsid = nsid;
		iter->nni_disc.nnd_level = level;

		if (nvme_guid_valid(ctrl, ns_info.nni_id.id_nguid)) {
			iter->nni_disc.nnd_flags |= NVME_NS_DISC_F_NGUID_VALID;
			(void) memcpy(iter->nni_disc.nnd_nguid,
			    ns_info.nni_id.id_nguid,
			    sizeof (ns_info.nni_id.id_nguid));
		}

		if (nvme_eui64_valid(ctrl, ns_info.nni_id.id_eui64)) {
			iter->nni_disc.nnd_flags |= NVME_NS_DISC_F_EUI64_VALID;
			(void) memcpy(iter->nni_disc.nnd_eui64,
			    ns_info.nni_id.id_eui64,
			    sizeof (ns_info.nni_id.id_eui64));
		}

		*discp = &iter->nni_disc;
		return (NVME_ITER_VALID);
	}

	iter->nni_done = true;
	return (NVME_ITER_DONE);
}

bool
nvme_ns_discover_init(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level,
    nvme_ns_iter_t **iterp)
{
	nvme_ns_iter_t *iter;

	if (!nvme_ns_discover_validate(ctrl, level)) {
		return (false);
	}

	if (iterp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_iter_t output pointer: %p",
		    iterp));
	}

	iter = calloc(1, sizeof (nvme_ns_iter_t));
	if (iter == NULL) {
		int e = errno;
		return (nvme_ctrl_error(ctrl, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ns_iter_t: %s",
		    strerror(e)));
	}

	iter->nni_ctrl = ctrl;
	iter->nni_level = level;
	iter->nni_cur_idx = 1;

	*iterp = iter;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ns_discover(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level,
    nvme_ns_disc_f func, void *arg)
{
	nvme_ns_iter_t *iter;
	nvme_iter_t ret;
	const nvme_ns_disc_t *disc;

	if (!nvme_ns_discover_validate(ctrl, level)) {
		return (false);
	}

	if (func == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_disc_f function pointer: %p",
		    func));
	}

	if (!nvme_ns_discover_init(ctrl, level, &iter)) {
		return (false);
	}

	while ((ret = nvme_ns_discover_step(iter, &disc)) == NVME_ITER_VALID) {
		if (!func(ctrl, disc, arg))
			break;
	}

	nvme_ns_discover_fini(iter);
	if (ret == NVME_ITER_ERROR) {
		return (false);
	}

	return (nvme_ctrl_success(ctrl));
}

uint32_t
nvme_ns_disc_nsid(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_nsid);
}

nvme_ns_disc_level_t
nvme_ns_disc_level(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_level);
}

nvme_ns_disc_flags_t
nvme_ns_disc_flags(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_flags);
}

const uint8_t *
nvme_ns_disc_eui64(const nvme_ns_disc_t *discp)
{
	if ((discp->nnd_flags & NVME_NS_DISC_F_EUI64_VALID) == 0) {
		return (NULL);
	}

	return (discp->nnd_eui64);
}

const uint8_t *
nvme_ns_disc_nguid(const nvme_ns_disc_t *discp)
{
	if ((discp->nnd_flags & NVME_NS_DISC_F_NGUID_VALID) == 0) {
		return (NULL);
	}

	return (discp->nnd_nguid);
}

void
nvme_ns_fini(nvme_ns_t *ns)
{
	free(ns);
}

bool
nvme_ns_init(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_ns_t **nsp)
{
	nvme_ns_t *ns;

	if (nsp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_t output pointer: %p", nsp));
	}

	if (nsid < NVME_NSID_MIN || nsid > ctrl->nc_info.id_nn) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_NS_RANGE, 0, "requested "
		    "namespace 0x%x is invalid, valid namespaces are [0x%x, "
		    "0x%x]", nsid, NVME_NSID_MIN, ctrl->nc_info.id_nn));
	}

	ns = calloc(1, sizeof (nvme_ns_t));
	if (ns == NULL) {
		int e = errno;
		return (nvme_ctrl_error(ctrl, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ns_t: %s", strerror(e)));
	}

	ns->nn_ctrl = ctrl;
	ns->nn_nsid = nsid;

	*nsp = ns;
	return (nvme_ctrl_success(ctrl));
}

typedef struct {
	nvme_ctrl_t *nnia_ctrl;
	const char *nnia_name;
	bool nnia_found;
	nvme_ns_t *nnia_ns;
	nvme_err_data_t nnia_err;
} nvme_ns_init_arg_t;

static bool
nvme_ns_init_by_name_cb(nvme_ctrl_t *ctrl, const nvme_ns_disc_t *disc,
    void *arg)
{
	nvme_ns_init_arg_t *init = arg;
	char buf[NVME_NGUID_NAMELEN];
	CTASSERT(NVME_NGUID_NAMELEN > NVME_EUI64_NAMELEN);

	if ((disc->nnd_flags & NVME_NS_DISC_F_NGUID_VALID) != 0) {
		(void) nvme_format_nguid(disc->nnd_nguid, buf, sizeof (buf));
		if (strcasecmp(init->nnia_name, buf) == 0)
			goto match;
	}

	if ((disc->nnd_flags & NVME_NS_DISC_F_EUI64_VALID) != 0) {
		(void) nvme_format_eui64(disc->nnd_eui64, buf, sizeof (buf));
		if (strcasecmp(init->nnia_name, buf) == 0)
			goto match;
	}

	(void) snprintf(buf, sizeof (buf), "%u", disc->nnd_nsid);
	if (strcasecmp(init->nnia_name, buf) == 0)
		goto match;

	return (true);

match:
	init->nnia_found = true;
	if (!nvme_ns_init(ctrl, disc->nnd_nsid, &init->nnia_ns)) {
		nvme_ctrl_err_save(ctrl, &init->nnia_err);
	}

	return (false);
}

/*
 * Attempt to find a namespace by 'name'. A name could be the NGUID, EUI64, or
 * just the plain old namespace ID.
 */
bool
nvme_ns_init_by_name(nvme_ctrl_t *ctrl, const char *ns_name, nvme_ns_t **nsp)
{
	nvme_ns_init_arg_t init;

	if (ns_name == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid namespace name: %p", ns_name));
	}

	if (nsp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_t output pointer: %p", nsp));
	}

	init.nnia_ctrl = ctrl;
	init.nnia_name = ns_name;
	init.nnia_found = false;
	init.nnia_ns = NULL;

	if (!nvme_ns_discover(ctrl, NVME_NS_DISC_F_ALL, nvme_ns_init_by_name_cb,
	    &init)) {
		return (false);
	}

	if (!init.nnia_found) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_NS_RANGE, 0, "failed to "
		    "find NVMe namespace %s on nvme%d", ns_name,
		    ctrl->nc_inst));
	}

	if (init.nnia_ns == NULL) {
		nvme_ctrl_err_set(ctrl, &init.nnia_err);
		return (false);
	}

	*nsp = init.nnia_ns;
	return (nvme_ctrl_success(ctrl));
}

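/*
 * Parse a combined name of the form 'nvme<instance>' or
 * 'nvme<instance>/<namespace>' (e.g. "nvme3" or "nvme3/1") and construct the
 * corresponding handles. The namespace portion may be anything that
 * nvme_ns_init_by_name() accepts: an NGUID, an EUI64, or a namespace ID.
 */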
bool
nvme_ctrl_ns_init(nvme_t *nvme, const char *name, nvme_ctrl_t **ctrlp,
    nvme_ns_t **nsp)
{
	const char *slash, *ns_name;
	char *eptr;
	nvme_ctrl_t *ctrl;
	nvme_ns_t *ns;
	unsigned long inst;
	size_t ctrl_namelen;

	if (name == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid name to search for: %p", name));
	}

	/*
	 * We require a controller, but the namespace output pointer is only
	 * required if we end up having a namespace present.
	 */
	if (ctrlp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", ctrlp));
	}

	slash = strchr(name, '/');
	if (slash != NULL) {
		ctrl_namelen = (uintptr_t)slash - (uintptr_t)name;
		ns_name = slash + 1;

		if (nsp == NULL) {
			return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0,
			    "encountered invalid nvme_ns_t output pointer: %p",
			    nsp));
		}
	} else {
		ctrl_namelen = strlen(name);
		ns_name = NULL;
	}

	*ctrlp = NULL;
	if (nsp != NULL) {
		*nsp = NULL;
	}

	if (strncmp(name, "nvme", 4) != 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0, "unable "
		    "to map controller '%.*s' to a known device class, "
		    "expected the controller to start with 'nvme'",
		    (int)ctrl_namelen, name));
	}

	/*
	 * Before we go ahead and try to parse this with strtoul we need to
	 * manually check two things that strtoul will not:
	 *
	 * 1) If the instance portion is empty, strtoul would just return 0.
	 * 2) A leading zero followed by more digits is an error; we don't want
	 * to conflate 'nvme001' and 'nvme1'. The only valid instance name
	 * beginning with '0' is 'nvme0' itself, which is 5 characters long,
	 * hence the check below.
	 */
	if (ctrl_namelen == 4) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "no controller instance specified in %.*s",
		    (int)ctrl_namelen, name));
	}

	if (name[4] == '0' && ctrl_namelen > 5) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "leading zeros aren't allowed for the instance specified "
		    "in %.*s", (int)ctrl_namelen, name));
	}

	errno = 0;
	inst = strtoul(name + 4, &eptr, 10);
	if (errno != 0 || (*eptr != '\0' && eptr != slash)) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "failed to parse controller instance from %.*s",
		    (int)ctrl_namelen, name));
	}

	if (inst > INT32_MAX) {
		return (nvme_error(nvme, NVME_ERR_ILLEGAL_INSTANCE, 0,
		    "parsed controller instance %lu is outside the valid "
		    "range [0, %d]", inst, INT32_MAX));
	}

	if (!nvme_ctrl_init_by_instance(nvme, (int32_t)inst, &ctrl)) {
		return (false);
	}

	if (ns_name == NULL) {
		*ctrlp = ctrl;
		return (nvme_success(nvme));
	}

	if (!nvme_ns_init_by_name(ctrl, ns_name, &ns)) {
		nvme_err_data_t err;

		nvme_ctrl_err_save(ctrl, &err);
		nvme_err_set(nvme, &err);
		nvme_ctrl_fini(ctrl);
		return (false);
	}

	*ctrlp = ctrl;
	*nsp = ns;

	return (nvme_success(nvme));
}

bool
nvme_ns_bd_attach(nvme_ns_t *ns)
{
	nvme_ctrl_t *ctrl = ns->nn_ctrl;
	nvme_ioctl_common_t com;

	(void) memset(&com, 0, sizeof (com));
	com.nioc_nsid = ns->nn_nsid;

	if (ioctl(ns->nn_ctrl->nc_fd, NVME_IOC_ATTACH, &com) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace attach"));
	}

	if (com.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &com, "namespace attach"));
	}

	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ns_bd_detach(nvme_ns_t *ns)
{
	nvme_ctrl_t *ctrl = ns->nn_ctrl;
	nvme_ioctl_common_t com;

	(void) memset(&com, 0, sizeof (com));
	com.nioc_nsid = ns->nn_nsid;

	if (ioctl(ns->nn_ctrl->nc_fd, NVME_IOC_DETACH, &com) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace detach"));
	}

	if (com.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &com, "namespace detach"));
	}

	return (nvme_ctrl_success(ctrl));
}

/*
 * Check for a lock programming error and upanic() if so.
 */
static void
nvme_lock_check(nvme_ctrl_t *ctrl)
{
	char msg[1024];
	int ret;
	const char *up;
	size_t ulen;
	const char *base = "fatal libnvme locking error detected";

	if (ctrl->nc_err.ne_err != NVME_ERR_LOCK_PROG) {
		return;
	}

	ret = snprintf(msg, sizeof (msg), "%s: %s (controller %p)", base,
	    ctrl->nc_err.ne_errmsg, ctrl);
	if (ret <= 0) {
		ulen = strlen(base) + 1;
		up = base;
	} else if ((size_t)ret >= sizeof (msg)) {
		ulen = sizeof (msg);
		up = msg;
	} else {
		ulen = (size_t)ret + 1;
		up = msg;
	}

	upanic(up, ulen);
}

static bool
nvme_lock_common(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	nvme_ioctl_lock_t lock;
	const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK;

	if (level != NVME_LOCK_L_READ && level != NVME_LOCK_L_WRITE) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "unknown "
		    "lock level: 0x%x", level));
	}

	if ((flags & ~all_flags) != 0) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "unknown "
		    "lock flags: 0x%x", flags & ~all_flags));
	}

	(void) memset(&lock, 0, sizeof (lock));
	lock.nil_common.nioc_nsid = nsid;
	if (nsid != 0) {
		lock.nil_ent = NVME_LOCK_E_NS;
	} else {
		lock.nil_ent = NVME_LOCK_E_CTRL;
	}
	lock.nil_level = level;
	lock.nil_flags = flags;

	if (ioctl(ctrl->nc_fd, NVME_IOC_LOCK, &lock) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "lock"));
	}

	if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		(void) nvme_ioctl_error(ctrl, &lock.nil_common, "lock");
		nvme_lock_check(ctrl);
		return (false);
	}

	return (nvme_ctrl_success(ctrl));
}

/*
 * You may reasonably be wondering why this doesn't return anything and why we
 * basically panic everywhere. The reality is twofold. The first part of this is
 * that we know from experience in libc that error checking mutexes are not the
 * most common and the kernel simplicity of mutex_enter() and mutex_exit() is
 * really a boon. The second piece here is that, the way the ioctl path works
 * here, only programming errors or mischief in the library could cause this to
 * fail at the raw ioctl / errno level. That is, EBADF/EFAULT, etc. are our
 * fault, and if you cannot unlock because of that you're not going to get much
 * further.
 */
void
nvme_unlock_common(nvme_ctrl_t *ctrl, uint32_t nsid)
{
	nvme_ioctl_unlock_t unlock;

	(void) memset(&unlock, 0, sizeof (unlock));
	unlock.niu_common.nioc_nsid = nsid;
	if (nsid != 0) {
		unlock.niu_ent = NVME_LOCK_E_NS;
	} else {
		unlock.niu_ent = NVME_LOCK_E_CTRL;
	}

	/*
	 * Because all unlock ioctl errors are promoted to a fatal lock
	 * programming error, we don't bother calling nvme_ioctl_syserror()
	 * here.
	 */
	if (ioctl(ctrl->nc_fd, NVME_IOC_UNLOCK, &unlock) != 0) {
		int e = errno;
		(void) nvme_ctrl_error(ctrl, NVME_ERR_LOCK_PROG, e, "internal "
		    "programming error: failed to issue unlock ioctl: %s",
		    strerror(e));
		nvme_lock_check(ctrl);
		return;
	}

	if (unlock.niu_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		(void) nvme_ioctl_error(ctrl, &unlock.niu_common, "unlock");
		/*
		 * Promote any other failure to a new fatal failure. Consumers
		 * expect this to have worked.
		 */
		if (ctrl->nc_err.ne_err != NVME_ERR_LOCK_PROG) {
			nvme_err_data_t err;
			nvme_ctrl_err_save(ctrl, &err);
			(void) nvme_ctrl_error(ctrl, NVME_ERR_LOCK_PROG, 0,
			    "internal programming error: received unexpected "
			    "libnvme error 0x%x: %s", err.ne_err,
			    err.ne_errmsg);
		}
		nvme_lock_check(ctrl);
		return;
	}

	(void) nvme_ctrl_success(ctrl);
}

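/*
 * A typical usage sketch for the wrappers below: take the controller write
 * lock before a destructive operation, failing fast rather than blocking if
 * another consumer holds it:
 *
 *	if (!nvme_ctrl_lock(ctrl, NVME_LOCK_L_WRITE, NVME_LOCK_F_DONT_BLOCK)) {
 *		... check the error set on the nvme_ctrl_t ...
 *	}
 *	... perform the operation ...
 *	nvme_ctrl_unlock(ctrl);
 */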
bool
nvme_ctrl_lock(nvme_ctrl_t *ctrl, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	return (nvme_lock_common(ctrl, 0, level, flags));
}

bool
nvme_ns_lock(nvme_ns_t *ns, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	return (nvme_lock_common(ns->nn_ctrl, ns->nn_nsid, level, flags));
}

void
nvme_ctrl_unlock(nvme_ctrl_t *ctrl)
{
	nvme_unlock_common(ctrl, 0);
}

void
nvme_ns_unlock(nvme_ns_t *ns)
{
	nvme_unlock_common(ns->nn_ctrl, ns->nn_nsid);
}