1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2012-2016 Intel Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include "opt_nvme.h"
30
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/bus.h>
35 #include <sys/conf.h>
36 #include <sys/ioccom.h>
37 #include <sys/proc.h>
38 #include <sys/smp.h>
39 #include <sys/uio.h>
40 #include <sys/sbuf.h>
41 #include <sys/endian.h>
42 #include <machine/stdarg.h>
43 #include <vm/vm.h>
44
45 #include "nvme_private.h"
46 #include "nvme_linux.h"
47
48 #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */
49
50 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
51 struct nvme_async_event_request *aer);
52
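/*
 * Issue a bus-space barrier over the controller's register BAR to order
 * MMIO accesses (e.g., before the CC write that enables the controller).
 */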
53 static void
54 nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
55 {
56 bus_barrier(ctrlr->resource, 0, rman_get_size(ctrlr->resource), flags);
57 }
58
59 static void
60 nvme_ctrlr_devctl_va(struct nvme_controller *ctrlr, const char *type,
61 const char *msg, va_list ap)
62 {
63 struct sbuf sb;
64 int error;
65
66 if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL)
67 return;
68 sbuf_printf(&sb, "name=\"%s\" ", device_get_nameunit(ctrlr->dev));
69 sbuf_vprintf(&sb, msg, ap);
70 error = sbuf_finish(&sb);
71 if (error == 0)
72 devctl_notify("nvme", "controller", type, sbuf_data(&sb));
73 sbuf_delete(&sb);
74 }
75
76 static void
77 nvme_ctrlr_devctl(struct nvme_controller *ctrlr, const char *type, const char *msg, ...)
78 {
79 va_list ap;
80
81 va_start(ap, msg);
82 nvme_ctrlr_devctl_va(ctrlr, type, msg, ap);
83 va_end(ap);
84 }
85
86 static void
87 nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...)
88 {
89 struct sbuf sb;
90 va_list ap;
91 int error;
92
93 if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL)
94 return;
95 sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev));
96 va_start(ap, msg);
97 sbuf_vprintf(&sb, msg, ap);
98 va_end(ap);
99 error = sbuf_finish(&sb);
100 if (error == 0)
101 printf("%s\n", sbuf_data(&sb));
102 sbuf_delete(&sb);
103 va_start(ap, msg);
104 nvme_ctrlr_devctl_va(ctrlr, type, msg, ap);
105 va_end(ap);
106 }
107
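/*
 * Construct the admin queue pair (qpair 0), sized by the
 * hw.nvme.admin_entries tunable (clamped to the driver's min/max).
 */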
108 static int
109 nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
110 {
111 struct nvme_qpair *qpair;
112 uint32_t num_entries;
113 int error;
114
115 qpair = &ctrlr->adminq;
116 qpair->id = 0;
117 qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
118 qpair->domain = ctrlr->domain;
119
120 num_entries = NVME_ADMIN_ENTRIES;
121 TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
122 /*
123 * If admin_entries was overridden to an invalid value, revert it
124 * to our default value.
125 */
126 if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
127 num_entries > NVME_MAX_ADMIN_ENTRIES) {
128 nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
129 "specified\n", num_entries);
130 num_entries = NVME_ADMIN_ENTRIES;
131 }
132
133 /*
134 * The admin queue's max xfer size is treated differently than the
135 * max I/O xfer size. 16KB is sufficient here - maybe even less?
136 */
137 error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS,
138 ctrlr);
139 return (error);
140 }
141
142 #define QP(ctrlr, c) ((c) * (ctrlr)->num_io_queues / mp_ncpus)
143
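/*
 * Allocate and construct the I/O queue pairs. Queue depth is taken from the
 * hw.nvme.io_entries tunable, capped by CAP.MQES and by what the doorbell
 * stride allows in BAR 0/1; each qpair is then assigned a CPU and domain.
 */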
144 static int
145 nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
146 {
147 struct nvme_qpair *qpair;
148 uint32_t cap_lo;
149 uint16_t mqes;
150 int c, error, i, n;
151 int num_entries, num_trackers, max_entries;
152
153 /*
154 * NVMe spec sets a hard limit of 64K max entries, but devices may
155 * specify a smaller limit, so we need to check the MQES field in the
156 * capabilities register. We also have to cap the number of entries to what
157 * the current doorbell stride allows for in BAR 0/1, otherwise the remaining
158 * entries are inaccessible. MQES should reflect this, so this is just a
159 * fail-safe.
160 */
161 max_entries =
162 (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) /
163 (1 << (ctrlr->dstrd + 1));
164 num_entries = NVME_IO_ENTRIES;
165 TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
166 cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
167 mqes = NVME_CAP_LO_MQES(cap_lo);
168 num_entries = min(num_entries, mqes + 1);
169 num_entries = min(num_entries, max_entries);
170
171 num_trackers = NVME_IO_TRACKERS;
172 TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);
173
174 num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
175 num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
176 /*
177 * No need to have more trackers than entries in the submit queue. Note
178 * also that for a queue size of N, we can only have (N-1) commands
179 * outstanding, hence the "-1" here.
180 */
181 num_trackers = min(num_trackers, (num_entries-1));
182
183 /*
184 * Our best estimate for the maximum number of I/Os that we should
185 * normally have in flight at one time. This should be viewed as a hint,
186 * not a hard limit, and will need to be revisited when the upper layers
187 * of the storage system grow multi-queue support.
188 */
189 ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
190
191 ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
192 M_NVME, M_ZERO | M_WAITOK);
193
194 for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) {
195 qpair = &ctrlr->ioq[i];
196
197 /*
198 * Admin queue has ID=0. IO queues start at ID=1 -
199 * hence the 'i+1' here.
200 */
201 qpair->id = i + 1;
202 if (ctrlr->num_io_queues > 1) {
203 /* Find number of CPUs served by this queue. */
204 for (n = 1; QP(ctrlr, c + n) == i; n++)
205 ;
206 /* Shuffle multiple NVMe devices between CPUs. */
207 qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n;
208 qpair->domain = pcpu_find(qpair->cpu)->pc_domain;
209 } else {
210 qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
211 qpair->domain = ctrlr->domain;
212 }
213
214 /*
215 * For I/O queues, use the controller-wide max_xfer_size
216 * calculated in nvme_attach().
217 */
218 error = nvme_qpair_construct(qpair, num_entries, num_trackers,
219 ctrlr);
220 if (error)
221 return (error);
222
223 /*
224 * Do not bother binding interrupts if we only have one I/O
225 * interrupt thread for this controller.
226 */
227 if (ctrlr->num_io_queues > 1)
228 bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu);
229 }
230
231 return (0);
232 }
233
234 static void
235 nvme_ctrlr_fail(struct nvme_controller *ctrlr, bool admin_also)
236 {
237 int i;
238
239 /*
240 * No need to disable queues before failing them. Failing is a superset
241 * of disabling (pedantically we'd abort the AERs silently with a
242 * different error, but when we fail, that hardly matters).
243 */
244 ctrlr->is_failed = true;
245 if (admin_also) {
246 ctrlr->is_failed_admin = true;
247 nvme_qpair_fail(&ctrlr->adminq);
248 }
249 if (ctrlr->ioq != NULL) {
250 for (i = 0; i < ctrlr->num_io_queues; i++) {
251 nvme_qpair_fail(&ctrlr->ioq[i]);
252 }
253 }
254 nvme_notify_fail_consumers(ctrlr);
255 }
256
257 /*
258 * Wait for RDY to change.
259 *
260 * Starts sleeping for 1us and geometrically increases it the longer we wait,
261 * capped at 1ms.
262 */
263 static int
264 nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
265 {
266 int timeout = ticks + MSEC_2_TICKS(ctrlr->ready_timeout_in_ms);
267 sbintime_t delta_t = SBT_1US;
268 uint32_t csts;
269
270 while (1) {
271 csts = nvme_mmio_read_4(ctrlr, csts);
272 if (csts == NVME_GONE) /* Hot unplug. */
273 return (ENXIO);
274 if (NVMEV(NVME_CSTS_REG_RDY, csts) == desired_val)
275 break;
276 if (timeout - ticks < 0) {
277 nvme_printf(ctrlr, "controller ready did not become %d "
278 "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
279 return (ENXIO);
280 }
281
282 pause_sbt("nvmerdy", delta_t, 0, C_PREL(1));
283 delta_t = min(SBT_1MS, delta_t * 3 / 2);
284 }
285
286 return (0);
287 }
288
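/*
 * Clear CC.EN and wait for CSTS.RDY to become 0, respecting the EN/RDY
 * transition rules described below.
 */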
289 static int
290 nvme_ctrlr_disable(struct nvme_controller *ctrlr)
291 {
292 uint32_t cc;
293 uint32_t csts;
294 uint8_t en, rdy;
295 int err;
296
297 cc = nvme_mmio_read_4(ctrlr, cc);
298 csts = nvme_mmio_read_4(ctrlr, csts);
299
300 en = NVMEV(NVME_CC_REG_EN, cc);
301 rdy = NVMEV(NVME_CSTS_REG_RDY, csts);
302
303 /*
304 * Per section 3.1.5 of the NVMe 1.3 spec, transitioning CC.EN from 0 to 1
305 * when CSTS.RDY is 1, or transitioning CC.EN from 1 to 0 when
306 * CSTS.RDY is 0, "has undefined results." So make sure that CSTS.RDY
307 * isn't the desired value. Short circuit if we're already disabled.
308 */
309 if (en == 0) {
310 /* Wait for RDY == 0 or timeout & fail */
311 if (rdy == 0)
312 return (0);
313 return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
314 }
315 if (rdy == 0) {
316 /* EN == 1, wait for RDY == 1 or timeout & fail */
317 err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
318 if (err != 0)
319 return (err);
320 }
321
322 cc &= ~NVMEM(NVME_CC_REG_EN);
323 nvme_mmio_write_4(ctrlr, cc, cc);
324
325 /*
326 * A few drives have firmware bugs that freeze the drive if we access
327 * the mmio too soon after we disable.
328 */
329 if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
330 pause("nvmeR", MSEC_2_TICKS(B4_CHK_RDY_DELAY_MS));
331 return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
332 }
333
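/*
 * Program the admin queue registers (ASQ/ACQ/AQA), build a fresh CC value,
 * set CC.EN, and wait for CSTS.RDY to become 1.
 */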
334 static int
335 nvme_ctrlr_enable(struct nvme_controller *ctrlr)
336 {
337 uint32_t cc;
338 uint32_t csts;
339 uint32_t aqa;
340 uint32_t qsize;
341 uint8_t en, rdy;
342 int err;
343
344 cc = nvme_mmio_read_4(ctrlr, cc);
345 csts = nvme_mmio_read_4(ctrlr, csts);
346
347 en = NVMEV(NVME_CC_REG_EN, cc);
348 rdy = NVMEV(NVME_CSTS_REG_RDY, csts);
349
350 /*
351 * See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
352 */
353 if (en == 1) {
354 if (rdy == 1)
355 return (0);
356 return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
357 }
358
359 /* EN == 0 already, so wait for RDY == 0 or timeout & fail */
360 err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
361 if (err != 0)
362 return (err);
363
364 nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
365 nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
366
367 /* acqs and asqs are 0-based. */
368 qsize = ctrlr->adminq.num_entries - 1;
369
370 aqa = 0;
371 aqa |= NVMEF(NVME_AQA_REG_ACQS, qsize);
372 aqa |= NVMEF(NVME_AQA_REG_ASQS, qsize);
373 nvme_mmio_write_4(ctrlr, aqa, aqa);
374
375 /* Initialization values for CC */
376 cc = 0;
377 cc |= NVMEF(NVME_CC_REG_EN, 1);
378 cc |= NVMEF(NVME_CC_REG_CSS, 0);
379 cc |= NVMEF(NVME_CC_REG_AMS, 0);
380 cc |= NVMEF(NVME_CC_REG_SHN, 0);
381 cc |= NVMEF(NVME_CC_REG_IOSQES, 6); /* SQ entry size == 64 == 2^6 */
382 cc |= NVMEF(NVME_CC_REG_IOCQES, 4); /* CQ entry size == 16 == 2^4 */
383
384 /*
385 * Use the Memory Page Size selected during device initialization. Note
386 * that the value stored in mps is suitable for use here without adjusting by
387 * NVME_MPS_SHIFT.
388 */
389 cc |= NVMEF(NVME_CC_REG_MPS, ctrlr->mps);
390
391 nvme_ctrlr_barrier(ctrlr, BUS_SPACE_BARRIER_WRITE);
392 nvme_mmio_write_4(ctrlr, cc, cc);
393
394 return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
395 }
396
397 static void
398 nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
399 {
400 int i;
401
402 nvme_admin_qpair_disable(&ctrlr->adminq);
403 /*
404 * I/O queues are not allocated before the initial HW
405 * reset, so do not try to disable them. Use is_initialized
406 * to determine if this is the initial HW reset.
407 */
408 if (ctrlr->is_initialized) {
409 for (i = 0; i < ctrlr->num_io_queues; i++)
410 nvme_io_qpair_disable(&ctrlr->ioq[i]);
411 }
412 }
413
414 static int
415 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
416 {
417 int err;
418
419 TSENTER();
420
421 ctrlr->is_failed_admin = true;
422 nvme_ctrlr_disable_qpairs(ctrlr);
423
424 err = nvme_ctrlr_disable(ctrlr);
425 if (err != 0)
426 goto out;
427
428 err = nvme_ctrlr_enable(ctrlr);
429 out:
430 if (err == 0)
431 ctrlr->is_failed_admin = false;
432
433 TSEXIT();
434 return (err);
435 }
436
437 void
438 nvme_ctrlr_reset(struct nvme_controller *ctrlr)
439 {
440 int cmpset;
441
442 cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
443
444 if (cmpset == 0)
445 /*
446 * Controller is already resetting. Return immediately since
447 * there is no need to kick off another reset.
448 */
449 return;
450
451 if (!ctrlr->is_dying)
452 taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
453 }
454
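/*
 * Issue IDENTIFY CONTROLLER synchronously, byte-swap the result to host
 * endian, and clamp max_xfer_size to what MDTS reports.
 */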
455 static int
456 nvme_ctrlr_identify(struct nvme_controller *ctrlr)
457 {
458 struct nvme_completion_poll_status status;
459
460 status.done = 0;
461 nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
462 nvme_completion_poll_cb, &status);
463 nvme_completion_poll(&status);
464 if (nvme_completion_is_error(&status.cpl)) {
465 nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
466 return (ENXIO);
467 }
468
469 /* Convert data to host endian */
470 nvme_controller_data_swapbytes(&ctrlr->cdata);
471
472 /*
473 * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
474 * controller supports.
475 */
476 if (ctrlr->cdata.mdts > 0)
477 ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
478 1 << (ctrlr->cdata.mdts + NVME_MPS_SHIFT +
479 NVME_CAP_HI_MPSMIN(ctrlr->cap_hi)));
480
481 return (0);
482 }
483
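/*
 * Request the desired number of I/O queues via SET FEATURES (Number of
 * Queues) and trim num_io_queues down to what the controller granted.
 */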
484 static int
485 nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
486 {
487 struct nvme_completion_poll_status status;
488 int cq_allocated, sq_allocated;
489
490 status.done = 0;
491 nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
492 nvme_completion_poll_cb, &status);
493 nvme_completion_poll(&status);
494 if (nvme_completion_is_error(&status.cpl)) {
495 nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
496 return (ENXIO);
497 }
498
499 /*
500 * Data in cdw0 is 0-based.
501 * Lower 16-bits indicate number of submission queues allocated.
502 * Upper 16-bits indicate number of completion queues allocated.
503 */
504 sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
505 cq_allocated = (status.cpl.cdw0 >> 16) + 1;
506
507 /*
508 * Controller may allocate more queues than we requested,
509 * so use the minimum of the number requested and what was
510 * actually allocated.
511 */
512 ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
513 ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
514 if (ctrlr->num_io_queues > vm_ndomains)
515 ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains;
516
517 return (0);
518 }
519
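/*
 * Create the I/O queues on the controller, completion queue first since the
 * submission queue refers to it.
 */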
520 static int
521 nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
522 {
523 struct nvme_completion_poll_status status;
524 struct nvme_qpair *qpair;
525 int i;
526
527 for (i = 0; i < ctrlr->num_io_queues; i++) {
528 qpair = &ctrlr->ioq[i];
529
530 status.done = 0;
531 nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair,
532 nvme_completion_poll_cb, &status);
533 nvme_completion_poll(&status);
534 if (nvme_completion_is_error(&status.cpl)) {
535 nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
536 return (ENXIO);
537 }
538
539 status.done = 0;
540 nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair,
541 nvme_completion_poll_cb, &status);
542 nvme_completion_poll(&status);
543 if (nvme_completion_is_error(&status.cpl)) {
544 nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
545 return (ENXIO);
546 }
547 }
548
549 return (0);
550 }
551
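/*
 * Delete the I/O queues on the controller, submission queue first since it
 * must be gone before its completion queue can be deleted.
 */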
552 static int
553 nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr)
554 {
555 struct nvme_completion_poll_status status;
556 struct nvme_qpair *qpair;
557
558 for (int i = 0; i < ctrlr->num_io_queues; i++) {
559 qpair = &ctrlr->ioq[i];
560
561 status.done = 0;
562 nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
563 nvme_completion_poll_cb, &status);
564 nvme_completion_poll(&status);
565 if (nvme_completion_is_error(&status.cpl)) {
566 nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
567 return (ENXIO);
568 }
569
570 status.done = 0;
571 nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
572 nvme_completion_poll_cb, &status);
573 nvme_completion_poll(&status);
574 if (nvme_completion_is_error(&status.cpl)) {
575 nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
576 return (ENXIO);
577 }
578 }
579
580 return (0);
581 }
582
583 static int
584 nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
585 {
586 struct nvme_namespace *ns;
587 uint32_t i;
588
589 for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) {
590 ns = &ctrlr->ns[i];
591 nvme_ns_construct(ns, i+1, ctrlr);
592 }
593
594 return (0);
595 }
596
597 static bool
598 is_log_page_id_valid(uint8_t page_id)
599 {
600
601 switch (page_id) {
602 case NVME_LOG_ERROR:
603 case NVME_LOG_HEALTH_INFORMATION:
604 case NVME_LOG_FIRMWARE_SLOT:
605 case NVME_LOG_CHANGED_NAMESPACE:
606 case NVME_LOG_COMMAND_EFFECT:
607 case NVME_LOG_RES_NOTIFICATION:
608 case NVME_LOG_SANITIZE_STATUS:
609 return (true);
610 }
611
612 return (false);
613 }
614
615 static uint32_t
616 nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
617 {
618 uint32_t log_page_size;
619
620 switch (page_id) {
621 case NVME_LOG_ERROR:
622 log_page_size = min(
623 sizeof(struct nvme_error_information_entry) *
624 (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE);
625 break;
626 case NVME_LOG_HEALTH_INFORMATION:
627 log_page_size = sizeof(struct nvme_health_information_page);
628 break;
629 case NVME_LOG_FIRMWARE_SLOT:
630 log_page_size = sizeof(struct nvme_firmware_page);
631 break;
632 case NVME_LOG_CHANGED_NAMESPACE:
633 log_page_size = sizeof(struct nvme_ns_list);
634 break;
635 case NVME_LOG_COMMAND_EFFECT:
636 log_page_size = sizeof(struct nvme_command_effects_page);
637 break;
638 case NVME_LOG_RES_NOTIFICATION:
639 log_page_size = sizeof(struct nvme_res_notification_page);
640 break;
641 case NVME_LOG_SANITIZE_STATUS:
642 log_page_size = sizeof(struct nvme_sanitize_status_page);
643 break;
644 default:
645 log_page_size = 0;
646 break;
647 }
648
649 return (log_page_size);
650 }
651
652 static void
653 nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
654 uint8_t state)
655 {
656
657 if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)
658 nvme_printf(ctrlr, "SMART WARNING: available spare space below threshold\n");
659
660 if (state & NVME_CRIT_WARN_ST_TEMPERATURE)
661 nvme_printf(ctrlr, "SMART WARNING: temperature above threshold\n");
662
663 if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY)
664 nvme_printf(ctrlr, "SMART WARNING: device reliability degraded\n");
665
666 if (state & NVME_CRIT_WARN_ST_READ_ONLY)
667 nvme_printf(ctrlr, "SMART WARNING: media placed in read only mode\n");
668
669 if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP)
670 nvme_printf(ctrlr, "SMART WARNING: volatile memory backup device failed\n");
671
672 if (state & NVME_CRIT_WARN_ST_PERSISTENT_MEMORY_REGION)
673 nvme_printf(ctrlr, "SMART WARNING: persistent memory read only or unreliable\n");
674
675 if (state & NVME_CRIT_WARN_ST_RESERVED_MASK)
676 nvme_printf(ctrlr, "SMART WARNING: unknown critical warning(s): state = 0x%02x\n",
677 state & NVME_CRIT_WARN_ST_RESERVED_MASK);
678
679 nvme_ctrlr_devctl(ctrlr, "critical", "SMART_ERROR", "state=0x%02x", state);
680 }
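/*
 * Completion handler for the log page fetched in response to an async event:
 * byte-swap the page, notify consumers, and post a replacement AER.
 */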
681
682 static void
683 nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
684 {
685 struct nvme_async_event_request *aer = arg;
686 struct nvme_health_information_page *health_info;
687 struct nvme_ns_list *nsl;
688 struct nvme_error_information_entry *err;
689 int i;
690
691 /*
692 * If the log page fetch for some reason completed with an error,
693 * don't pass log page data to the consumers. In practice, this case
694 * should never happen.
695 */
696 if (nvme_completion_is_error(cpl))
697 nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
698 aer->log_page_id, NULL, 0);
699 else {
700 /* Convert data to host endian */
701 switch (aer->log_page_id) {
702 case NVME_LOG_ERROR:
703 err = (struct nvme_error_information_entry *)aer->log_page_buffer;
704 for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
705 nvme_error_information_entry_swapbytes(err++);
706 break;
707 case NVME_LOG_HEALTH_INFORMATION:
708 nvme_health_information_page_swapbytes(
709 (struct nvme_health_information_page *)aer->log_page_buffer);
710 break;
711 case NVME_LOG_CHANGED_NAMESPACE:
712 nvme_ns_list_swapbytes(
713 (struct nvme_ns_list *)aer->log_page_buffer);
714 break;
715 case NVME_LOG_COMMAND_EFFECT:
716 nvme_command_effects_page_swapbytes(
717 (struct nvme_command_effects_page *)aer->log_page_buffer);
718 break;
719 case NVME_LOG_RES_NOTIFICATION:
720 nvme_res_notification_page_swapbytes(
721 (struct nvme_res_notification_page *)aer->log_page_buffer);
722 break;
723 case NVME_LOG_SANITIZE_STATUS:
724 nvme_sanitize_status_page_swapbytes(
725 (struct nvme_sanitize_status_page *)aer->log_page_buffer);
726 break;
727 default:
728 break;
729 }
730
731 if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
732 health_info = (struct nvme_health_information_page *)
733 aer->log_page_buffer;
734 nvme_ctrlr_log_critical_warnings(aer->ctrlr,
735 health_info->critical_warning);
736 /*
737 * Critical warnings reported through the
738 * SMART/health log page are persistent, so
739 * clear the associated bits in the async event
740 * config so that we do not receive repeated
741 * notifications for the same event.
742 */
743 aer->ctrlr->async_event_config &=
744 ~health_info->critical_warning;
745 nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
746 aer->ctrlr->async_event_config, NULL, NULL);
747 } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
748 !nvme_use_nvd) {
749 nsl = (struct nvme_ns_list *)aer->log_page_buffer;
750 for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
751 if (nsl->ns[i] > NVME_MAX_NAMESPACES)
752 break;
753 nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
754 }
755 }
756
757 /*
758 * Pass the cpl data from the original async event completion,
759 * not the log page fetch.
760 */
761 nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
762 aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
763 }
764
765 /*
766 * Repost another asynchronous event request to replace the one
767 * that just completed.
768 */
769 nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
770 }
771
772 static void
773 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
774 {
775 struct nvme_async_event_request *aer = arg;
776
777 if (nvme_completion_is_error(cpl)) {
778 /*
779 * Do not retry failed async event requests. This avoids
780 * infinite loops where a new async event request is submitted
781 * to replace the one just failed, only to fail again and
782 * perpetuate the loop.
783 */
784 return;
785 }
786
787 /* Associated log page is in bits 23:16 of completion entry dw0. */
788 aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0);
789
790 nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
791 " page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0),
792 NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0),
793 aer->log_page_id);
794
795 if (is_log_page_id_valid(aer->log_page_id)) {
796 aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
797 aer->log_page_id);
798 memcpy(&aer->cpl, cpl, sizeof(*cpl));
799 nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
800 NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
801 aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
802 aer);
803 /* Wait to notify consumers until after log page is fetched. */
804 } else {
805 nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
806 NULL, 0);
807
808 /*
809 * Repost another asynchronous event request to replace the one
810 * that just completed.
811 */
812 nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
813 }
814 }
815
816 static void
817 nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
818 struct nvme_async_event_request *aer)
819 {
820 struct nvme_request *req;
821
822 aer->ctrlr = ctrlr;
823 /*
824 * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
825 * callback context. AER completions should be handled on a dedicated
826 * thread.
827 */
828 req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
829 aer);
830 aer->req = req;
831
832 /*
833 * Disable timeout here, since asynchronous event requests should by
834 * nature never be timed out.
835 */
836 req->timeout = false;
837 req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
838 nvme_ctrlr_submit_admin_request(ctrlr, req);
839 }
840
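/*
 * Configure which asynchronous events the controller should report and post
 * the initial batch of async event requests.
 */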
841 static void
842 nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
843 {
844 struct nvme_completion_poll_status status;
845 struct nvme_async_event_request *aer;
846 uint32_t i;
847
848 ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
849 NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
850 NVME_CRIT_WARN_ST_READ_ONLY |
851 NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
852 if (ctrlr->cdata.ver >= NVME_REV(1, 2))
853 ctrlr->async_event_config |=
854 ctrlr->cdata.oaes & (NVME_ASYNC_EVENT_NS_ATTRIBUTE |
855 NVME_ASYNC_EVENT_FW_ACTIVATE);
856
857 status.done = 0;
858 nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
859 0, NULL, 0, nvme_completion_poll_cb, &status);
860 nvme_completion_poll(&status);
861 if (nvme_completion_is_error(&status.cpl) ||
862 (status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
863 (status.cpl.cdw0 & 0xFFFF) == 0x0000) {
864 nvme_printf(ctrlr, "temperature threshold not supported\n");
865 } else
866 ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE;
867
868 nvme_ctrlr_cmd_set_async_event_config(ctrlr,
869 ctrlr->async_event_config, NULL, NULL);
870
871 /* aerl is a zero-based value, so we need to add 1 here. */
872 ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
873
874 for (i = 0; i < ctrlr->num_aers; i++) {
875 aer = &ctrlr->aer[i];
876 nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
877 }
878 }
879
880 static void
881 nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
882 {
883
884 ctrlr->int_coal_time = 0;
885 TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
886 &ctrlr->int_coal_time);
887
888 ctrlr->int_coal_threshold = 0;
889 TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
890 &ctrlr->int_coal_threshold);
891
892 nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
893 ctrlr->int_coal_threshold, NULL, NULL);
894 }
895
896 static void
897 nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr)
898 {
899 struct nvme_hmb_chunk *hmbc;
900 int i;
901
902 if (ctrlr->hmb_desc_paddr) {
903 bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map);
904 bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
905 ctrlr->hmb_desc_map);
906 ctrlr->hmb_desc_paddr = 0;
907 }
908 if (ctrlr->hmb_desc_tag) {
909 bus_dma_tag_destroy(ctrlr->hmb_desc_tag);
910 ctrlr->hmb_desc_tag = NULL;
911 }
912 for (i = 0; i < ctrlr->hmb_nchunks; i++) {
913 hmbc = &ctrlr->hmb_chunks[i];
914 bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map);
915 bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
916 hmbc->hmbc_map);
917 }
918 ctrlr->hmb_nchunks = 0;
919 if (ctrlr->hmb_tag) {
920 bus_dma_tag_destroy(ctrlr->hmb_tag);
921 ctrlr->hmb_tag = NULL;
922 }
923 if (ctrlr->hmb_chunks) {
924 free(ctrlr->hmb_chunks, M_NVME);
925 ctrlr->hmb_chunks = NULL;
926 }
927 }
928
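/*
 * Allocate the Host Memory Buffer out of DMA-able chunks. Start from the
 * controller's preferred size (HMPRE) and halve the chunk size on failure,
 * giving up entirely if the minimum (HMMIN) cannot be satisfied.
 */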
929 static void
930 nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr)
931 {
932 struct nvme_hmb_chunk *hmbc;
933 size_t pref, min, minc, size;
934 int err, i;
935 uint64_t max;
936
937 /* Limit HMB to 5% of RAM size per device by default. */
938 max = (uint64_t)physmem * PAGE_SIZE / 20;
939 TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max);
940
941 /*
942 * The Host Memory Buffer sizes in the Identify data are always expressed
943 * in 4KB units.
944 */
945 min = (long long unsigned)ctrlr->cdata.hmmin * NVME_HMB_UNITS;
946 if (max == 0 || max < min)
947 return;
948 pref = MIN((long long unsigned)ctrlr->cdata.hmpre * NVME_HMB_UNITS, max);
949 minc = MAX(ctrlr->cdata.hmminds * NVME_HMB_UNITS, ctrlr->page_size);
950 if (min > 0 && ctrlr->cdata.hmmaxd > 0)
951 minc = MAX(minc, min / ctrlr->cdata.hmmaxd);
952 ctrlr->hmb_chunk = pref;
953
954 again:
955 /*
956 * However, the chunk sizes, number of chunks, and alignment of chunks
957 * are all based on the current MPS (ctrlr->page_size).
958 */
959 ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, ctrlr->page_size);
960 ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk);
961 if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd)
962 ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd;
963 ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) *
964 ctrlr->hmb_nchunks, M_NVME, M_WAITOK);
965 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
966 ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
967 ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag);
968 if (err != 0) {
969 nvme_printf(ctrlr, "HMB tag create failed %d\n", err);
970 nvme_ctrlr_hmb_free(ctrlr);
971 return;
972 }
973
974 for (i = 0; i < ctrlr->hmb_nchunks; i++) {
975 hmbc = &ctrlr->hmb_chunks[i];
976 if (bus_dmamem_alloc(ctrlr->hmb_tag,
977 (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT,
978 &hmbc->hmbc_map)) {
979 nvme_printf(ctrlr, "failed to alloc HMB\n");
980 break;
981 }
982 if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map,
983 hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map,
984 &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) {
985 bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
986 hmbc->hmbc_map);
987 nvme_printf(ctrlr, "failed to load HMB\n");
988 break;
989 }
990 bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map,
991 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
992 }
993
994 if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min &&
995 ctrlr->hmb_chunk / 2 >= minc) {
996 ctrlr->hmb_nchunks = i;
997 nvme_ctrlr_hmb_free(ctrlr);
998 ctrlr->hmb_chunk /= 2;
999 goto again;
1000 }
1001 ctrlr->hmb_nchunks = i;
1002 if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) {
1003 nvme_ctrlr_hmb_free(ctrlr);
1004 return;
1005 }
1006
1007 size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks;
1008 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
1009 16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
1010 size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag);
1011 if (err != 0) {
1012 nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err);
1013 nvme_ctrlr_hmb_free(ctrlr);
1014 return;
1015 }
1016 if (bus_dmamem_alloc(ctrlr->hmb_desc_tag,
1017 (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK,
1018 &ctrlr->hmb_desc_map)) {
1019 nvme_printf(ctrlr, "failed to alloc HMB desc\n");
1020 nvme_ctrlr_hmb_free(ctrlr);
1021 return;
1022 }
1023 if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
1024 ctrlr->hmb_desc_vaddr, size, nvme_single_map,
1025 &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) {
1026 bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
1027 ctrlr->hmb_desc_map);
1028 nvme_printf(ctrlr, "failed to load HMB desc\n");
1029 nvme_ctrlr_hmb_free(ctrlr);
1030 return;
1031 }
1032
1033 for (i = 0; i < ctrlr->hmb_nchunks; i++) {
1034 memset(&ctrlr->hmb_desc_vaddr[i], 0,
1035 sizeof(struct nvme_hmb_desc));
1036 ctrlr->hmb_desc_vaddr[i].addr =
1037 htole64(ctrlr->hmb_chunks[i].hmbc_paddr);
1038 ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / ctrlr->page_size);
1039 }
1040 bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
1041 BUS_DMASYNC_PREWRITE);
1042
1043 nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n",
1044 (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk
1045 / 1024 / 1024);
1046 }
1047
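/*
 * Pass the HMB descriptor list to the controller via SET FEATURES (Host
 * Memory Buffer); 'enable' turns the buffer on and 'memret' indicates the
 * previously supplied memory (with its contents) is being returned.
 */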
1048 static void
1049 nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret)
1050 {
1051 struct nvme_completion_poll_status status;
1052 uint32_t cdw11;
1053
1054 cdw11 = 0;
1055 if (enable)
1056 cdw11 |= 1;
1057 if (memret)
1058 cdw11 |= 2;
1059 status.done = 0;
1060 nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11,
1061 ctrlr->hmb_nchunks * ctrlr->hmb_chunk / ctrlr->page_size,
1062 ctrlr->hmb_desc_paddr, ctrlr->hmb_desc_paddr >> 32,
1063 ctrlr->hmb_nchunks, NULL, 0,
1064 nvme_completion_poll_cb, &status);
1065 nvme_completion_poll(&status);
1066 if (nvme_completion_is_error(&status.cpl))
1067 nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n");
1068 }
1069
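/*
 * Bring the controller to a fully operational state, either after the initial
 * hardware reset or after a recovery reset: re-create the I/O queues and
 * namespaces, re-arm AERs, and configure interrupt coalescing.
 */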
1070 static void
1071 nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
1072 {
1073 struct nvme_controller *ctrlr = ctrlr_arg;
1074 uint32_t old_num_io_queues;
1075 int i;
1076
1077 TSENTER();
1078
1079 /*
1080 * Only reset adminq here when we are restarting the
1081 * controller after a reset. During initialization,
1082 * we have already submitted admin commands to get
1083 * the number of I/O queues supported, so cannot reset
1084 * the adminq again here.
1085 */
1086 if (resetting) {
1087 nvme_qpair_reset(&ctrlr->adminq);
1088 nvme_admin_qpair_enable(&ctrlr->adminq);
1089 }
1090
1091 if (ctrlr->ioq != NULL) {
1092 for (i = 0; i < ctrlr->num_io_queues; i++)
1093 nvme_qpair_reset(&ctrlr->ioq[i]);
1094 }
1095
1096 /*
1097 * If it was a reset on initialization command timeout, just
1098 * return here, letting initialization code fail gracefully.
1099 */
1100 if (resetting && !ctrlr->is_initialized)
1101 return;
1102
1103 if (resetting && nvme_ctrlr_identify(ctrlr) != 0) {
1104 nvme_ctrlr_fail(ctrlr, false);
1105 return;
1106 }
1107
1108 /*
1109 * The number of qpairs is determined during controller initialization,
1110 * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
1111 * HW limit. We call SET_FEATURES again here so that it gets called
1112 * after any reset for controllers that depend on the driver to
1113 * explicitly specify how many queues it will use. This value should
1114 * never change between resets, so panic if somehow that does happen.
1115 */
1116 if (resetting) {
1117 old_num_io_queues = ctrlr->num_io_queues;
1118 if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
1119 nvme_ctrlr_fail(ctrlr, false);
1120 return;
1121 }
1122
1123 if (old_num_io_queues != ctrlr->num_io_queues) {
1124 panic("num_io_queues changed from %u to %u",
1125 old_num_io_queues, ctrlr->num_io_queues);
1126 }
1127 }
1128
1129 if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) {
1130 nvme_ctrlr_hmb_alloc(ctrlr);
1131 if (ctrlr->hmb_nchunks > 0)
1132 nvme_ctrlr_hmb_enable(ctrlr, true, false);
1133 } else if (ctrlr->hmb_nchunks > 0)
1134 nvme_ctrlr_hmb_enable(ctrlr, true, true);
1135
1136 if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
1137 nvme_ctrlr_fail(ctrlr, false);
1138 return;
1139 }
1140
1141 if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
1142 nvme_ctrlr_fail(ctrlr, false);
1143 return;
1144 }
1145
1146 nvme_ctrlr_configure_aer(ctrlr);
1147 nvme_ctrlr_configure_int_coalescing(ctrlr);
1148
1149 for (i = 0; i < ctrlr->num_io_queues; i++)
1150 nvme_io_qpair_enable(&ctrlr->ioq[i]);
1151 TSEXIT();
1152 }
1153
1154 void
1155 nvme_ctrlr_start_config_hook(void *arg)
1156 {
1157 struct nvme_controller *ctrlr = arg;
1158
1159 TSENTER();
1160
1161 if (nvme_ctrlr_hw_reset(ctrlr) != 0 || ctrlr->fail_on_reset != 0) {
1162 nvme_ctrlr_fail(ctrlr, true);
1163 config_intrhook_disestablish(&ctrlr->config_hook);
1164 return;
1165 }
1166
1167 nvme_qpair_reset(&ctrlr->adminq);
1168 nvme_admin_qpair_enable(&ctrlr->adminq);
1169
1170 if (nvme_ctrlr_identify(ctrlr) == 0 &&
1171 nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
1172 nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
1173 nvme_ctrlr_start(ctrlr, false);
1174 else
1175 nvme_ctrlr_fail(ctrlr, false);
1176
1177 nvme_sysctl_initialize_ctrlr(ctrlr);
1178 config_intrhook_disestablish(&ctrlr->config_hook);
1179
1180 if (!ctrlr->is_failed) {
1181 ctrlr->is_initialized = true;
1182 nvme_notify_new_controller(ctrlr);
1183 }
1184 TSEXIT();
1185 }
1186
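/*
 * Taskqueue context that performs the actual controller reset requested by
 * nvme_ctrlr_reset(), then either restarts or fails the controller.
 */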
1187 static void
1188 nvme_ctrlr_reset_task(void *arg, int pending)
1189 {
1190 struct nvme_controller *ctrlr = arg;
1191 int status;
1192
1193 nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"start\"");
1194 status = nvme_ctrlr_hw_reset(ctrlr);
1195 if (status == 0) {
1196 nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"success\"");
1197 nvme_ctrlr_start(ctrlr, true);
1198 } else {
1199 nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"timed_out\"");
1200 nvme_ctrlr_fail(ctrlr, true);
1201 }
1202
1203 atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
1204 }
1205
1206 /*
1207 * Poll all the queues enabled on the device for completion.
1208 */
1209 void
1210 nvme_ctrlr_poll(struct nvme_controller *ctrlr)
1211 {
1212 int i;
1213
1214 nvme_qpair_process_completions(&ctrlr->adminq);
1215
1216 for (i = 0; i < ctrlr->num_io_queues; i++)
1217 if (ctrlr->ioq && ctrlr->ioq[i].cpl)
1218 nvme_qpair_process_completions(&ctrlr->ioq[i]);
1219 }
1220
1221 /*
1222 * Poll the single-vector interrupt case: num_io_queues will be 1 and
1223 * there's only a single vector. While we're polling, we mask further
1224 * interrupts in the controller.
1225 */
1226 void
1227 nvme_ctrlr_shared_handler(void *arg)
1228 {
1229 struct nvme_controller *ctrlr = arg;
1230
1231 nvme_mmio_write_4(ctrlr, intms, 1);
1232 nvme_ctrlr_poll(ctrlr);
1233 nvme_mmio_write_4(ctrlr, intmc, 1);
1234 }
1235
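/*
 * Completion handler for NVME_PASSTHROUGH_CMD: copy the completion status
 * back to the caller's nvme_pt_command and wake the sleeping ioctl thread.
 */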
1236 static void
1237 nvme_pt_done(void *arg, const struct nvme_completion *cpl)
1238 {
1239 struct nvme_pt_command *pt = arg;
1240 struct mtx *mtx = pt->driver_lock;
1241 uint16_t status;
1242
1243 bzero(&pt->cpl, sizeof(pt->cpl));
1244 pt->cpl.cdw0 = cpl->cdw0;
1245
1246 status = cpl->status;
1247 status &= ~NVMEM(NVME_STATUS_P);
1248 pt->cpl.status = status;
1249
1250 mtx_lock(mtx);
1251 pt->driver_lock = NULL;
1252 wakeup(pt);
1253 mtx_unlock(mtx);
1254 }
1255
1256 int
1257 nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
1258 struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
1259 int is_admin_cmd)
1260 {
1261 struct nvme_request *req;
1262 struct mtx *mtx;
1263 struct buf *buf = NULL;
1264 int ret = 0;
1265
1266 if (pt->len > 0) {
1267 if (pt->len > ctrlr->max_xfer_size) {
1268 nvme_printf(ctrlr, "pt->len (%d) "
1269 "exceeds max_xfer_size (%d)\n", pt->len,
1270 ctrlr->max_xfer_size);
1271 return EIO;
1272 }
1273 if (is_user_buffer) {
1274 buf = uma_zalloc(pbuf_zone, M_WAITOK);
1275 buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
1276 if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
1277 ret = EFAULT;
1278 goto err;
1279 }
1280 req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
1281 M_WAITOK, nvme_pt_done, pt);
1282 } else
1283 req = nvme_allocate_request_vaddr(pt->buf, pt->len,
1284 M_WAITOK, nvme_pt_done, pt);
1285 } else
1286 req = nvme_allocate_request_null(M_WAITOK, nvme_pt_done, pt);
1287
1288 /* Assume user space already converted to little-endian */
1289 req->cmd.opc = pt->cmd.opc;
1290 req->cmd.fuse = pt->cmd.fuse;
1291 req->cmd.rsvd2 = pt->cmd.rsvd2;
1292 req->cmd.rsvd3 = pt->cmd.rsvd3;
1293 req->cmd.cdw10 = pt->cmd.cdw10;
1294 req->cmd.cdw11 = pt->cmd.cdw11;
1295 req->cmd.cdw12 = pt->cmd.cdw12;
1296 req->cmd.cdw13 = pt->cmd.cdw13;
1297 req->cmd.cdw14 = pt->cmd.cdw14;
1298 req->cmd.cdw15 = pt->cmd.cdw15;
1299
1300 req->cmd.nsid = htole32(nsid);
1301
1302 mtx = mtx_pool_find(mtxpool_sleep, pt);
1303 pt->driver_lock = mtx;
1304
1305 if (is_admin_cmd)
1306 nvme_ctrlr_submit_admin_request(ctrlr, req);
1307 else
1308 nvme_ctrlr_submit_io_request(ctrlr, req);
1309
1310 mtx_lock(mtx);
1311 while (pt->driver_lock != NULL)
1312 mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
1313 mtx_unlock(mtx);
1314
1315 if (buf != NULL) {
1316 vunmapbuf(buf);
1317 err:
1318 uma_zfree(pbuf_zone, buf);
1319 }
1320
1321 return (ret);
1322 }
1323
1324 static void
1325 nvme_npc_done(void *arg, const struct nvme_completion *cpl)
1326 {
1327 struct nvme_passthru_cmd *npc = arg;
1328 struct mtx *mtx = (void *)(uintptr_t)npc->metadata;
1329
1330 npc->result = cpl->cdw0; /* cpl in host order by now */
1331 mtx_lock(mtx);
1332 npc->metadata = 0;
1333 wakeup(npc);
1334 mtx_unlock(mtx);
1335 }
1336
1337 /* XXX refactor? */
1338
1339 int
1340 nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
1341 struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin)
1342 {
1343 struct nvme_request *req;
1344 struct mtx *mtx;
1345 struct buf *buf = NULL;
1346 int ret = 0;
1347
1348 /*
1349 * We don't support metadata.
1350 */
1351 if (npc->metadata != 0 || npc->metadata_len != 0)
1352 return (EIO);
1353
1354 if (npc->data_len > 0 && npc->addr != 0) {
1355 if (npc->data_len > ctrlr->max_xfer_size) {
1356 nvme_printf(ctrlr,
1357 "npc->data_len (%d) exceeds max_xfer_size (%d)\n",
1358 npc->data_len, ctrlr->max_xfer_size);
1359 return (EIO);
1360 }
1361 /* We only support data out or data in commands, but not both at once. */
1362 if ((npc->opcode & 0x3) == 0 || (npc->opcode & 0x3) == 3)
1363 return (EINVAL);
1364 if (is_user) {
1365 buf = uma_zalloc(pbuf_zone, M_WAITOK);
1366 buf->b_iocmd = npc->opcode & 1 ? BIO_WRITE : BIO_READ;
1367 if (vmapbuf(buf, (void *)(uintptr_t)npc->addr,
1368 npc->data_len, 1) < 0) {
1369 ret = EFAULT;
1370 goto err;
1371 }
1372 req = nvme_allocate_request_vaddr(buf->b_data,
1373 npc->data_len, M_WAITOK, nvme_npc_done, npc);
1374 } else
1375 req = nvme_allocate_request_vaddr(
1376 (void *)(uintptr_t)npc->addr, npc->data_len,
1377 M_WAITOK, nvme_npc_done, npc);
1378 } else
1379 req = nvme_allocate_request_null(M_WAITOK, nvme_npc_done, npc);
1380
1381 req->cmd.opc = npc->opcode;
1382 req->cmd.fuse = npc->flags;
1383 req->cmd.rsvd2 = htole16(npc->cdw2);
1384 req->cmd.rsvd3 = htole16(npc->cdw3);
1385 req->cmd.cdw10 = htole32(npc->cdw10);
1386 req->cmd.cdw11 = htole32(npc->cdw11);
1387 req->cmd.cdw12 = htole32(npc->cdw12);
1388 req->cmd.cdw13 = htole32(npc->cdw13);
1389 req->cmd.cdw14 = htole32(npc->cdw14);
1390 req->cmd.cdw15 = htole32(npc->cdw15);
1391
1392 req->cmd.nsid = htole32(nsid);
1393
1394 mtx = mtx_pool_find(mtxpool_sleep, npc);
1395 npc->metadata = (uintptr_t) mtx;
1396
1397 /* XXX no timeout passed down */
1398 if (is_admin)
1399 nvme_ctrlr_submit_admin_request(ctrlr, req);
1400 else
1401 nvme_ctrlr_submit_io_request(ctrlr, req);
1402
1403 mtx_lock(mtx);
1404 while (npc->metadata != 0)
1405 mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0);
1406 mtx_unlock(mtx);
1407
1408 if (buf != NULL) {
1409 vunmapbuf(buf);
1410 err:
1411 uma_zfree(pbuf_zone, buf);
1412 }
1413
1414 return (ret);
1415 }
1416
1417 static int
1418 nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
1419 struct thread *td)
1420 {
1421 struct nvme_controller *ctrlr;
1422 struct nvme_pt_command *pt;
1423
1424 ctrlr = cdev->si_drv1;
1425
1426 switch (cmd) {
1427 case NVME_IOCTL_RESET: /* Linux compat */
1428 case NVME_RESET_CONTROLLER:
1429 nvme_ctrlr_reset(ctrlr);
1430 break;
1431 case NVME_PASSTHROUGH_CMD:
1432 pt = (struct nvme_pt_command *)arg;
1433 return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid),
1434 1 /* is_user_buffer */, 1 /* is_admin_cmd */));
1435 case NVME_GET_NSID:
1436 {
1437 struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
1438 strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
1439 sizeof(gnsid->cdev));
1440 gnsid->nsid = 0;
1441 break;
1442 }
1443 case NVME_GET_MAX_XFER_SIZE:
1444 *(uint64_t *)arg = ctrlr->max_xfer_size;
1445 break;
1446 /* Linux Compatible (see nvme_linux.h) */
1447 case NVME_IOCTL_ID:
1448 td->td_retval[0] = 0xfffffffful;
1449 return (0);
1450
1451 case NVME_IOCTL_ADMIN_CMD:
1452 case NVME_IOCTL_IO_CMD: {
1453 struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
1454
1455 return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, true,
1456 cmd == NVME_IOCTL_ADMIN_CMD));
1457 }
1458
1459 default:
1460 return (ENOTTY);
1461 }
1462
1463 return (0);
1464 }
1465
1466 static struct cdevsw nvme_ctrlr_cdevsw = {
1467 .d_version = D_VERSION,
1468 .d_flags = 0,
1469 .d_ioctl = nvme_ctrlr_ioctl
1470 };
1471
1472 int
1473 nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
1474 {
1475 struct make_dev_args md_args;
1476 uint32_t cap_lo;
1477 uint32_t cap_hi;
1478 uint32_t to, vs, pmrcap;
1479 int status, timeout_period;
1480
1481 ctrlr->dev = dev;
1482
1483 mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
1484 if (bus_get_domain(dev, &ctrlr->domain) != 0)
1485 ctrlr->domain = 0;
1486
1487 ctrlr->cap_lo = cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
1488 if (bootverbose) {
1489 device_printf(dev, "CapLo: 0x%08x: MQES %u%s%s%s%s, TO %u\n",
1490 cap_lo, NVME_CAP_LO_MQES(cap_lo),
1491 NVME_CAP_LO_CQR(cap_lo) ? ", CQR" : "",
1492 NVME_CAP_LO_AMS(cap_lo) ? ", AMS" : "",
1493 (NVME_CAP_LO_AMS(cap_lo) & 0x1) ? " WRRwUPC" : "",
1494 (NVME_CAP_LO_AMS(cap_lo) & 0x2) ? " VS" : "",
1495 NVME_CAP_LO_TO(cap_lo));
1496 }
1497 ctrlr->cap_hi = cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
1498 if (bootverbose) {
1499 device_printf(dev, "CapHi: 0x%08x: DSTRD %u%s, CSS %x%s, "
1500 "CPS %x, MPSMIN %u, MPSMAX %u%s%s%s%s%s\n", cap_hi,
1501 NVME_CAP_HI_DSTRD(cap_hi),
1502 NVME_CAP_HI_NSSRS(cap_hi) ? ", NSSRS" : "",
1503 NVME_CAP_HI_CSS(cap_hi),
1504 NVME_CAP_HI_BPS(cap_hi) ? ", BPS" : "",
1505 NVME_CAP_HI_CPS(cap_hi),
1506 NVME_CAP_HI_MPSMIN(cap_hi),
1507 NVME_CAP_HI_MPSMAX(cap_hi),
1508 NVME_CAP_HI_PMRS(cap_hi) ? ", PMRS" : "",
1509 NVME_CAP_HI_CMBS(cap_hi) ? ", CMBS" : "",
1510 NVME_CAP_HI_NSSS(cap_hi) ? ", NSSS" : "",
1511 NVME_CAP_HI_CRWMS(cap_hi) ? ", CRWMS" : "",
1512 NVME_CAP_HI_CRIMS(cap_hi) ? ", CRIMS" : "");
1513 }
1514 if (bootverbose) {
1515 vs = nvme_mmio_read_4(ctrlr, vs);
1516 device_printf(dev, "Version: 0x%08x: %d.%d\n", vs,
1517 NVME_MAJOR(vs), NVME_MINOR(vs));
1518 }
1519 if (bootverbose && NVME_CAP_HI_PMRS(cap_hi)) {
1520 pmrcap = nvme_mmio_read_4(ctrlr, pmrcap);
1521 device_printf(dev, "PMRCap: 0x%08x: BIR %u%s%s, PMRTU %u, "
1522 "PMRWBM %x, PMRTO %u%s\n", pmrcap,
1523 NVME_PMRCAP_BIR(pmrcap),
1524 NVME_PMRCAP_RDS(pmrcap) ? ", RDS" : "",
1525 NVME_PMRCAP_WDS(pmrcap) ? ", WDS" : "",
1526 NVME_PMRCAP_PMRTU(pmrcap),
1527 NVME_PMRCAP_PMRWBM(pmrcap),
1528 NVME_PMRCAP_PMRTO(pmrcap),
1529 NVME_PMRCAP_CMSS(pmrcap) ? ", CMSS" : "");
1530 }
1531
1532 ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2;
1533
1534 ctrlr->mps = NVME_CAP_HI_MPSMIN(cap_hi);
1535 ctrlr->page_size = 1 << (NVME_MPS_SHIFT + ctrlr->mps);
1536
1537 /* Get ready timeout value from controller, in units of 500ms. */
1538 to = NVME_CAP_LO_TO(cap_lo) + 1;
1539 ctrlr->ready_timeout_in_ms = to * 500;
1540
1541 timeout_period = NVME_ADMIN_TIMEOUT_PERIOD;
1542 TUNABLE_INT_FETCH("hw.nvme.admin_timeout_period", &timeout_period);
1543 timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
1544 timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
1545 ctrlr->admin_timeout_period = timeout_period;
1546
1547 timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
1548 TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
1549 timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
1550 timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
1551 ctrlr->timeout_period = timeout_period;
1552
1553 nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
1554 TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);
1555
1556 ctrlr->enable_aborts = 0;
1557 TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
1558
1559 ctrlr->alignment_splits = counter_u64_alloc(M_WAITOK);
1560
1561 /* Cap transfers by the maximum addressable by page-sized PRP (4KB pages -> 2MB). */
1562 ctrlr->max_xfer_size = MIN(maxphys, (ctrlr->page_size / 8 * ctrlr->page_size));
1563 if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
1564 return (ENXIO);
1565
1566 /*
1567 * Create 2 threads for the taskqueue. The reset thread will block when
1568 * it detects that the controller has failed until all I/O has been
1569 * failed up the stack. The fail_req task needs to be able to run in
1570 * that case to finish the request failures.
1571 *
1572 * We could partially solve this race by draining the failed request
1573 * queue before proceeding to free the sim, though nothing would stop
1574 * new I/O from coming in after we do that drain, but before we reach
1575 * cam_sim_free, so this big hammer is used instead.
1576 */
1577 ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
1578 taskqueue_thread_enqueue, &ctrlr->taskqueue);
1579 taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq");
1580
1581 ctrlr->is_resetting = 0;
1582 ctrlr->is_initialized = false;
1583 ctrlr->notification_sent = 0;
1584 TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
1585 STAILQ_INIT(&ctrlr->fail_req);
1586 ctrlr->is_failed = false;
1587
1588 make_dev_args_init(&md_args);
1589 md_args.mda_devsw = &nvme_ctrlr_cdevsw;
1590 md_args.mda_uid = UID_ROOT;
1591 md_args.mda_gid = GID_WHEEL;
1592 md_args.mda_mode = 0600;
1593 md_args.mda_unit = device_get_unit(dev);
1594 md_args.mda_si_drv1 = (void *)ctrlr;
1595 status = make_dev_s(&md_args, &ctrlr->cdev, "%s",
1596 device_get_nameunit(dev));
1597 if (status != 0)
1598 return (ENXIO);
1599
1600 return (0);
1601 }
1602
1603 /*
1604 * Called on detach, or on error on attach. The nvme_controller won't be used
1605 * again once we return, so we have to tear everything down (so nothing
1606 * references this, no callbacks, etc.), but don't need to reset all the state
1607 * since nvme_controller will be freed soon.
1608 */
1609 void
1610 nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
1611 {
1612 int gone, i;
1613
1614 ctrlr->is_dying = true;
1615
1616 if (ctrlr->resource == NULL)
1617 goto nores;
1618 if (!mtx_initialized(&ctrlr->adminq.lock))
1619 goto noadminq;
1620
1621 /*
1622 * Check whether it is a hot unplug or a clean driver detach.
1623 * If device is not there any more, skip any shutdown commands.
1624 */
1625 gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
1626 if (gone)
1627 nvme_ctrlr_fail(ctrlr, true);
1628 else
1629 nvme_notify_fail_consumers(ctrlr);
1630
1631 for (i = 0; i < NVME_MAX_NAMESPACES; i++)
1632 nvme_ns_destruct(&ctrlr->ns[i]);
1633
1634 if (ctrlr->cdev)
1635 destroy_dev(ctrlr->cdev);
1636
1637 if (ctrlr->is_initialized) {
1638 if (!gone) {
1639 if (ctrlr->hmb_nchunks > 0)
1640 nvme_ctrlr_hmb_enable(ctrlr, false, false);
1641 nvme_ctrlr_delete_qpairs(ctrlr);
1642 }
1643 nvme_ctrlr_hmb_free(ctrlr);
1644 }
1645 if (ctrlr->ioq != NULL) {
1646 for (i = 0; i < ctrlr->num_io_queues; i++)
1647 nvme_io_qpair_destroy(&ctrlr->ioq[i]);
1648 free(ctrlr->ioq, M_NVME);
1649 }
1650 nvme_admin_qpair_destroy(&ctrlr->adminq);
1651
1652 /*
1653 * Notify the controller of a shutdown, even though this is due to
1654 * a driver unload, not a system shutdown (this path is not invoked
1655 * during shutdown). This ensures the controller receives a
1656 * shutdown notification in case the system is shutdown before
1657 * reloading the driver.
1658 */
1659 if (!gone)
1660 nvme_ctrlr_shutdown(ctrlr);
1661
1662 if (!gone)
1663 nvme_ctrlr_disable(ctrlr);
1664
1665 noadminq:
1666 if (ctrlr->taskqueue)
1667 taskqueue_free(ctrlr->taskqueue);
1668
1669 if (ctrlr->tag)
1670 bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
1671
1672 if (ctrlr->res)
1673 bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
1674 rman_get_rid(ctrlr->res), ctrlr->res);
1675
1676 if (ctrlr->bar4_resource != NULL) {
1677 bus_release_resource(dev, SYS_RES_MEMORY,
1678 ctrlr->bar4_resource_id, ctrlr->bar4_resource);
1679 }
1680
1681 bus_release_resource(dev, SYS_RES_MEMORY,
1682 ctrlr->resource_id, ctrlr->resource);
1683
1684 nores:
1685 if (ctrlr->alignment_splits)
1686 counter_u64_free(ctrlr->alignment_splits);
1687
1688 mtx_destroy(&ctrlr->lock);
1689 }
1690
1691 void
1692 nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
1693 {
1694 uint32_t cc;
1695 uint32_t csts;
1696 int timeout;
1697
1698 cc = nvme_mmio_read_4(ctrlr, cc);
1699 cc &= ~NVMEM(NVME_CC_REG_SHN);
1700 cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
1701 nvme_mmio_write_4(ctrlr, cc, cc);
1702
1703 timeout = ticks + (ctrlr->cdata.rtd3e == 0 ? 5 * hz :
1704 ((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000);
1705 while (1) {
1706 csts = nvme_mmio_read_4(ctrlr, csts);
1707 if (csts == NVME_GONE) /* Hot unplug. */
1708 break;
1709 if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE)
1710 break;
1711 if (timeout - ticks < 0) {
1712 nvme_printf(ctrlr, "shutdown timeout\n");
1713 break;
1714 }
1715 pause("nvmeshut", 1);
1716 }
1717 }
1718
1719 void
1720 nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
1721 struct nvme_request *req)
1722 {
1723
1724 nvme_qpair_submit_request(&ctrlr->adminq, req);
1725 }
1726
1727 void
1728 nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
1729 struct nvme_request *req)
1730 {
1731 struct nvme_qpair *qpair;
1732
1733 qpair = &ctrlr->ioq[QP(ctrlr, curcpu)];
1734 nvme_qpair_submit_request(qpair, req);
1735 }
1736
1737 device_t
1738 nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
1739 {
1740
1741 return (ctrlr->dev);
1742 }
1743
1744 const struct nvme_controller_data *
1745 nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
1746 {
1747
1748 return (&ctrlr->cdata);
1749 }
1750
1751 int
1752 nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
1753 {
1754 int to = hz;
1755
1756 /*
1757 * Can't touch failed controllers, so it's already suspended. User will
1758 * need to do an explicit reset to bring it back, if that's even
1759 * possible.
1760 */
1761 if (ctrlr->is_failed)
1762 return (0);
1763
1764 /*
1765 * We don't want the reset taskqueue running, since it does similar
1766 * things, so prevent it from running after we start. Wait for any reset
1767 * that may have been started to complete. The reset process we follow
1768 * will ensure that any new I/O will queue and be given to the hardware
1769 * after we resume (though there should be none).
1770 */
1771 while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0)
1772 pause("nvmesusp", 1);
1773 if (to <= 0) {
1774 nvme_printf(ctrlr,
1775 "Competing reset task didn't finish. Try again later.\n");
1776 return (EWOULDBLOCK);
1777 }
1778
1779 if (ctrlr->hmb_nchunks > 0)
1780 nvme_ctrlr_hmb_enable(ctrlr, false, false);
1781
1782 /*
1783 * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to
1784 * delete the hardware I/O queues, and then shut down. This properly
1785 * flushes any metadata the drive may have stored so it can survive
1786 * having its power removed and prevents the unsafe shutdown count from
1787 * incrementing. Once we delete the qpairs, we have to disable them
1788 * before shutting down.
1789 */
1790 nvme_ctrlr_delete_qpairs(ctrlr);
1791 nvme_ctrlr_disable_qpairs(ctrlr);
1792 nvme_ctrlr_shutdown(ctrlr);
1793
1794 return (0);
1795 }
1796
1797 int
1798 nvme_ctrlr_resume(struct nvme_controller *ctrlr)
1799 {
1800
1801 /*
1802 * Can't touch failed controllers, so nothing to do to resume.
1803 */
1804 if (ctrlr->is_failed)
1805 return (0);
1806
1807 if (nvme_ctrlr_hw_reset(ctrlr) != 0)
1808 goto fail;
1809
1810 /*
1811 * Now that we've reset the hardware, we can restart the controller. Any
1812 * I/O that was pending is requeued. Any admin commands are aborted with
1813 * an error. Once we've restarted, stop flagging the controller as being
1814 * in the reset phase.
1815 */
1816 nvme_ctrlr_start(ctrlr, true);
1817 (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
1818
1819 return (0);
1820 fail:
1821 /*
1822 * Since we can't bring the controller out of reset, announce and fail
1823 * the controller. However, we have to return success for the resume
1824 * itself, due to questionable APIs.
1825 */
1826 nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
1827 nvme_ctrlr_fail(ctrlr, true);
1828 (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
1829 return (0);
1830 }
1831