/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2012-2014 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/proc.h>
#include <sys/sbuf.h>

#include <dev/pci/pcivar.h>

#include "nvme_private.h"

typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
#define DO_NOT_RETRY	1

static void	_nvme_qpair_submit_request(struct nvme_qpair *qpair,
    struct nvme_request *req);
static void	nvme_qpair_destroy(struct nvme_qpair *qpair);

static const char *
get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
{
	struct sbuf sb;

	sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
	nvme_opcode_sbuf(admin, opc, &sb);
	if (sbuf_finish(&sb) != 0)
		return ("");
	return (buf);
}

static void
nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{
	char buf[64];

	nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
	    "cdw10:%08x cdw11:%08x\n",
	    get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
	    cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
	    le32toh(cmd->cdw11));
}

static void
nvme_io_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{
	char buf[64];

	switch (cmd->opc) {
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_VERIFY:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
		    "lba:%llu len:%d\n",
		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
		    qpair->id, cmd->cid, le32toh(cmd->nsid),
		    ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
		    (le32toh(cmd->cdw12) & 0xFFFF) + 1);
		break;
	default:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
		    qpair->id, cmd->cid, le32toh(cmd->nsid));
		break;
	}
}

void
nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
{
	if (qpair->id == 0)
		nvme_admin_qpair_print_command(qpair, cmd);
	else
		nvme_io_qpair_print_command(qpair, cmd);
	if (nvme_verbose_cmd_dump) {
		nvme_printf(qpair->ctrlr,
		    "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
		    cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
		    (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
		nvme_printf(qpair->ctrlr,
		    "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
		    cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
		    cmd->cdw15);
	}
}

static const char *
get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
{
	struct sbuf sb;

	sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
	nvme_sc_sbuf(cpl, &sb);
	if (sbuf_finish(&sb) != 0)
		return ("");
	return (buf);
}

void
nvme_qpair_print_completion(struct nvme_qpair *qpair,
    struct nvme_completion *cpl)
{
	char buf[64];
	uint8_t crd, m, dnr, p;

	crd = NVME_STATUS_GET_CRD(cpl->status);
	m = NVME_STATUS_GET_M(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);
	p = NVME_STATUS_GET_P(cpl->status);

	nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
	    "sqid:%d cid:%d cdw0:%x\n",
	    get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
	    cpl->sqid, cpl->cid, cpl->cdw0);
}

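/*
 * Decide whether a failed completion is worth retrying.  Only a few generic
 * and path-related status codes (aborted by request, namespace not ready,
 * internal path error) are treated as transient, and even those are not
 * retried when the controller sets the DNR (do not retry) bit.
 */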
static bool
nvme_completion_is_retry(const struct nvme_completion *cpl)
{
	uint8_t sct, sc, dnr;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);	/* Do Not Retry Bit */

	/*
	 * TODO: spec is not clear how commands that are aborted due
	 * to TLER will be marked.  So for now, it seems
	 * NAMESPACE_NOT_READY is the only case where we should
	 * look at the DNR bit.  Requests failed with ABORTED_BY_REQUEST
	 * set the DNR bit correctly since the driver controls that.
	 */
	switch (sct) {
	case NVME_SCT_GENERIC:
		switch (sc) {
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_NAMESPACE_NOT_READY:
			if (dnr)
				return (0);
			else
				return (1);
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return (0);
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
		return (0);
	case NVME_SCT_PATH_RELATED:
		switch (sc) {
		case NVME_SC_INTERNAL_PATH_ERROR:
			if (dnr)
				return (0);
			else
				return (1);
		default:
			return (0);
		}
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return (0);
	}
}

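/*
 * Finish a tracker: decide whether the command will be retried, invoke the
 * request callback (outside the qpair lock) when it won't be, then recycle
 * the tracker and kick one queued request if the controller isn't resetting.
 */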
static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
    struct nvme_completion *cpl, error_print_t print_on_error)
{
	struct nvme_qpair *qpair = tr->qpair;
	struct nvme_request *req;
	bool retry, error, retriable;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	req = tr->req;
	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	retry = error && retriable && req->retries < nvme_retry_count;
	if (retry)
		qpair->num_retries++;
	if (error && req->retries >= nvme_retry_count && retriable)
		qpair->num_failures++;

	if (error && (print_on_error == ERROR_PRINT_ALL ||
	    (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	qpair->act_tr[cpl->cid] = NULL;

	KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));

	if (!retry) {
		if (req->payload_valid) {
			bus_dmamap_sync(qpair->dma_tag_payload,
			    tr->payload_dma_map,
			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		}
		if (req->cb_fn)
			req->cb_fn(req->cb_arg, cpl);
	}

	mtx_lock(&qpair->lock);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
	} else {
		if (req->payload_valid) {
			bus_dmamap_unload(qpair->dma_tag_payload,
			    tr->payload_dma_map);
		}

		nvme_free_request(req);
		tr->req = NULL;

		TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			_nvme_qpair_submit_request(qpair, req);
		}
	}

	mtx_unlock(&qpair->lock);
}

static uint32_t
nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
{
	uint32_t status = 0;

	status |= NVMEF(NVME_STATUS_SCT, sct);
	status |= NVMEF(NVME_STATUS_SC, sc);
	status |= NVMEF(NVME_STATUS_DNR, dnr);
	/* M=0 : this is artificial so no data in error log page */
	/* CRD=0 : this is artificial and no delayed retry support anyway */
	/* P=0 : phase not checked */
	return (status);
}

static void
nvme_qpair_manual_complete_tracker(
    struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion cpl;
	struct nvme_qpair *qpair = tr->qpair;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	memset(&cpl, 0, sizeof(cpl));

	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
	nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
}

static void
nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
    struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion cpl;
	bool error;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
	error = nvme_completion_is_error(&cpl);

	if (error && print_on_error == ERROR_PRINT_ALL) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, &cpl);
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, &cpl);

	nvme_free_request(req);
}

/* Locked version of completion processor */
static bool
_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_completion cpl;
	bool done = false;
	bool in_panic = dumping || SCHEDULER_STOPPED();

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * qpair is not enabled, likely because a controller reset is in
	 * progress.  Ignore the interrupt - any I/O that was associated with
	 * this interrupt will get retried when the reset is complete.  Any
	 * pending completions for when we're in startup will be completed
	 * as soon as initialization is complete and we start sending commands
	 * to the device.
	 */
	if (qpair->recovery_state != RECOVERY_NONE) {
		qpair->num_ignored++;
		return (false);
	}

	/*
	 * Sanity check initialization.  After we reset the hardware, the phase
	 * is defined to be 1.  So if we get here with zero prior calls and the
	 * phase is 0, it means that we've lost a race between the
	 * initialization and the ISR running.  With the phase wrong, we'll
	 * process a bunch of completions that aren't really completions,
	 * leading to a KASSERT below.
	 */
	KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
	    ("%s: Phase wrong for first interrupt call.",
	    device_get_nameunit(qpair->ctrlr->dev)));

	qpair->num_intr_handler_calls++;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	/*
	 * A panic can stop the CPU this routine is running on at any point.  If
	 * we're called during a panic, complete the sq_head wrap protocol for
	 * the case where we are interrupted just after the increment at 1
	 * below, but before we can reset cq_head to zero at 2.  Also cope with
	 * the case where we do the zero at 2, but may or may not have done the
	 * phase adjustment at step 3.  The panic machinery flushes all pending
	 * memory writes, so we can make these strong ordering assumptions
	 * that would otherwise be unwise if we were racing in real time.
	 */
	if (__predict_false(in_panic)) {
		if (qpair->cq_head == qpair->num_entries) {
			/*
			 * Here we know that we need to zero cq_head and then
			 * negate the phase, which hasn't been assigned if
			 * cq_head isn't zero due to the atomic_store_rel.
			 */
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		} else if (qpair->cq_head == 0) {
			/*
			 * In this case, we know that the assignment at 2
			 * happened below, but we don't know whether the phase
			 * adjustment at 3 happened or not.  To resync, we look
			 * at the last completion entry and set the phase to
			 * the opposite of the phase it has.  This gets us back
			 * in sync.
			 */
			cpl = qpair->cpl[qpair->num_entries - 1];
			nvme_completion_swapbytes(&cpl);
			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
		}
	}

	while (1) {
		uint16_t status;

		/*
		 * We need to do this dance to avoid a race between the host and
		 * the device where the device overtakes the host while the host
		 * is reading this record, leaving the status field 'new' and
		 * the sqhd and cid fields potentially stale.  If the phase
		 * doesn't match, that means status hasn't yet been updated and
		 * we'll get any pending changes next time.  It also means that
		 * the phase must be the same the second time.  We have to sync
		 * before reading to ensure any bouncing completes.
		 */
		status = le16toh(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase)
			break;

		bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		KASSERT(
		    NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
		    ("Phase unexpectedly inconsistent"));

		if (cpl.cid < qpair->num_trackers)
			tr = qpair->act_tr[cpl.cid];
		else
			tr = NULL;

		done = true;
		if (tr != NULL) {
			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
			qpair->sq_head = cpl.sqhd;
		} else if (!in_panic) {
			/*
			 * A missing tracker is normally an error.  However, a
			 * panic can stop the CPU this routine is running on
			 * after completing an I/O but before updating
			 * qpair->cq_head at 1 below.  Later, we re-enter this
			 * routine to poll I/O associated with the kernel
			 * dump.  We find that the tr has been set to null before
			 * calling the completion routine.  If it hasn't
			 * completed (or it triggers a panic), then '1' below
			 * won't have updated cq_head.  Rather than panic again,
			 * ignore this condition because it's not unexpected.
			 */
			nvme_printf(qpair->ctrlr,
			    "cpl (cid = %u) does not map to outstanding cmd\n",
			    cpl.cid);
			nvme_qpair_print_completion(qpair,
			    &qpair->cpl[qpair->cq_head]);
			KASSERT(0, ("received completion for unknown cmd"));
		}

		/*
		 * There are a number of races with the following (see above)
		 * when the system panics.
		 * We compensate for each one of them by using the atomic store
		 * to force strong ordering (at least when viewed in the
		 * aftermath of a panic).
		 */
		if (++qpair->cq_head == qpair->num_entries) {		/* 1 */
			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
			qpair->phase = !qpair->phase;			/* 3 */
		}
	}

	if (done) {
		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
		    qpair->cq_hdbl_off, qpair->cq_head);
	}

	return (done);
}

bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	bool done = false;

	/*
	 * Interlock with the reset / recovery code.  This lock is usually
	 * uncontended; it makes sure that we drain out of the ISRs before we
	 * reset the card and prevents races with the recovery process called
	 * from a timeout context.
	 */
	mtx_lock(&qpair->recovery);

	if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
		done = _nvme_qpair_process_completions(qpair);
	else
		qpair->num_recovery_nolock++;	// XXX likely need to rename

	mtx_unlock(&qpair->recovery);

	return (done);
}

static void
nvme_qpair_msi_handler(void *arg)
{
	struct nvme_qpair *qpair = arg;

	nvme_qpair_process_completions(qpair);
}

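/*
 * Construct a queue pair.  The submission queue, completion queue, and all
 * per-tracker PRP lists are carved out of a single physically contiguous
 * DMA allocation laid out as [commands | completions | PRP lists], with
 * each component rounded up to the controller page size.
 */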
int
nvme_qpair_construct(struct nvme_qpair *qpair,
    uint32_t num_entries, uint32_t num_trackers,
    struct nvme_controller *ctrlr)
{
	struct nvme_tracker *tr;
	size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
	uint64_t queuemem_phys, prpmem_phys, list_phys;
	uint8_t *queuemem, *prpmem, *prp_list;
	int i, err;

	qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
	qpair->num_entries = num_entries;
	qpair->num_trackers = num_trackers;
	qpair->ctrlr = ctrlr;

	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
	mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);

	callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
	qpair->timer_armed = false;
	qpair->recovery_state = RECOVERY_WAITING;

	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    4, ctrlr->page_size, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
	    ctrlr->page_size, 0,
	    NULL, NULL, &qpair->dma_tag_payload);
	if (err != 0) {
		nvme_printf(ctrlr, "payload tag create failed %d\n", err);
		goto out;
	}

	/*
	 * Each component must be page aligned, and individual PRP lists
	 * cannot cross a page boundary.
	 */
	cmdsz = qpair->num_entries * sizeof(struct nvme_command);
	cmdsz = roundup2(cmdsz, ctrlr->page_size);
	cplsz = qpair->num_entries * sizeof(struct nvme_completion);
	cplsz = roundup2(cplsz, ctrlr->page_size);
	/*
	 * For commands requiring more than 2 PRP entries, one PRP will be
	 * embedded in the command (prp1), and the rest of the PRP entries
	 * will be in a list pointed to by the command (prp2).
	 */
	prpsz = sizeof(uint64_t) *
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size);
	prpmemsz = qpair->num_trackers * prpsz;
	allocsz = cmdsz + cplsz + prpmemsz;

	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
	    allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
	if (err != 0) {
		nvme_printf(ctrlr, "tag create failed %d\n", err);
		goto out;
	}
	bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);

	if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
	    BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
		nvme_printf(ctrlr, "failed to alloc qpair memory\n");
		goto out;
	}

	if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
	    queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
		nvme_printf(ctrlr, "failed to load qpair memory\n");
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		goto out;
	}

	qpair->num_cmds = 0;
	qpair->num_intr_handler_calls = 0;
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;
	qpair->cmd = (struct nvme_command *)queuemem;
	qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
	prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
	qpair->cmd_bus_addr = queuemem_phys;
	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
	prpmem_phys = queuemem_phys + cmdsz + cplsz;

	/*
	 * Calculate the stride of the doorbell register.  Many emulators set
	 * this value to correspond to a cache line.  However, some hardware
	 * has set it to various small values.
	 */
	qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1));
	qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
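	/*
	 * For example (assuming doorbell[0] sits at the spec-defined offset
	 * 0x1000 and ctrlr->dstrd holds the byte-granularity shift, i.e.
	 * CAP.DSTRD + 2): with a 4-byte stride, I/O qpair 1 ends up with
	 * sq_tdbl_off = 0x1008 and cq_hdbl_off = 0x100c.
	 */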

	TAILQ_INIT(&qpair->free_tr);
	TAILQ_INIT(&qpair->outstanding_tr);
	STAILQ_INIT(&qpair->queued_req);

	list_phys = prpmem_phys;
	prp_list = prpmem;
	for (i = 0; i < qpair->num_trackers; i++) {
		if (list_phys + prpsz > prpmem_phys + prpmemsz) {
			qpair->num_trackers = i;
			break;
		}

		/*
		 * Make sure that the PRP list for this tracker doesn't
		 * overflow to another nvme page.
		 */
		if (trunc_page(list_phys) !=
		    trunc_page(list_phys + prpsz - 1)) {
			list_phys = roundup2(list_phys, ctrlr->page_size);
			prp_list =
			    (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
		}

		tr = malloc_domainset(sizeof(*tr), M_NVME,
		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
		bus_dmamap_create(qpair->dma_tag_payload, 0,
		    &tr->payload_dma_map);
		tr->cid = i;
		tr->qpair = qpair;
		tr->prp = (uint64_t *)prp_list;
		tr->prp_bus_addr = list_phys;
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
		list_phys += prpsz;
		prp_list += prpsz;
	}

	if (qpair->num_trackers == 0) {
		nvme_printf(ctrlr, "failed to allocate enough trackers\n");
		goto out;
	}

	qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
	    qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
	    M_ZERO | M_WAITOK);

	if (ctrlr->msi_count > 1) {
		/*
		 * MSI-X vector resource IDs start at 1, so we add one to
		 * the queue's vector to get the corresponding rid to use.
		 */
		qpair->rid = qpair->vector + 1;

		qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
		    &qpair->rid, RF_ACTIVE);
		if (qpair->res == NULL) {
			nvme_printf(ctrlr, "unable to allocate MSI\n");
			goto out;
		}
		if (bus_setup_intr(ctrlr->dev, qpair->res,
		    INTR_TYPE_MISC | INTR_MPSAFE, NULL,
		    nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
			nvme_printf(ctrlr, "unable to setup MSI\n");
			goto out;
		}
		if (qpair->id == 0) {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "admin");
		} else {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "io%d", qpair->id - 1);
		}
	}

	return (0);

out:
	nvme_qpair_destroy(qpair);
	return (ENOMEM);
}

static void
nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	mtx_lock(&qpair->recovery);
	qpair->timer_armed = false;
	mtx_unlock(&qpair->recovery);
	callout_drain(&qpair->timer);

	if (qpair->tag) {
		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
		qpair->tag = NULL;
	}

	if (qpair->act_tr) {
		free(qpair->act_tr, M_NVME);
		qpair->act_tr = NULL;
	}

	while (!TAILQ_EMPTY(&qpair->free_tr)) {
		tr = TAILQ_FIRST(&qpair->free_tr);
		TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
		bus_dmamap_destroy(qpair->dma_tag_payload,
		    tr->payload_dma_map);
		free(tr, M_NVME);
	}

	if (qpair->cmd != NULL) {
		bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		qpair->cmd = NULL;
	}

	if (qpair->dma_tag) {
		bus_dma_tag_destroy(qpair->dma_tag);
		qpair->dma_tag = NULL;
	}

	if (qpair->dma_tag_payload) {
		bus_dma_tag_destroy(qpair->dma_tag_payload);
		qpair->dma_tag_payload = NULL;
	}

	if (mtx_initialized(&qpair->lock))
		mtx_destroy(&qpair->lock);
	if (mtx_initialized(&qpair->recovery))
		mtx_destroy(&qpair->recovery);

	if (qpair->res) {
		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(qpair->res), qpair->res);
		qpair->res = NULL;
	}
}

static void
nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	/*
	 * nvme_complete_tracker must be called without the qpair lock held.  It
	 * takes the lock to adjust the outstanding_tr list, so make sure we
	 * don't have it yet.  We need the lock to make the list traverse safe,
	 * but have to drop the lock to complete any AER.  We restart the list
	 * scan when we do this to make this safe.  There's interlock with the
	 * ISR so we know this tracker won't be completed twice.
	 */
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	mtx_lock(&qpair->lock);
	tr = TAILQ_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
			tr = TAILQ_NEXT(tr, tailq);
			continue;
		}
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
		    ERROR_PRINT_NONE);
		mtx_lock(&qpair->lock);
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
	}
	mtx_unlock(&qpair->lock);
}

void
nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
{
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	nvme_admin_qpair_abort_aers(qpair);
	nvme_qpair_destroy(qpair);
}

void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{

	nvme_qpair_destroy(qpair);
}

static void
nvme_abort_complete(void *arg, const struct nvme_completion *status)
{
	struct nvme_tracker *tr = arg;

	/*
	 * If cdw0 bit 0 == 1, the controller was not able to abort the command
	 * we requested.  We still need to check the active tracker array, to
	 * cover the race where the I/O timed out at the same time the
	 * controller was completing it.  An abort command is always on the
	 * admin queue, but affects either an admin or an I/O queue, so take
	 * the appropriate qpair lock for the original command's queue, since
	 * we'll need it to avoid races with the completion code and to
	 * complete the command manually.
	 */
	mtx_lock(&tr->qpair->lock);
	if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
		/*
		 * An I/O has timed out, and the controller was unable to abort
		 * it for some reason.  And we've not processed a completion for
		 * it yet.  Construct a fake completion status, and then complete
		 * the I/O's tracker manually.
		 */
		nvme_printf(tr->qpair->ctrlr,
		    "abort command failed, aborting command manually\n");
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
	}
	/*
	 * XXX We don't check status for the possible 'Could not abort because
	 * excess aborts were submitted to the controller'.  We don't prevent
	 * that, either.  Document for the future here, since the standard is
	 * squishy and only says 'may generate' but implies anything is possible
	 * including hangs if you exceed the ACL.
	 */
	mtx_unlock(&tr->qpair->lock);
}

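/*
 * Per-qpair watchdog, run every half second (under the recovery lock) while
 * commands are outstanding.  In RECOVERY_NONE it looks for trackers whose
 * deadline has passed and either requests a hardware abort or resets the
 * controller; in RECOVERY_WAITING it keeps polling until the reset completes.
 */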
static void
nvme_qpair_timeout(void *arg)
{
	struct nvme_qpair *qpair = arg;
	struct nvme_controller *ctrlr = qpair->ctrlr;
	struct nvme_tracker *tr;
	sbintime_t now;
	bool idle = true;
	bool is_admin = qpair == &ctrlr->adminq;
	bool fast;
	uint32_t csts;
	uint8_t cfs;

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * If the controller is failed, then stop polling.  This ensures that
	 * any failure processing that races with the qpair timeout will fail
	 * safely.
	 */
	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
		nvme_printf(qpair->ctrlr,
		    "%sFailed controller, stopping watchdog timeout.\n",
		    is_admin ? "Complete " : "");
		qpair->timer_armed = false;
		return;
	}

	/*
	 * Shutdown condition: We set qpair->timer_armed to false in
	 * nvme_qpair_destroy before calling callout_drain.  When we call that,
	 * this routine might get called one last time.  Exit without setting a
	 * timeout.  None of the watchdog work needs to be done since we're
	 * destroying the qpair.
	 */
	if (!qpair->timer_armed) {
		nvme_printf(qpair->ctrlr,
		    "Timeout fired during nvme_qpair_destroy\n");
		return;
	}

	switch (qpair->recovery_state) {
	case RECOVERY_NONE:
		/*
		 * Read csts to get the value of cfs - controller fatal status.
		 * If we are in the hot-plug or controller-failed state, proceed
		 * directly to reset.  We also bail early if the status reads
		 * all 1's or the controller fatal status bit is now 1.  The
		 * latter is always true when the former is true, but not vice
		 * versa.  The intent of the code is that if the card is gone
		 * (all 1's) or we've failed, then try to do a reset (which
		 * sometimes unwedges a card reading all 1's that's not gone
		 * away, but usually doesn't).
		 */
		csts = nvme_mmio_read_4(ctrlr, csts);
		cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
		if (csts == NVME_GONE || cfs == 1) {
			/*
			 * We've had a command timeout that we weren't able to
			 * abort or we have aborts disabled and any command
			 * timed out.
			 *
			 * If we get here due to a possible surprise hot-unplug
			 * event, then we let nvme_ctrlr_reset confirm and fail
			 * the controller.
			 */
do_reset:
			nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
			    (csts == 0xffffffff) ? " and possible hot unplug" :
			    (cfs ? " and fatal error status" : ""));
			qpair->recovery_state = RECOVERY_WAITING;
			nvme_ctrlr_reset(ctrlr);
			idle = false;
			break;
		}

		/*
		 * See if there's any recovery needed.  First, do a fast check
		 * to see if anything could have timed out.  If not, then skip
		 * everything else.
		 */
		fast = false;
		mtx_lock(&qpair->lock);
		now = getsbinuptime();
		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
			/*
			 * Skip async commands, they are posted to the card for
			 * an indefinite amount of time and have no deadline.
			 */
			if (tr->deadline == SBT_MAX)
				continue;

			/*
			 * If the first real transaction is not in timeout, then
			 * we're done.  Otherwise, we try recovery.
			 */
			idle = false;
			if (now <= tr->deadline)
				fast = true;
			break;
		}
		mtx_unlock(&qpair->lock);
		if (idle || fast)
			break;

		/*
		 * There's a stale transaction at the start of the queue whose
		 * deadline has passed.  Poll the completions as a last-ditch
		 * effort in case an interrupt has been missed.  If polling
		 * finds completions, warn the user about possible interrupt
		 * issues, but only once per controller.
		 */
		if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
			nvme_printf(ctrlr, "System interrupt issues?\n");
			ctrlr->isr_warned = true;
		}

		/*
		 * Now that we've run the ISR, re-check to see if there are any
		 * timed out commands and abort them or reset the card if so.
		 */
		mtx_lock(&qpair->lock);
		idle = true;
		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
			/*
			 * Skip async commands, they are posted to the card for
			 * an indefinite amount of time and have no deadline.
			 */
			if (tr->deadline == SBT_MAX)
				continue;

			/*
			 * If we know this tracker hasn't timed out, we also
			 * know all subsequent ones haven't timed out.  The tr
			 * queue is in submission order and all normal commands
			 * in a queue have the same timeout (or the timeout was
			 * changed by the user, but we eventually time out then).
			 */
			idle = false;
			if (now <= tr->deadline)
				break;

			/*
			 * Timeout expired, abort it or reset controller.
			 */
			if (ctrlr->enable_aborts &&
			    tr->req->cb_fn != nvme_abort_complete) {
				/*
				 * This isn't an abort command, ask for a
				 * hardware abort.  This goes to the admin
				 * queue which will reset the card if it
				 * times out.
				 */
				nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
				    nvme_abort_complete, tr);
			} else {
				/*
				 * We have a live command in the card (either
				 * one we couldn't abort, or aborts weren't
				 * enabled).  We can only reset.
				 */
				mtx_unlock(&qpair->lock);
				goto do_reset;
			}
		}
		mtx_unlock(&qpair->lock);
		break;

	case RECOVERY_WAITING:
		/*
		 * These messages aren't interesting while we're suspended.  We
		 * put the queues into waiting state while suspending.
		 * Suspending takes a while, so we'll see these during that time
		 * and they aren't diagnostic.  At other times, they indicate a
		 * problem that's worth complaining about.
		 */
		if (!device_is_suspended(ctrlr->dev))
			nvme_printf(ctrlr, "Waiting for reset to complete\n");
		idle = false;		/* We want to keep polling */
		break;
	}

	/*
	 * Rearm the timeout.
	 */
	if (!idle) {
		callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
	} else {
		qpair->timer_armed = false;
	}
}

/*
 * Submit the tracker to the hardware.  Must already be in the
 * outstanding queue when called.
 */
void
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_controller *ctrlr;
	int timeout;

	mtx_assert(&qpair->lock, MA_OWNED);

	req = tr->req;
	req->cmd.cid = tr->cid;
	qpair->act_tr[tr->cid] = tr;
	ctrlr = qpair->ctrlr;

	if (req->timeout) {
		if (req->cb_fn == nvme_completion_poll_cb)
			timeout = 1;
		else if (qpair->id == 0)
			timeout = ctrlr->admin_timeout_period;
		else
			timeout = ctrlr->timeout_period;
		tr->deadline = getsbinuptime() + timeout * SBT_1S;
		if (!qpair->timer_armed) {
			qpair->timer_armed = true;
			callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
			    nvme_qpair_timeout, qpair, qpair->cpu, 0);
		}
	} else
		tr->deadline = SBT_MAX;

	/* Copy the command from the tracker to the submission queue. */
	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));

	if (++qpair->sq_tail == qpair->num_entries)
		qpair->sq_tail = 0;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
	    qpair->sq_tdbl_off, qpair->sq_tail);
	qpair->num_cmds++;
}

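/*
 * Callback for bus_dmamap_load_mem: translate the DMA segments into NVMe PRP
 * entries.  A single segment uses only prp1; two segments use prp1 and prp2
 * directly; anything larger points prp2 at the tracker's pre-allocated PRP
 * list.  For example, a page-aligned 16 KiB transfer with 4 KiB pages maps to
 * four segments: prp1 holds the first page and the PRP list holds the
 * remaining three.
 */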
static void
nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
{
	struct nvme_tracker *tr = arg;
	uint32_t cur_nseg;

	/*
	 * If the mapping operation failed, return immediately.  The caller
	 * is responsible for detecting the error status and failing the
	 * tracker manually.
	 */
	if (error != 0) {
		nvme_printf(tr->qpair->ctrlr,
		    "nvme_payload_map err %d\n", error);
		return;
	}

	/*
	 * Note that we specified ctrlr->page_size for alignment and max
	 * segment size when creating the bus dma tags.  So here we can safely
	 * just transfer each segment to its associated PRP entry.
	 */
	tr->req->cmd.prp1 = htole64(seg[0].ds_addr);

	if (nseg == 2) {
		tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
	} else if (nseg > 2) {
		cur_nseg = 1;
		tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
		while (cur_nseg < nseg) {
			tr->prp[cur_nseg - 1] =
			    htole64((uint64_t)seg[cur_nseg].ds_addr);
			cur_nseg++;
		}
	} else {
		/*
		 * prp2 should not be used by the controller
		 * since there is only one segment, but set
		 * to 0 just to be safe.
		 */
		tr->req->cmd.prp2 = 0;
	}

	bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	nvme_qpair_submit_tracker(tr->qpair, tr);
}

static void
_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int err = 0;
	bool is_admin = qpair == &qpair->ctrlr->adminq;

	mtx_assert(&qpair->lock, MA_OWNED);

	tr = TAILQ_FIRST(&qpair->free_tr);
	req->qpair = qpair;

	/*
	 * The controller has failed, so fail the request.  Note that this
	 * races the recovery / timeout code.  Since we hold the qpair lock, we
	 * know it's safe to fail directly.  is_failed is set when we fail the
	 * controller.  It is only ever reset in the ioctl reset controller
	 * path, which is safe to race (for failed controllers, we make no
	 * guarantees about bringing it out of failed state relative to other
	 * commands).  We try hard to allow admin commands when the entire
	 * controller hasn't failed, only something related to I/O queues.
	 */
	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
		nvme_qpair_manual_complete_request(qpair, req,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
		    ERROR_PRINT_NONE);
		return;
	}

	/*
	 * No tracker is available, or the qpair is disabled due to an
	 * in-progress controller-level reset.  If we lose the race with
	 * recovery_state, then we may add an extra request to the queue which
	 * will be resubmitted later.  We only set recovery_state to NONE with
	 * qpair->lock also held, so if we observe that the state is not NONE,
	 * we know it won't transition back to NONE without retrying queued
	 * requests.
	 */
	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		return;
	}

	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
	tr->deadline = SBT_MAX;
	tr->req = req;

	if (!req->payload_valid) {
		nvme_qpair_submit_tracker(tr->qpair, tr);
		return;
	}

	/*
	 * tr->deadline is updated when nvme_payload_map calls
	 * nvme_qpair_submit_tracker (we call it directly above
	 * when there's no map to load).
	 */
	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
	if (err != 0) {
		/*
		 * The dmamap operation failed, so we manually fail the
		 * tracker here with DATA_TRANSFER_ERROR status.
		 *
		 * nvme_qpair_manual_complete_tracker must not be called
		 * with the qpair lock held.
		 */
		nvme_printf(qpair->ctrlr,
		    "bus_dmamap_load_mem returned 0x%x!\n", err);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}
}

void
nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{

	mtx_lock(&qpair->lock);
	_nvme_qpair_submit_request(qpair, req);
	mtx_unlock(&qpair->lock);
}

static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
	bool is_admin __unused = qpair == &qpair->ctrlr->adminq;

	if (mtx_initialized(&qpair->recovery))
		mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
		mtx_assert(&qpair->lock, MA_OWNED);
	KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
	    ("Enabling a failed qpair\n"));

	qpair->recovery_state = RECOVERY_NONE;
}

void
nvme_qpair_reset(struct nvme_qpair *qpair)
{

	qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * The first time through the completion queue, HW will set the phase
	 * bit on completions to 1.  So set this to 1 here, indicating we're
	 * looking for a 1 to know which entries have completed.  We'll toggle
	 * the bit each time the completion queue rolls over.
	 */
	qpair->phase = 1;

	memset(qpair->cmd, 0,
	    qpair->num_entries * sizeof(struct nvme_command));
	memset(qpair->cpl, 0,
	    qpair->num_entries * sizeof(struct nvme_completion));
}

void
nvme_admin_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_tracker *tr_temp;
	bool rpt;

	/*
	 * Manually abort each outstanding admin command.  Do not retry admin
	 * commands found here, since they will be left over from a controller
	 * reset and it's likely the context in which the command was issued
	 * no longer applies.
	 */
	rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "aborting outstanding admin command\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
	}
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "done aborting outstanding admin\n");

	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);
	nvme_qpair_enable(qpair);
	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

void
nvme_io_qpair_enable(struct nvme_qpair *qpair)
{
	STAILQ_HEAD(, nvme_request) temp;
	struct nvme_tracker *tr;
	struct nvme_tracker *tr_temp;
	struct nvme_request *req;
	bool report;

	/*
	 * Manually abort each outstanding I/O.  This normally results in a
	 * retry, unless the retry count on the associated request has
	 * reached its limit.
	 */
	report = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (report)
		nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");

	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);
	nvme_qpair_enable(qpair);

	STAILQ_INIT(&temp);
	STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);

	report = !STAILQ_EMPTY(&temp);
	if (report)
		nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
	while (!STAILQ_EMPTY(&temp)) {
		req = STAILQ_FIRST(&temp);
		STAILQ_REMOVE_HEAD(&temp, stailq);
		nvme_qpair_print_command(qpair, &req->cmd);
		_nvme_qpair_submit_request(qpair, req);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");

	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

static void
nvme_qpair_disable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *tr_temp;

	if (mtx_initialized(&qpair->recovery))
		mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
		mtx_assert(&qpair->lock, MA_OWNED);

	qpair->recovery_state = RECOVERY_WAITING;
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		tr->deadline = SBT_MAX;
	}
}

void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);

	mtx_lock(&qpair->lock);
	nvme_qpair_disable(qpair);
	mtx_unlock(&qpair->lock);

	nvme_admin_qpair_abort_aers(qpair);

	mtx_unlock(&qpair->recovery);
}

void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);

	nvme_qpair_disable(qpair);

	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

void
nvme_qpair_fail(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_request *req;

	if (!mtx_initialized(&qpair->lock))
		return;

	mtx_lock(&qpair->lock);

	if (!STAILQ_EMPTY(&qpair->queued_req)) {
		nvme_printf(qpair->ctrlr, "failing queued i/o\n");
	}
	while (!STAILQ_EMPTY(&qpair->queued_req)) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}

	if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
		nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
	}
	/* Manually abort each outstanding I/O. */
	while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
		/*
		 * Do not remove the tracker.  The abort_tracker path will
		 * do that for us.
		 */
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}

	mtx_unlock(&qpair->lock);
}