/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2012-2014 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/proc.h>
#include <sys/sbuf.h>

#include <dev/pci/pcivar.h>

#include "nvme_private.h"

typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
#define DO_NOT_RETRY	1

static void	_nvme_qpair_submit_request(struct nvme_qpair *qpair,
		    struct nvme_request *req);
static void	nvme_qpair_destroy(struct nvme_qpair *qpair);

static const char *
get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
{
	struct sbuf sb;

	sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
	nvme_opcode_sbuf(admin, opc, &sb);
	if (sbuf_finish(&sb) != 0)
		return ("");
	return (buf);
}

static void
nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{
	char buf[64];

	nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
	    "cdw10:%08x cdw11:%08x\n",
	    get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
	    cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
	    le32toh(cmd->cdw11));
}

static void
nvme_io_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{
	char buf[64];

	switch (cmd->opc) {
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_VERIFY:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
		    "lba:%llu len:%d\n",
		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
		    qpair->id, cmd->cid, le32toh(cmd->nsid),
		    ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
		    (le32toh(cmd->cdw12) & 0xFFFF) + 1);
		break;
	default:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
		    qpair->id, cmd->cid, le32toh(cmd->nsid));
		break;
	}
}
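
/*
 * Log a decoded form of a command.  Admin and I/O queues use different
 * opcode decoders.  When nvme_verbose_cmd_dump is set, also dump the raw
 * PRP pointers and command dwords.
 */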
void
nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
{
	if (qpair->id == 0)
		nvme_admin_qpair_print_command(qpair, cmd);
	else
		nvme_io_qpair_print_command(qpair, cmd);
	if (nvme_verbose_cmd_dump) {
		nvme_printf(qpair->ctrlr,
		    "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
		    cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
		    (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
		nvme_printf(qpair->ctrlr,
		    "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
		    cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
		    cmd->cdw15);
	}
}

static const char *
get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
{
	struct sbuf sb;

	sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
	nvme_sc_sbuf(cpl, &sb);
	if (sbuf_finish(&sb) != 0)
		return ("");
	return (buf);
}

void
nvme_qpair_print_completion(struct nvme_qpair *qpair,
    struct nvme_completion *cpl)
{
	char buf[64];
	uint8_t crd, m, dnr, p;

	crd = NVME_STATUS_GET_CRD(cpl->status);
	m = NVME_STATUS_GET_M(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);
	p = NVME_STATUS_GET_P(cpl->status);

	nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
	    "sqid:%d cid:%d cdw0:%x\n",
	    get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
	    cpl->sqid, cpl->cid, cpl->cdw0);
}
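
/*
 * Decide whether a failed completion is worth retrying, based on the status
 * code type, the specific status code, and the DNR (Do Not Retry) bit.
 */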
static bool
nvme_completion_is_retry(const struct nvme_completion *cpl)
{
	uint8_t sct, sc, dnr;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);	/* Do Not Retry Bit */

	/*
	 * TODO: spec is not clear how commands that are aborted due
	 * to TLER will be marked. So for now, it seems
	 * NAMESPACE_NOT_READY is the only case where we should
	 * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
	 * set the DNR bit correctly since the driver controls that.
	 */
	switch (sct) {
	case NVME_SCT_GENERIC:
		switch (sc) {
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_NAMESPACE_NOT_READY:
			if (dnr)
				return (0);
			else
				return (1);
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return (0);
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
		return (0);
	case NVME_SCT_PATH_RELATED:
		switch (sc) {
		case NVME_SC_INTERNAL_PATH_ERROR:
			if (dnr)
				return (0);
			else
				return (1);
		default:
			return (0);
		}
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return (0);
	}
}

static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
    struct nvme_completion *cpl, error_print_t print_on_error)
{
	struct nvme_qpair *qpair = tr->qpair;
	struct nvme_request *req;
	bool retry, error, retriable;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	req = tr->req;
	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	retry = error && retriable && req->retries < nvme_retry_count;
	if (retry)
		qpair->num_retries++;
	if (error && req->retries >= nvme_retry_count && retriable)
		qpair->num_failures++;

	if (error && (print_on_error == ERROR_PRINT_ALL ||
	    (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	qpair->act_tr[cpl->cid] = NULL;

	KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));

	if (!retry) {
		if (req->payload_valid) {
			bus_dmamap_sync(qpair->dma_tag_payload,
			    tr->payload_dma_map,
			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		}
		if (req->cb_fn)
			req->cb_fn(req->cb_arg, cpl);
	}

	mtx_lock(&qpair->lock);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
	} else {
		if (req->payload_valid) {
			bus_dmamap_unload(qpair->dma_tag_payload,
			    tr->payload_dma_map);
		}

		nvme_free_request(req);
		tr->req = NULL;

		TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			_nvme_qpair_submit_request(qpair, req);
		}
	}

	mtx_unlock(&qpair->lock);
}

static uint32_t
nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
{
	uint32_t status = 0;

	status |= NVMEF(NVME_STATUS_SCT, sct);
	status |= NVMEF(NVME_STATUS_SC, sc);
	status |= NVMEF(NVME_STATUS_DNR, dnr);
	/* M=0 : this is artificial so no data in error log page */
	/* CRD=0 : this is artificial and no delayed retry support anyway */
	/* P=0 : phase not checked */
	return (status);
}

static void
nvme_qpair_manual_complete_tracker(
    struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion cpl;
	struct nvme_qpair *qpair = tr->qpair;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	memset(&cpl, 0, sizeof(cpl));

	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
	nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
}

static void
nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
    struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion cpl;
	bool error;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
	error = nvme_completion_is_error(&cpl);

	if (error && print_on_error == ERROR_PRINT_ALL) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, &cpl);
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, &cpl);

	nvme_free_request(req);
}

/* Locked version of completion processor */
static bool
_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_completion cpl;
	bool done = false;
	bool in_panic = dumping || SCHEDULER_STOPPED();

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * qpair is not enabled, likely because a controller reset is in
	 * progress. Ignore the interrupt - any I/O that was associated with
	 * this interrupt will get retried when the reset is complete. Any
	 * pending completions for when we're in startup will be completed
	 * as soon as initialization is complete and we start sending commands
	 * to the device.
	 */
	if (qpair->recovery_state != RECOVERY_NONE) {
		qpair->num_ignored++;
		return (false);
	}

	/*
	 * Sanity check initialization. After we reset the hardware, the phase
	 * is defined to be 1. So if we get here with zero prior calls and the
	 * phase is 0, it means that we've lost a race between the
	 * initialization and the ISR running. With the phase wrong, we'll
	 * process a bunch of completions that aren't really completions leading
	 * to a KASSERT below.
	 */
	KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
	    ("%s: Phase wrong for first interrupt call.",
	    device_get_nameunit(qpair->ctrlr->dev)));

	qpair->num_intr_handler_calls++;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	/*
	 * A panic can stop the CPU this routine is running on at any point. If
	 * we're called during a panic, complete the cq_head wrap protocol for
	 * the case where we are interrupted just after the increment at 1
	 * below, but before we can reset cq_head to zero at 2. Also cope with
	 * the case where we do the zero at 2, but may or may not have done the
	 * phase adjustment at step 3. The panic machinery flushes all pending
	 * memory writes, so we can make these strong ordering assumptions
	 * that would otherwise be unwise if we were racing in real time.
	 */
	if (__predict_false(in_panic)) {
		if (qpair->cq_head == qpair->num_entries) {
			/*
			 * Here we know that we need to zero cq_head and then
			 * negate the phase, which hasn't been assigned if
			 * cq_head isn't zero due to the atomic_store_rel.
			 */
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		} else if (qpair->cq_head == 0) {
			/*
			 * In this case, we know that the assignment at 2
			 * happened below, but we don't know if step 3 happened
			 * or not. To find out, we look at the last completion
			 * entry and set the phase to the opposite phase
			 * that it has. This gets us back in sync.
			 */
			cpl = qpair->cpl[qpair->num_entries - 1];
			nvme_completion_swapbytes(&cpl);
			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
		}
	}

	while (1) {
		uint16_t status;

		/*
		 * We need to do this dance to avoid a race between the host and
		 * the device where the device overtakes the host while the host
		 * is reading this record, leaving the status field 'new' and
		 * the sqhd and cid fields potentially stale. If the phase
		 * doesn't match, that means status hasn't yet been updated and
		 * we'll get any pending changes next time. It also means that
		 * the phase must be the same the second time. We have to sync
		 * before reading to ensure any bouncing completes.
		 */
		status = le16toh(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase)
			break;

		bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		KASSERT(
		    NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
		    ("Phase unexpectedly inconsistent"));

		if (cpl.cid < qpair->num_trackers)
			tr = qpair->act_tr[cpl.cid];
		else
			tr = NULL;

		done = true;
		if (tr != NULL) {
			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
			qpair->sq_head = cpl.sqhd;
		} else if (!in_panic) {
			/*
			 * A missing tracker is normally an error. However, a
			 * panic can stop the CPU this routine is running on
			 * after completing an I/O but before updating
			 * qpair->cq_head at 1 below. Later, we re-enter this
			 * routine to poll I/O associated with the kernel
			 * dump. We find that the tr has been set to null before
			 * calling the completion routine. If it hasn't
			 * completed (or it triggers a panic), then '1' below
			 * won't have updated cq_head. Rather than panic again,
			 * ignore this condition because it's not unexpected.
			 */
			nvme_printf(qpair->ctrlr,
			    "cpl (cid = %u) does not map to outstanding cmd\n",
			    cpl.cid);
			nvme_qpair_print_completion(qpair,
			    &qpair->cpl[qpair->cq_head]);
			KASSERT(0, ("received completion for unknown cmd"));
		}

		/*
		 * There are a number of races with the following (see above)
		 * when the system panics. We compensate for each one of them by
		 * using the atomic store to force strong ordering (at least when
		 * viewed in the aftermath of a panic).
		 */
		if (++qpair->cq_head == qpair->num_entries) {		/* 1 */
			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
			qpair->phase = !qpair->phase;			/* 3 */
		}
	}

	if (done) {
		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
		    qpair->cq_hdbl_off, qpair->cq_head);
	}

	return (done);
}
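
/*
 * Unlocked entry point for completion processing: take the recovery lock to
 * interlock with reset/recovery, then run the locked processor above.
 */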
bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	bool done = false;

	/*
	 * Interlock with reset / recovery code. This lock is usually
	 * uncontended. It makes sure that we drain out of the ISRs before we
	 * reset the card and prevents races with the recovery process called
	 * from a timeout context.
	 */
	mtx_lock(&qpair->recovery);

	if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
		done = _nvme_qpair_process_completions(qpair);
	else
		qpair->num_recovery_nolock++;	// XXX likely need to rename

	mtx_unlock(&qpair->recovery);

	return (done);
}

static void
nvme_qpair_msi_handler(void *arg)
{
	struct nvme_qpair *qpair = arg;

	nvme_qpair_process_completions(qpair);
}
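
/*
 * Construct a qpair. A single DMA allocation holds the submission queue,
 * the completion queue, and the per-tracker PRP lists, each page aligned.
 * Trackers and the per-queue interrupt (when more than one MSI vector is
 * available) are then set up on top of it.
 */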
int
nvme_qpair_construct(struct nvme_qpair *qpair,
    uint32_t num_entries, uint32_t num_trackers,
    struct nvme_controller *ctrlr)
{
	struct nvme_tracker *tr;
	size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
	uint64_t queuemem_phys, prpmem_phys, list_phys;
	uint8_t *queuemem, *prpmem, *prp_list;
	int i, err;

	qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
	qpair->num_entries = num_entries;
	qpair->num_trackers = num_trackers;
	qpair->ctrlr = ctrlr;

	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
	mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);

	callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
	qpair->timer_armed = false;
	qpair->recovery_state = RECOVERY_WAITING;

	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    4, ctrlr->page_size, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
	    ctrlr->page_size, 0,
	    NULL, NULL, &qpair->dma_tag_payload);
	if (err != 0) {
		nvme_printf(ctrlr, "payload tag create failed %d\n", err);
		goto out;
	}

	/*
	 * Each component must be page aligned, and individual PRP lists
	 * cannot cross a page boundary.
	 */
	cmdsz = qpair->num_entries * sizeof(struct nvme_command);
	cmdsz = roundup2(cmdsz, ctrlr->page_size);
	cplsz = qpair->num_entries * sizeof(struct nvme_completion);
	cplsz = roundup2(cplsz, ctrlr->page_size);
	/*
	 * For commands requiring more than 2 PRP entries, one PRP will be
	 * embedded in the command (prp1), and the rest of the PRP entries
	 * will be in a list pointed to by the command (prp2).
	 */
	prpsz = sizeof(uint64_t) *
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size);
	prpmemsz = qpair->num_trackers * prpsz;
	allocsz = cmdsz + cplsz + prpmemsz;

	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
	    allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
	if (err != 0) {
		nvme_printf(ctrlr, "tag create failed %d\n", err);
		goto out;
	}
	bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);

	if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
	    BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
		nvme_printf(ctrlr, "failed to alloc qpair memory\n");
		goto out;
	}

	if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
	    queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
		nvme_printf(ctrlr, "failed to load qpair memory\n");
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		goto out;
	}

	qpair->num_cmds = 0;
	qpair->num_intr_handler_calls = 0;
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;
	qpair->cmd = (struct nvme_command *)queuemem;
	qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
	prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
	qpair->cmd_bus_addr = queuemem_phys;
	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
	prpmem_phys = queuemem_phys + cmdsz + cplsz;

	/*
	 * Calculate the stride of the doorbell register. Many emulators set
	 * this value to correspond to a cache line. However, some hardware has
	 * set it to various small values.
	 */
	qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1));
	qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);

	TAILQ_INIT(&qpair->free_tr);
	TAILQ_INIT(&qpair->outstanding_tr);
	STAILQ_INIT(&qpair->queued_req);

	list_phys = prpmem_phys;
	prp_list = prpmem;
	for (i = 0; i < qpair->num_trackers; i++) {
		if (list_phys + prpsz > prpmem_phys + prpmemsz) {
			qpair->num_trackers = i;
			break;
		}

		/*
		 * Make sure that the PRP list for this tracker doesn't
		 * overflow to another nvme page.
		 */
		if (trunc_page(list_phys) !=
		    trunc_page(list_phys + prpsz - 1)) {
			list_phys = roundup2(list_phys, ctrlr->page_size);
			prp_list =
			    (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
		}

		tr = malloc_domainset(sizeof(*tr), M_NVME,
		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
		bus_dmamap_create(qpair->dma_tag_payload, 0,
		    &tr->payload_dma_map);
		tr->cid = i;
		tr->qpair = qpair;
		tr->prp = (uint64_t *)prp_list;
		tr->prp_bus_addr = list_phys;
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
		list_phys += prpsz;
		prp_list += prpsz;
	}

	if (qpair->num_trackers == 0) {
		nvme_printf(ctrlr, "failed to allocate enough trackers\n");
		goto out;
	}

	qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
	    qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
	    M_ZERO | M_WAITOK);

	if (ctrlr->msi_count > 1) {
		/*
		 * MSI-X vector resource IDs start at 1, so we add one to
		 * the queue's vector to get the corresponding rid to use.
		 */
		qpair->rid = qpair->vector + 1;

		qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
		    &qpair->rid, RF_ACTIVE);
		if (qpair->res == NULL) {
			nvme_printf(ctrlr, "unable to allocate MSI\n");
			goto out;
		}
		if (bus_setup_intr(ctrlr->dev, qpair->res,
		    INTR_TYPE_MISC | INTR_MPSAFE, NULL,
		    nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
			nvme_printf(ctrlr, "unable to setup MSI\n");
			goto out;
		}
		if (qpair->id == 0) {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "admin");
		} else {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "io%d", qpair->id - 1);
		}
	}

	return (0);

out:
	nvme_qpair_destroy(qpair);
	return (ENOMEM);
}
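
/*
 * Tear down a qpair: stop the watchdog, tear down the interrupt handler,
 * free the trackers and the queue memory, and destroy the DMA tags, locks,
 * and interrupt resource.
 */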
static void
nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	mtx_lock(&qpair->recovery);
	qpair->timer_armed = false;
	mtx_unlock(&qpair->recovery);
	callout_drain(&qpair->timer);

	if (qpair->tag) {
		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
		qpair->tag = NULL;
	}

	if (qpair->act_tr) {
		free(qpair->act_tr, M_NVME);
		qpair->act_tr = NULL;
	}

	while (!TAILQ_EMPTY(&qpair->free_tr)) {
		tr = TAILQ_FIRST(&qpair->free_tr);
		TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
		bus_dmamap_destroy(qpair->dma_tag_payload,
		    tr->payload_dma_map);
		free(tr, M_NVME);
	}

	if (qpair->cmd != NULL) {
		bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		qpair->cmd = NULL;
	}

	if (qpair->dma_tag) {
		bus_dma_tag_destroy(qpair->dma_tag);
		qpair->dma_tag = NULL;
	}

	if (qpair->dma_tag_payload) {
		bus_dma_tag_destroy(qpair->dma_tag_payload);
		qpair->dma_tag_payload = NULL;
	}

	if (mtx_initialized(&qpair->lock))
		mtx_destroy(&qpair->lock);
	if (mtx_initialized(&qpair->recovery))
		mtx_destroy(&qpair->recovery);

	if (qpair->res) {
		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(qpair->res), qpair->res);
		qpair->res = NULL;
	}
}

static void
nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	/*
	 * nvme_qpair_complete_tracker must be called without the qpair lock
	 * held. It takes the lock to adjust the outstanding_tr list, so make
	 * sure we don't have it yet. We need the lock to make the list
	 * traverse safe, but have to drop the lock to complete any AER. We
	 * restart the list scan when we do this to make this safe. There's
	 * interlock with the ISR so we know this tracker won't be completed
	 * twice.
	 */
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	mtx_lock(&qpair->lock);
	tr = TAILQ_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
			tr = TAILQ_NEXT(tr, tailq);
			continue;
		}
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
		    ERROR_PRINT_NONE);
		mtx_lock(&qpair->lock);
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
	}
	mtx_unlock(&qpair->lock);
}

void
nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
{
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	nvme_admin_qpair_abort_aers(qpair);
	nvme_qpair_destroy(qpair);
}

void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{
	nvme_qpair_destroy(qpair);
}

static void
nvme_abort_complete(void *arg, const struct nvme_completion *status)
{
	struct nvme_tracker *tr = arg;

	/*
	 * If cdw0 bit 0 == 1, the controller was not able to abort the command
	 * we requested. We still need to check the active tracker array, to
	 * cover the race where the I/O timed out at the same time the
	 * controller was completing it. An abort command is always on the
	 * admin queue, but affects either an admin or an I/O queue, so take
	 * the appropriate qpair lock for the original command's queue, since
	 * we'll need it to avoid races with the completion code and to
	 * complete the command manually.
	 */
	mtx_lock(&tr->qpair->lock);
	if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
		/*
		 * An I/O has timed out, and the controller was unable to abort
		 * it for some reason. And we've not processed a completion for
		 * it yet. Construct a fake completion status, and then complete
		 * the I/O's tracker manually.
		 */
		nvme_printf(tr->qpair->ctrlr,
		    "abort command failed, aborting command manually\n");
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
	}
	/*
	 * XXX We don't check status for the possible 'Could not abort because
	 * excess aborts were submitted to the controller'. We don't prevent
	 * that, either. Document for the future here, since the standard is
	 * squishy and only says 'may generate' but implies anything is possible
	 * including hangs if you exceed the ACL.
	 */
	mtx_unlock(&tr->qpair->lock);
}

static void
nvme_qpair_timeout(void *arg)
{
	struct nvme_qpair *qpair = arg;
	struct nvme_controller *ctrlr = qpair->ctrlr;
	struct nvme_tracker *tr;
	sbintime_t now;
	bool idle = true;
	bool is_admin = qpair == &ctrlr->adminq;
	bool fast;
	uint32_t csts;
	uint8_t cfs;

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * If the controller is failed, then stop polling. This ensures that any
	 * failure processing that races with the qpair timeout will fail
	 * safely.
	 */
	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
		nvme_printf(qpair->ctrlr,
		    "%sFailed controller, stopping watchdog timeout.\n",
		    is_admin ? "Complete " : "");
		qpair->timer_armed = false;
		return;
	}

	/*
	 * Shutdown condition: We set qpair->timer_armed to false in
	 * nvme_qpair_destroy before calling callout_drain. When we call that,
	 * this routine might get called one last time. Exit w/o setting a
	 * timeout. None of the watchdog stuff needs to be done since we're
	 * destroying the qpair.
	 */
	if (!qpair->timer_armed) {
		nvme_printf(qpair->ctrlr,
		    "Timeout fired during nvme_qpair_destroy\n");
		return;
	}

	switch (qpair->recovery_state) {
	case RECOVERY_NONE:
		/*
		 * Read csts to get the value of cfs - controller fatal status.
		 * If we are in the hot-plug or controller-failed state, proceed
		 * directly to reset. We also bail early if the status reads all
		 * 1's or the controller fatal status bit is now 1. The latter
		 * is always true when the former is true, but not vice versa.
		 * The intent of the code is that if the card is gone (all 1's)
		 * or we've failed, then try to do a reset (which sometimes
		 * unwedges a card reading all 1's that's not gone away, but
		 * usually doesn't).
		 */
		csts = nvme_mmio_read_4(ctrlr, csts);
		cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
		if (csts == NVME_GONE || cfs == 1) {
			/*
			 * We've had a command timeout that we weren't able to
			 * abort or we have aborts disabled and any command
			 * timed out.
			 *
			 * If we get here due to a possible surprise hot-unplug
			 * event, then we let nvme_ctrlr_reset confirm and fail
			 * the controller.
			 */
		do_reset:
			nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
			    (csts == 0xffffffff) ? " and possible hot unplug" :
			    (cfs ? " and fatal error status" : ""));
			qpair->recovery_state = RECOVERY_WAITING;
			nvme_ctrlr_reset(ctrlr);
			idle = false;
			break;
		}

		/*
		 * See if there's any recovery needed. First, do a fast check to
		 * see if anything could have timed out. If not, then skip
		 * everything else.
		 */
		fast = false;
		mtx_lock(&qpair->lock);
		now = getsbinuptime();
		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
			/*
			 * Skip async commands, they are posted to the card for
			 * an indefinite amount of time and have no deadline.
			 */
			if (tr->deadline == SBT_MAX)
				continue;

			/*
			 * If the first real transaction is not in timeout, then
			 * we're done. Otherwise, we try recovery.
			 */
			idle = false;
			if (now <= tr->deadline)
				fast = true;
			break;
		}
		mtx_unlock(&qpair->lock);
		if (idle || fast)
			break;

		/*
		 * There's a stale transaction at the start of the queue whose
		 * deadline has passed. Poll the completions as a last-ditch
		 * effort in case an interrupt has been missed. If transactions
		 * were found, warn the user about possible interrupt issues,
		 * but just once per controller.
		 */
		if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
			nvme_printf(ctrlr, "System interrupt issues?\n");
			ctrlr->isr_warned = true;
		}

		/*
		 * Now that we've run the ISR, re-check to see if there are any
		 * timed-out commands and abort them or reset the card if so.
		 */
		mtx_lock(&qpair->lock);
		idle = true;
		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
			/*
			 * Skip async commands, they are posted to the card for
			 * an indefinite amount of time and have no deadline.
			 */
			if (tr->deadline == SBT_MAX)
				continue;

			/*
			 * If we know this tracker hasn't timed out, we also
			 * know all subsequent ones haven't timed out. The tr
			 * queue is in submission order and all normal commands
			 * in a queue have the same timeout (or the timeout was
			 * changed by the user, but we eventually timeout then).
			 */
			idle = false;
			if (now <= tr->deadline)
				break;

			/*
			 * Timeout expired, abort it or reset controller.
			 */
			if (ctrlr->enable_aborts &&
			    tr->req->cb_fn != nvme_abort_complete) {
				/*
				 * This isn't an abort command, ask for a
				 * hardware abort. This goes to the admin
				 * queue which will reset the card if it
				 * times out.
				 */
				nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
				    nvme_abort_complete, tr);
			} else {
				/*
				 * We have a live command in the card (either
				 * one we couldn't abort, or aborts weren't
				 * enabled). We can only reset.
				 */
				mtx_unlock(&qpair->lock);
				goto do_reset;
			}
		}
		mtx_unlock(&qpair->lock);
		break;

	case RECOVERY_WAITING:
		/*
		 * These messages aren't interesting while we're suspended. We
		 * put the queues into waiting state while
		 * suspending. Suspending takes a while, so we'll see these
		 * during that time and they aren't diagnostic. At other times,
		 * they indicate a problem that's worth complaining about.
		 */
		if (!device_is_suspended(ctrlr->dev))
			nvme_printf(ctrlr, "Waiting for reset to complete\n");
		idle = false;		/* We want to keep polling */
		break;
	}

	/*
	 * Rearm the timeout.
	 */
	if (!idle) {
		callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
	} else {
		qpair->timer_armed = false;
	}
}

/*
 * Submit the tracker to the hardware. Must already be in the
 * outstanding queue when called.
 */
void
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_controller *ctrlr;
	int timeout;

	mtx_assert(&qpair->lock, MA_OWNED);

	req = tr->req;
	req->cmd.cid = tr->cid;
	qpair->act_tr[tr->cid] = tr;
	ctrlr = qpair->ctrlr;

	if (req->timeout) {
		if (req->cb_fn == nvme_completion_poll_cb)
			timeout = 1;
		else if (qpair->id == 0)
			timeout = ctrlr->admin_timeout_period;
		else
			timeout = ctrlr->timeout_period;
		tr->deadline = getsbinuptime() + timeout * SBT_1S;
		if (!qpair->timer_armed) {
			qpair->timer_armed = true;
			callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
			    nvme_qpair_timeout, qpair, qpair->cpu, 0);
		}
	} else
		tr->deadline = SBT_MAX;

	/* Copy the command from the tracker to the submission queue. */
	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));

	if (++qpair->sq_tail == qpair->num_entries)
		qpair->sq_tail = 0;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
	    qpair->sq_tdbl_off, qpair->sq_tail);
	qpair->num_cmds++;
}
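
/*
 * bus_dma callback: translate the DMA segments of a request's payload into
 * the command's PRP entries (prp1, prp2, and the tracker's PRP list for
 * transfers spanning more than two segments), then submit the tracker.
 */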
static void
nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
{
	struct nvme_tracker *tr = arg;
	uint32_t cur_nseg;

	/*
	 * If the mapping operation failed, return immediately. The caller
	 * is responsible for detecting the error status and failing the
	 * tracker manually.
	 */
	if (error != 0) {
		nvme_printf(tr->qpair->ctrlr,
		    "nvme_payload_map err %d\n", error);
		return;
	}

	/*
	 * Note that we specified ctrlr->page_size for alignment and max
	 * segment size when creating the bus dma tags. So here we can safely
	 * just transfer each segment to its associated PRP entry.
	 */
	tr->req->cmd.prp1 = htole64(seg[0].ds_addr);

	if (nseg == 2) {
		tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
	} else if (nseg > 2) {
		cur_nseg = 1;
		tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
		while (cur_nseg < nseg) {
			tr->prp[cur_nseg-1] =
			    htole64((uint64_t)seg[cur_nseg].ds_addr);
			cur_nseg++;
		}
	} else {
		/*
		 * prp2 should not be used by the controller
		 * since there is only one segment, but set
		 * to 0 just to be safe.
		 */
		tr->req->cmd.prp2 = 0;
	}

	bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	nvme_qpair_submit_tracker(tr->qpair, tr);
}

static void
_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int err = 0;
	bool is_admin = qpair == &qpair->ctrlr->adminq;

	mtx_assert(&qpair->lock, MA_OWNED);

	tr = TAILQ_FIRST(&qpair->free_tr);
	req->qpair = qpair;

	/*
	 * The controller has failed, so fail the request. Note that this races
	 * the recovery / timeout code. Since we hold the qpair lock, we know
	 * it's safe to fail directly. is_failed is set when we fail the
	 * controller. It is only ever reset in the ioctl reset controller
	 * path, which is safe to race (for failed controllers, we make no
	 * guarantees about bringing it out of failed state relative to other
	 * commands). We try hard to allow admin commands when the entire
	 * controller hasn't failed, only something related to I/O queues.
	 */
	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
		nvme_qpair_manual_complete_request(qpair, req,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
		    ERROR_PRINT_NONE);
		return;
	}

	/*
	 * No tracker is available, or the qpair is disabled due to an
	 * in-progress controller-level reset. If we lose the race with
	 * recovery_state, then we may add an extra request to the queue which
	 * will be resubmitted later. We only set recovery_state to NONE with
	 * qpair->lock also held, so if we observe that the state is not NONE,
	 * we know it won't transition back to NONE without retrying queued
	 * requests.
	 */
	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		return;
	}

	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
	tr->deadline = SBT_MAX;
	tr->req = req;

	if (!req->payload_valid) {
		nvme_qpair_submit_tracker(tr->qpair, tr);
		return;
	}

	/*
	 * tr->deadline is updated when nvme_payload_map calls
	 * nvme_qpair_submit_tracker (we call it directly above
	 * when there's no map to load).
	 */
	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
	if (err != 0) {
		/*
		 * The dmamap operation failed, so we manually fail the
		 * tracker here with DATA_TRANSFER_ERROR status.
		 *
		 * nvme_qpair_manual_complete_tracker must not be called
		 * with the qpair lock held.
		 */
		nvme_printf(qpair->ctrlr,
		    "bus_dmamap_load_mem returned 0x%x!\n", err);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}
}
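
/*
 * Public submission entry point: take the qpair lock and hand the request to
 * the locked submission path above.
 */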
void
nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	mtx_lock(&qpair->lock);
	_nvme_qpair_submit_request(qpair, req);
	mtx_unlock(&qpair->lock);
}

static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
	bool is_admin __unused = qpair == &qpair->ctrlr->adminq;

	if (mtx_initialized(&qpair->recovery))
		mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
		mtx_assert(&qpair->lock, MA_OWNED);
	KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
	    ("Enabling a failed qpair\n"));

	qpair->recovery_state = RECOVERY_NONE;
}

void
nvme_qpair_reset(struct nvme_qpair *qpair)
{
	qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set the phase
	 * bit on completions to 1. So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue rolls over.
	 */
	qpair->phase = 1;

	memset(qpair->cmd, 0,
	    qpair->num_entries * sizeof(struct nvme_command));
	memset(qpair->cpl, 0,
	    qpair->num_entries * sizeof(struct nvme_completion));
}

void
nvme_admin_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_tracker *tr_temp;
	bool rpt;

	/*
	 * Manually abort each outstanding admin command. Do not retry
	 * admin commands found here, since they will be left over from
	 * a controller reset and it's likely the context in which the
	 * command was issued no longer applies.
	 */
	rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "aborting outstanding admin command\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
	}
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "done aborting outstanding admin\n");

	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);
	nvme_qpair_enable(qpair);
	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}
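
/*
 * Re-enable an I/O qpair after a reset: manually complete any outstanding
 * trackers (allowing retries), mark the qpair usable again, and resubmit
 * requests that were queued while the qpair was disabled.
 */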
void
nvme_io_qpair_enable(struct nvme_qpair *qpair)
{
	STAILQ_HEAD(, nvme_request) temp;
	struct nvme_tracker *tr;
	struct nvme_tracker *tr_temp;
	struct nvme_request *req;
	bool report;

	/*
	 * Manually abort each outstanding I/O. This normally results in a
	 * retry, unless the retry count on the associated request has
	 * reached its limit.
	 */
	report = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (report)
		nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");

	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);
	nvme_qpair_enable(qpair);

	STAILQ_INIT(&temp);
	STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);

	report = !STAILQ_EMPTY(&temp);
	if (report)
		nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
	while (!STAILQ_EMPTY(&temp)) {
		req = STAILQ_FIRST(&temp);
		STAILQ_REMOVE_HEAD(&temp, stailq);
		nvme_qpair_print_command(qpair, &req->cmd);
		_nvme_qpair_submit_request(qpair, req);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");

	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

static void
nvme_qpair_disable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *tr_temp;

	if (mtx_initialized(&qpair->recovery))
		mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
		mtx_assert(&qpair->lock, MA_OWNED);

	qpair->recovery_state = RECOVERY_WAITING;
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		tr->deadline = SBT_MAX;
	}
}

void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);

	mtx_lock(&qpair->lock);
	nvme_qpair_disable(qpair);
	mtx_unlock(&qpair->lock);

	nvme_admin_qpair_abort_aers(qpair);

	mtx_unlock(&qpair->recovery);
}

void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);

	nvme_qpair_disable(qpair);

	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

void
nvme_qpair_fail(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_request *req;

	if (!mtx_initialized(&qpair->lock))
		return;

	mtx_lock(&qpair->lock);

	if (!STAILQ_EMPTY(&qpair->queued_req)) {
		nvme_printf(qpair->ctrlr, "failing queued i/o\n");
	}
	while (!STAILQ_EMPTY(&qpair->queued_req)) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}

	if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
		nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
	}
	/* Manually abort each outstanding I/O. */
	while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
		/*
		 * Do not remove the tracker. The abort_tracker path will
		 * do that for us.
		 */
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}

	mtx_unlock(&qpair->lock);
}