/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2012-2014 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/proc.h>

#include <dev/pci/pcivar.h>

#include "nvme_private.h"

typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
#define DO_NOT_RETRY	1

static void	_nvme_qpair_submit_request(struct nvme_qpair *qpair,
    struct nvme_request *req);
static void	nvme_qpair_destroy(struct nvme_qpair *qpair);

#define DEFAULT_INDEX		256
#define DEFAULT_ENTRY(x)	[DEFAULT_INDEX] = x
#define OPC_ENTRY(x)		[NVME_OPC_ ## x] = #x

static const char *admin_opcode[DEFAULT_INDEX + 1] = {
	OPC_ENTRY(DELETE_IO_SQ),
	OPC_ENTRY(CREATE_IO_SQ),
	OPC_ENTRY(GET_LOG_PAGE),
	OPC_ENTRY(DELETE_IO_CQ),
	OPC_ENTRY(CREATE_IO_CQ),
	OPC_ENTRY(IDENTIFY),
	OPC_ENTRY(ABORT),
	OPC_ENTRY(SET_FEATURES),
	OPC_ENTRY(GET_FEATURES),
	OPC_ENTRY(ASYNC_EVENT_REQUEST),
	OPC_ENTRY(NAMESPACE_MANAGEMENT),
	OPC_ENTRY(FIRMWARE_ACTIVATE),
	OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
	OPC_ENTRY(DEVICE_SELF_TEST),
	OPC_ENTRY(NAMESPACE_ATTACHMENT),
	OPC_ENTRY(KEEP_ALIVE),
	OPC_ENTRY(DIRECTIVE_SEND),
	OPC_ENTRY(DIRECTIVE_RECEIVE),
	OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
	OPC_ENTRY(NVME_MI_SEND),
	OPC_ENTRY(NVME_MI_RECEIVE),
	OPC_ENTRY(CAPACITY_MANAGEMENT),
	OPC_ENTRY(LOCKDOWN),
	OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
	OPC_ENTRY(FABRICS_COMMANDS),
	OPC_ENTRY(FORMAT_NVM),
	OPC_ENTRY(SECURITY_SEND),
	OPC_ENTRY(SECURITY_RECEIVE),
	OPC_ENTRY(SANITIZE),
	OPC_ENTRY(GET_LBA_STATUS),
	DEFAULT_ENTRY("ADMIN COMMAND"),
};

static const char *io_opcode[DEFAULT_INDEX + 1] = {
	OPC_ENTRY(FLUSH),
	OPC_ENTRY(WRITE),
	OPC_ENTRY(READ),
	OPC_ENTRY(WRITE_UNCORRECTABLE),
	OPC_ENTRY(COMPARE),
	OPC_ENTRY(WRITE_ZEROES),
	OPC_ENTRY(DATASET_MANAGEMENT),
	OPC_ENTRY(VERIFY),
	OPC_ENTRY(RESERVATION_REGISTER),
	OPC_ENTRY(RESERVATION_REPORT),
	OPC_ENTRY(RESERVATION_ACQUIRE),
	OPC_ENTRY(RESERVATION_RELEASE),
	OPC_ENTRY(COPY),
	DEFAULT_ENTRY("IO COMMAND"),
};

/*
 * Look up the opcode name, falling back to the generic DEFAULT_INDEX entry
 * for opcodes that are out of range or have no name registered.
 */
static const char *
get_opcode_string(const char *op[DEFAULT_INDEX + 1], uint16_t opc)
{
	const char *nm = opc < DEFAULT_INDEX ? op[opc] : op[DEFAULT_INDEX];

	return (nm != NULL ? nm : op[DEFAULT_INDEX]);
}

static const char *
get_admin_opcode_string(uint16_t opc)
{
	return (get_opcode_string(admin_opcode, opc));
}

static const char *
get_io_opcode_string(uint16_t opc)
{
	return (get_opcode_string(io_opcode, opc));
}

static void
nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{

	nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x "
	    "cdw10:%08x cdw11:%08x\n",
	    get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid,
	    le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11));
}

static void
nvme_io_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{

	switch (cmd->opc) {
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_VERIFY:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
		    "lba:%llu len:%d\n",
		    get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid),
		    ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
		    (le32toh(cmd->cdw12) & 0xFFFF) + 1);
		break;
	case NVME_OPC_FLUSH:
	case NVME_OPC_DATASET_MANAGEMENT:
	case NVME_OPC_RESERVATION_REGISTER:
	case NVME_OPC_RESERVATION_REPORT:
	case NVME_OPC_RESERVATION_ACQUIRE:
	case NVME_OPC_RESERVATION_RELEASE:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
		    get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid));
		break;
	default:
		nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n",
		    get_io_opcode_string(cmd->opc), cmd->opc, qpair->id,
		    cmd->cid, le32toh(cmd->nsid));
		break;
	}
}

void
nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
{
	if (qpair->id == 0)
		nvme_admin_qpair_print_command(qpair, cmd);
	else
		nvme_io_qpair_print_command(qpair, cmd);
	if (nvme_verbose_cmd_dump) {
		nvme_printf(qpair->ctrlr,
		    "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
		    cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
		    (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
		nvme_printf(qpair->ctrlr,
		    "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
		    cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
		    cmd->cdw15);
	}
}

struct nvme_status_string {
	uint16_t	sc;
	const char *	str;
};

static struct nvme_status_string generic_status[] = {
	{ NVME_SC_SUCCESS, "SUCCESS" },
	{ NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
	{ NVME_SC_INVALID_FIELD, "INVALID FIELD" },
	{ NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
	{ NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
	{ NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
	{ NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
	{ NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
	{ NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
	{ NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
	{ NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
	{ NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
	{ NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
	{ NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" },
	{ NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" },
	{ NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
	{ NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
	{ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
	{ NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" },
	{ NVME_SC_PRP_OFFET_INVALID, "PRP OFFSET INVALID" },
	{ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
	{ NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
	{ NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" },
	{ NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" },
	{ NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" },
	{ NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" },
	{ NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" },
	{ NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
	{ NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
	{ NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL DATA BLOCK GRANULARITY INVALID" },
	{ NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
	{ NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" },
	{ NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" },
	{ NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" },

	{ NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
	{ NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
	{ NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
	{ NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
	{ NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
	{ 0xFFFF, "GENERIC" }
};

static struct nvme_status_string command_specific_status[] = {
	{ NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
	{ NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
	{ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" },
	{ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
	{ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
	{ NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
	{ NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
	{ NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
	{ NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
	{ NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
	{ NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" },
	{ NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
	{ NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" },
	{ NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
	{ NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
	{ NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" },
	{ NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" },
	{ NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" },
	{ NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
	{ NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
	{ NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
	{ NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" },
	{ NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
	{ NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
	{ NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" },
	{ NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" },
	{ NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" },
	{ NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
	{ NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" },
	{ NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" },
	{ NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
	{ NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" },
	{ NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
	{ NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" },
	{ NVME_SC_ANA_GROUP_ID_INVALID, "ANA GROUP IDENTIFIER INVALID" },
	{ NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" },

	{ NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
	{ NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
	{ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" },
	{ 0xFFFF, "COMMAND SPECIFIC" }
};

static struct nvme_status_string media_error_status[] = {
	{ NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
	{ NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
	{ NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
	{ NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
	{ NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" },
	{ 0xFFFF, "MEDIA ERROR" }
};

static struct nvme_status_string path_related_status[] = {
	{ NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
	{ NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" },
	{ NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" },
	{ NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" },
	{ NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" },
	{ NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" },
	{ NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" },
	{ 0xFFFF, "PATH RELATED" },
};

static const char *
get_status_string(uint16_t sct, uint16_t sc)
{
	struct nvme_status_string *entry;

	switch (sct) {
	case NVME_SCT_GENERIC:
		entry = generic_status;
		break;
	case NVME_SCT_COMMAND_SPECIFIC:
		entry = command_specific_status;
		break;
	case NVME_SCT_MEDIA_ERROR:
		entry = media_error_status;
		break;
	case NVME_SCT_PATH_RELATED:
		entry = path_related_status;
		break;
	case NVME_SCT_VENDOR_SPECIFIC:
		return ("VENDOR SPECIFIC");
	default:
		return ("RESERVED");
	}

	while (entry->sc != 0xFFFF) {
		if (entry->sc == sc)
			return (entry->str);
		entry++;
	}
	return (entry->str);
}

void
nvme_qpair_print_completion(struct nvme_qpair *qpair,
    struct nvme_completion *cpl)
{
	uint8_t sct, sc, crd, m, dnr, p;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	crd = NVME_STATUS_GET_CRD(cpl->status);
	m = NVME_STATUS_GET_M(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);
	p = NVME_STATUS_GET_P(cpl->status);

	nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x p:%d "
	    "sqid:%d cid:%d cdw0:%x\n",
	    get_status_string(sct, sc), sct, sc, crd, m, dnr, p,
	    cpl->sqid, cpl->cid, cpl->cdw0);
}

static bool
nvme_completion_is_retry(const struct nvme_completion *cpl)
{
	uint8_t sct, sc, dnr;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);	/* Do Not Retry Bit */

	/*
	 * TODO: spec is not clear how commands that are aborted due
	 * to TLER will be marked.  So for now, it seems
	 * NAMESPACE_NOT_READY is the only case where we should
	 * look at the DNR bit.  Requests failed with ABORTED_BY_REQUEST
	 * set the DNR bit correctly since the driver controls that.
	 */
	switch (sct) {
	case NVME_SCT_GENERIC:
		switch (sc) {
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_NAMESPACE_NOT_READY:
			if (dnr)
				return (0);
			else
				return (1);
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return (0);
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
		return (0);
	case NVME_SCT_PATH_RELATED:
		switch (sc) {
		case NVME_SC_INTERNAL_PATH_ERROR:
			if (dnr)
				return (0);
			else
				return (1);
		default:
			return (0);
		}
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return (0);
	}
}

static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
    struct nvme_completion *cpl, error_print_t print_on_error)
{
	struct nvme_qpair * qpair = tr->qpair;
	struct nvme_request	*req;
	bool	retry, error, retriable;

	req = tr->req;
	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	retry = error && retriable && req->retries < nvme_retry_count;
	if (retry)
		qpair->num_retries++;
	if (error && req->retries >= nvme_retry_count && retriable)
		qpair->num_failures++;

	if (error && (print_on_error == ERROR_PRINT_ALL ||
	    (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	qpair->act_tr[cpl->cid] = NULL;

	KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));

	if (!retry) {
		if (req->payload_valid) {
			bus_dmamap_sync(qpair->dma_tag_payload,
			    tr->payload_dma_map,
			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		}
		if (req->cb_fn)
			req->cb_fn(req->cb_arg, cpl);
	}

	mtx_lock(&qpair->lock);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
	} else {
		if (req->payload_valid) {
			bus_dmamap_unload(qpair->dma_tag_payload,
			    tr->payload_dma_map);
		}

		nvme_free_request(req);
		tr->req = NULL;

		TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			_nvme_qpair_submit_request(qpair, req);
		}
	}

	mtx_unlock(&qpair->lock);
}

static void
nvme_qpair_manual_complete_tracker(
    struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion	cpl;

	memset(&cpl, 0, sizeof(cpl));

	struct nvme_qpair * qpair = tr->qpair;

	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT;
	cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
	cpl.status |= (dnr & NVME_STATUS_DNR_MASK) << NVME_STATUS_DNR_SHIFT;
	/* M=0 : this is artificial so no data in error log page */
	/* CRD=0 : this is artificial and no delayed retry support anyway */
	/* P=0 : phase not checked */
	nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
}

void
nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
    struct nvme_request *req, uint32_t sct, uint32_t sc)
{
	struct nvme_completion	cpl;
	bool			error;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT;
	cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;

	error = nvme_completion_is_error(&cpl);

	if (error) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, &cpl);
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, &cpl);

	nvme_free_request(req);
}

bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;
	struct nvme_completion	cpl;
	int done = 0;
	bool in_panic = dumping || SCHEDULER_STOPPED();

	/*
	 * qpair is not enabled, likely because a controller reset is in
	 * progress.  Ignore the interrupt - any I/O that was associated with
	 * this interrupt will get retried when the reset is complete.  Any
	 * pending completions for when we're in startup will be completed
	 * as soon as initialization is complete and we start sending commands
	 * to the device.
	 */
	if (qpair->recovery_state != RECOVERY_NONE) {
		qpair->num_ignored++;
		return (false);
	}

	/*
	 * Sanity check initialization.  After we reset the hardware, the phase
	 * is defined to be 1.  So if we get here with zero prior calls and the
	 * phase is 0, it means that we've lost a race between the
	 * initialization and the ISR running.  With the phase wrong, we'll
	 * process a bunch of completions that aren't really completions,
	 * leading to a KASSERT below.
	 */
	KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
	    ("%s: Phase wrong for first interrupt call.",
	    device_get_nameunit(qpair->ctrlr->dev)));

	qpair->num_intr_handler_calls++;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	/*
	 * A panic can stop the CPU this routine is running on at any point.  If
	 * we're called during a panic, complete the cq_head wrap protocol for
	 * the case where we are interrupted just after the increment at 1
	 * below, but before we can reset cq_head to zero at 2.
	 * Also cope with the case where we do the zero at 2, but may or may
	 * not have done the phase adjustment at step 3.  The panic machinery
	 * flushes all pending memory writes, so we can make these strong
	 * ordering assumptions that would otherwise be unwise if we were
	 * racing in real time.
	 */
	if (__predict_false(in_panic)) {
		if (qpair->cq_head == qpair->num_entries) {
			/*
			 * Here we know that we need to zero cq_head and then
			 * negate the phase, which hasn't been assigned if
			 * cq_head isn't zero due to the atomic_store_rel.
			 */
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		} else if (qpair->cq_head == 0) {
			/*
			 * In this case, we know that the assignment at 2
			 * happened below, but we don't know if 3 happened or
			 * not.  To recover, we look at the last completion
			 * entry and set the phase to the opposite of the
			 * phase that it has.  This gets us back in sync.
			 */
			cpl = qpair->cpl[qpair->num_entries - 1];
			nvme_completion_swapbytes(&cpl);
			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
		}
	}

	while (1) {
		uint16_t status;

		/*
		 * We need to do this dance to avoid a race between the host and
		 * the device where the device overtakes the host while the host
		 * is reading this record, leaving the status field 'new' and
		 * the sqhd and cid fields potentially stale.  If the phase
		 * doesn't match, that means status hasn't yet been updated and
		 * we'll get any pending changes next time.  It also means that
		 * the phase must be the same the second time.  We have to sync
		 * before reading to ensure any bouncing completes.
		 */
		status = le16toh(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase)
			break;

		bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		KASSERT(
		    NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
		    ("Phase unexpectedly inconsistent"));

		if (cpl.cid < qpair->num_trackers)
			tr = qpair->act_tr[cpl.cid];
		else
			tr = NULL;

		done++;
		if (tr != NULL) {
			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
			qpair->sq_head = cpl.sqhd;
		} else if (!in_panic) {
			/*
			 * A missing tracker is normally an error.  However, a
			 * panic can stop the CPU this routine is running on
			 * after completing an I/O but before updating
			 * qpair->cq_head at 1 below.  Later, we re-enter this
			 * routine to poll I/O associated with the kernel
			 * dump.  We find that the tr has been set to null before
			 * calling the completion routine.  If it hasn't
			 * completed (or it triggers a panic), then '1' below
			 * won't have updated cq_head.  Rather than panic again,
			 * ignore this condition because it's not unexpected.
			 */
			nvme_printf(qpair->ctrlr,
			    "cpl (cid = %u) does not map to outstanding cmd\n",
			    cpl.cid);
			nvme_qpair_print_completion(qpair,
			    &qpair->cpl[qpair->cq_head]);
			KASSERT(0, ("received completion for unknown cmd"));
		}

		/*
		 * There are a number of races with the following (see above)
		 * when the system panics.  We compensate for each one of them
		 * by using the atomic store to force strong ordering (at least
		 * when viewed in the aftermath of a panic).
		 */
		if (++qpair->cq_head == qpair->num_entries) {	/* 1 */
			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
			qpair->phase = !qpair->phase;		/* 3 */
		}
	}

	if (done != 0) {
		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
		    qpair->cq_hdbl_off, qpair->cq_head);
	}

	return (done != 0);
}

static void
nvme_qpair_msi_handler(void *arg)
{
	struct nvme_qpair *qpair = arg;

	nvme_qpair_process_completions(qpair);
}

int
nvme_qpair_construct(struct nvme_qpair *qpair,
    uint32_t num_entries, uint32_t num_trackers,
    struct nvme_controller *ctrlr)
{
	struct nvme_tracker	*tr;
	size_t			cmdsz, cplsz, prpsz, allocsz, prpmemsz;
	uint64_t		queuemem_phys, prpmem_phys, list_phys;
	uint8_t			*queuemem, *prpmem, *prp_list;
	int			i, err;

	qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
	qpair->num_entries = num_entries;
	qpair->num_trackers = num_trackers;
	qpair->ctrlr = ctrlr;

	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);

	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    4, ctrlr->page_size, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
	    ctrlr->page_size, 0,
	    NULL, NULL, &qpair->dma_tag_payload);
	if (err != 0) {
		nvme_printf(ctrlr, "payload tag create failed %d\n", err);
		goto out;
	}

	/*
	 * Each component must be page aligned, and individual PRP lists
	 * cannot cross a page boundary.
	 */
	cmdsz = qpair->num_entries * sizeof(struct nvme_command);
	cmdsz = roundup2(cmdsz, ctrlr->page_size);
	cplsz = qpair->num_entries * sizeof(struct nvme_completion);
	cplsz = roundup2(cplsz, ctrlr->page_size);
	/*
	 * For commands requiring more than 2 PRP entries, one PRP will be
	 * embedded in the command (prp1), and the rest of the PRP entries
	 * will be in a list pointed to by the command (prp2).
	 */
	prpsz = sizeof(uint64_t) *
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size);
	prpmemsz = qpair->num_trackers * prpsz;
	allocsz = cmdsz + cplsz + prpmemsz;

	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
	    allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
	if (err != 0) {
		nvme_printf(ctrlr, "tag create failed %d\n", err);
		goto out;
	}
	bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);

	if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
	    BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
		nvme_printf(ctrlr, "failed to alloc qpair memory\n");
		goto out;
	}

	if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
	    queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
		nvme_printf(ctrlr, "failed to load qpair memory\n");
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		goto out;
	}

	qpair->num_cmds = 0;
	qpair->num_intr_handler_calls = 0;
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;
	qpair->cmd = (struct nvme_command *)queuemem;
	qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
	prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
	qpair->cmd_bus_addr = queuemem_phys;
	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
	prpmem_phys = queuemem_phys + cmdsz + cplsz;

	callout_init(&qpair->timer, 1);
	qpair->timer_armed = false;
	qpair->recovery_state = RECOVERY_WAITING;

	/*
	 * Calculate the stride of the doorbell register.  Many emulators set
	 * this value to correspond to a cache line.  However, some hardware
	 * has set it to various small values.
	 */
	qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1));
	qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);

	TAILQ_INIT(&qpair->free_tr);
	TAILQ_INIT(&qpair->outstanding_tr);
	STAILQ_INIT(&qpair->queued_req);

	list_phys = prpmem_phys;
	prp_list = prpmem;
	for (i = 0; i < qpair->num_trackers; i++) {
		if (list_phys + prpsz > prpmem_phys + prpmemsz) {
			qpair->num_trackers = i;
			break;
		}

		/*
		 * Make sure that the PRP list for this tracker doesn't
		 * overflow to another nvme page.
		 */
		if (trunc_page(list_phys) !=
		    trunc_page(list_phys + prpsz - 1)) {
			list_phys = roundup2(list_phys, ctrlr->page_size);
			prp_list =
			    (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
		}

		tr = malloc_domainset(sizeof(*tr), M_NVME,
		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
		bus_dmamap_create(qpair->dma_tag_payload, 0,
		    &tr->payload_dma_map);
		tr->cid = i;
		tr->qpair = qpair;
		tr->prp = (uint64_t *)prp_list;
		tr->prp_bus_addr = list_phys;
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
		list_phys += prpsz;
		prp_list += prpsz;
	}

	if (qpair->num_trackers == 0) {
		nvme_printf(ctrlr, "failed to allocate enough trackers\n");
		goto out;
	}

	qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
	    qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
	    M_ZERO | M_WAITOK);

	if (ctrlr->msi_count > 1) {
		/*
		 * MSI-X vector resource IDs start at 1, so we add one to
		 * the queue's vector to get the corresponding rid to use.
		 */
		qpair->rid = qpair->vector + 1;

		qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
		    &qpair->rid, RF_ACTIVE);
		if (qpair->res == NULL) {
			nvme_printf(ctrlr, "unable to allocate MSI\n");
			goto out;
		}
		if (bus_setup_intr(ctrlr->dev, qpair->res,
		    INTR_TYPE_MISC | INTR_MPSAFE, NULL,
		    nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
			nvme_printf(ctrlr, "unable to setup MSI\n");
			goto out;
		}
		if (qpair->id == 0) {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "admin");
		} else {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "io%d", qpair->id - 1);
		}
	}

	return (0);

out:
	nvme_qpair_destroy(qpair);
	return (ENOMEM);
}

static void
nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;

	callout_drain(&qpair->timer);

	if (qpair->tag) {
		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
		qpair->tag = NULL;
	}

	if (qpair->act_tr) {
		free(qpair->act_tr, M_NVME);
		qpair->act_tr = NULL;
	}

	while (!TAILQ_EMPTY(&qpair->free_tr)) {
		tr = TAILQ_FIRST(&qpair->free_tr);
		TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
		bus_dmamap_destroy(qpair->dma_tag_payload,
		    tr->payload_dma_map);
		free(tr, M_NVME);
	}

	if (qpair->cmd != NULL) {
		bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		qpair->cmd = NULL;
	}

	if (qpair->dma_tag) {
		bus_dma_tag_destroy(qpair->dma_tag);
		qpair->dma_tag = NULL;
	}

	if (qpair->dma_tag_payload) {
		bus_dma_tag_destroy(qpair->dma_tag_payload);
		qpair->dma_tag_payload = NULL;
	}

	if (mtx_initialized(&qpair->lock))
		mtx_destroy(&qpair->lock);

	if (qpair->res) {
		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(qpair->res), qpair->res);
		qpair->res = NULL;
	}
}

static void
nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;

	tr = TAILQ_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
			nvme_qpair_manual_complete_tracker(tr,
			    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
			    ERROR_PRINT_NONE);
			tr = TAILQ_FIRST(&qpair->outstanding_tr);
		} else {
			tr = TAILQ_NEXT(tr, tailq);
		}
	}
}

void
nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
{

	nvme_admin_qpair_abort_aers(qpair);
	nvme_qpair_destroy(qpair);
}

void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{

	nvme_qpair_destroy(qpair);
}

static void
nvme_qpair_timeout(void *arg)
{
	struct nvme_qpair	*qpair = arg;
	struct nvme_controller	*ctrlr = qpair->ctrlr;
	struct nvme_tracker	*tr;
	sbintime_t		now;
	bool			idle;
	uint32_t		csts;
	uint8_t			cfs;

	mtx_lock(&qpair->lock);
	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
again:
	switch (qpair->recovery_state) {
	case RECOVERY_NONE:
		if (idle)
			break;
		now = getsbinuptime();
		idle = true;
		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
			if (tr->deadline == SBT_MAX)
				continue;
			idle = false;
			if (now > tr->deadline) {
				/*
				 * We're now past our earliest deadline.  We
				 * need to do expensive things to cope, but next
				 * time.  Flag that and close the door to any
				 * further processing.
				 */
				qpair->recovery_state = RECOVERY_START;
				nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd\n",
				    (uintmax_t)now, (uintmax_t)tr->deadline);
				break;
			}
		}
		break;
	case RECOVERY_START:
		/*
		 * Read csts to get value of cfs - controller fatal status.
		 * If no fatal status, try to call the completion routine, and
		 * if it completes transactions, report a missed interrupt and
		 * return (this may need to be rate limited).  Otherwise, if
		 * aborts are enabled and the controller is not reporting
		 * fatal status, abort the command.  Otherwise, just reset the
		 * controller and hope for the best.
		 */
		csts = nvme_mmio_read_4(ctrlr, csts);
		cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
		if (cfs) {
			nvme_printf(ctrlr, "Controller in fatal status, resetting\n");
			qpair->recovery_state = RECOVERY_RESET;
			goto again;
		}
		mtx_unlock(&qpair->lock);
		if (nvme_qpair_process_completions(qpair)) {
			nvme_printf(ctrlr, "Completions present in output without an interrupt\n");
			qpair->recovery_state = RECOVERY_NONE;
		} else {
			nvme_printf(ctrlr, "timeout with nothing complete, resetting\n");
			qpair->recovery_state = RECOVERY_RESET;
			mtx_lock(&qpair->lock);
			goto again;
		}
		mtx_lock(&qpair->lock);
		break;
	case RECOVERY_RESET:
		/*
		 * If we get here due to a possible surprise hot-unplug event,
		 * then we let nvme_ctrlr_reset confirm and fail the
		 * controller.
		 */
		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
		    (csts == 0xffffffff) ? " and possible hot unplug" :
		    (cfs ? " and fatal error status" : ""));
		nvme_printf(ctrlr, "RECOVERY_WAITING\n");
		qpair->recovery_state = RECOVERY_WAITING;
		nvme_ctrlr_reset(ctrlr);
		break;
	case RECOVERY_WAITING:
		nvme_printf(ctrlr, "waiting\n");
		break;
	}

	/*
	 * Rearm the timeout.
	 */
	if (!idle) {
		callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
	} else {
		qpair->timer_armed = false;
	}
	mtx_unlock(&qpair->lock);
}

/*
 * Submit the tracker to the hardware.  Must already be in the
 * outstanding queue when called.
 */
void
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request	*req;
	struct nvme_controller	*ctrlr;
	int timeout;

	mtx_assert(&qpair->lock, MA_OWNED);

	req = tr->req;
	req->cmd.cid = tr->cid;
	qpair->act_tr[tr->cid] = tr;
	ctrlr = qpair->ctrlr;

	if (req->timeout) {
		if (req->cb_fn == nvme_completion_poll_cb)
			timeout = 1;
		else
			timeout = ctrlr->timeout_period;
		tr->deadline = getsbinuptime() + timeout * SBT_1S;
		if (!qpair->timer_armed) {
			qpair->timer_armed = true;
			callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
			    nvme_qpair_timeout, qpair, qpair->cpu, 0);
		}
	} else
		tr->deadline = SBT_MAX;

	/* Copy the command from the tracker to the submission queue. */
	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));

	if (++qpair->sq_tail == qpair->num_entries)
		qpair->sq_tail = 0;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
	    qpair->sq_tdbl_off, qpair->sq_tail);
	qpair->num_cmds++;
}

static void
nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
{
	struct nvme_tracker	*tr = arg;
	uint32_t cur_nseg;

	/*
	 * If the mapping operation failed, return immediately.  The caller
	 * is responsible for detecting the error status and failing the
	 * tracker manually.
	 */
	if (error != 0) {
		nvme_printf(tr->qpair->ctrlr,
		    "nvme_payload_map err %d\n", error);
		return;
	}

	/*
	 * Note that we specified ctrlr->page_size for alignment and max
	 * segment size when creating the bus dma tags.  So here we can safely
	 * just transfer each segment to its associated PRP entry.
	 */
	tr->req->cmd.prp1 = htole64(seg[0].ds_addr);

	if (nseg == 2) {
		tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
	} else if (nseg > 2) {
		cur_nseg = 1;
		tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
		while (cur_nseg < nseg) {
			tr->prp[cur_nseg-1] =
			    htole64((uint64_t)seg[cur_nseg].ds_addr);
			cur_nseg++;
		}
	} else {
		/*
		 * prp2 should not be used by the controller
		 * since there is only one segment, but set
		 * to 0 just to be safe.
		 */
		tr->req->cmd.prp2 = 0;
	}

	bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	nvme_qpair_submit_tracker(tr->qpair, tr);
}

static void
_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker	*tr;
	int			err = 0;

	mtx_assert(&qpair->lock, MA_OWNED);

	tr = TAILQ_FIRST(&qpair->free_tr);
	req->qpair = qpair;

	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
		/*
		 * No tracker is available, or the qpair is disabled due to
		 * an in-progress controller-level reset or controller
		 * failure.
		 */

		if (qpair->ctrlr->is_failed) {
			/*
			 * The controller has failed, so fail the request.
			 */
			nvme_qpair_manual_complete_request(qpair, req,
			    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
		} else {
			/*
			 * Put the request on the qpair's request queue to be
			 * processed when a tracker frees up via a command
			 * completion or when the controller reset is
			 * completed.
			 */
			STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		}
		return;
	}

	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
	tr->deadline = SBT_MAX;
	tr->req = req;

	if (!req->payload_valid) {
		nvme_qpair_submit_tracker(tr->qpair, tr);
		return;
	}

	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
	if (err != 0) {
		/*
		 * The dmamap operation failed, so we manually fail the
		 * tracker here with DATA_TRANSFER_ERROR status.
		 *
		 * nvme_qpair_manual_complete_tracker must not be called
		 * with the qpair lock held.
		 */
		nvme_printf(qpair->ctrlr,
		    "bus_dmamap_load_mem returned 0x%x!\n", err);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}
}

void
nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{

	mtx_lock(&qpair->lock);
	_nvme_qpair_submit_request(qpair, req);
	mtx_unlock(&qpair->lock);
}

static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
	mtx_assert(&qpair->lock, MA_OWNED);

	qpair->recovery_state = RECOVERY_NONE;
}

void
nvme_qpair_reset(struct nvme_qpair *qpair)
{

	qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set the phase
	 * bit on completions to 1.  So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue rolls over.
	 */
	qpair->phase = 1;

	memset(qpair->cmd, 0,
	    qpair->num_entries * sizeof(struct nvme_command));
	memset(qpair->cpl, 0,
	    qpair->num_entries * sizeof(struct nvme_completion));
}

void
nvme_admin_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;
	struct nvme_tracker	*tr_temp;
	bool			rpt;

	/*
	 * Manually abort each outstanding admin command.  Do not retry
	 * admin commands found here, since they will be left over from
	 * a controller reset and it's likely the context in which the
	 * command was issued no longer applies.
	 */
	rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "aborting outstanding admin command\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
	}
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "done aborting outstanding admin\n");

	mtx_lock(&qpair->lock);
	nvme_qpair_enable(qpair);
	mtx_unlock(&qpair->lock);
}

void
nvme_io_qpair_enable(struct nvme_qpair *qpair)
{
	STAILQ_HEAD(, nvme_request)	temp;
	struct nvme_tracker		*tr;
	struct nvme_tracker		*tr_temp;
	struct nvme_request		*req;
	bool				report;

	/*
	 * Manually abort each outstanding I/O.  This normally results in a
	 * retry, unless the retry count on the associated request has
	 * reached its limit.
	 */
	report = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (report)
		nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");

	mtx_lock(&qpair->lock);

	nvme_qpair_enable(qpair);

	STAILQ_INIT(&temp);
	STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);

	report = !STAILQ_EMPTY(&temp);
	if (report)
		nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
	while (!STAILQ_EMPTY(&temp)) {
		req = STAILQ_FIRST(&temp);
		STAILQ_REMOVE_HEAD(&temp, stailq);
		nvme_qpair_print_command(qpair, &req->cmd);
		_nvme_qpair_submit_request(qpair, req);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");

	mtx_unlock(&qpair->lock);
}

static void
nvme_qpair_disable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *tr_temp;

	mtx_lock(&qpair->lock);
	qpair->recovery_state = RECOVERY_WAITING;
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		tr->deadline = SBT_MAX;
	}
	mtx_unlock(&qpair->lock);
}

void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{

	nvme_qpair_disable(qpair);
	nvme_admin_qpair_abort_aers(qpair);
}

void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{

	nvme_qpair_disable(qpair);
}

void
nvme_qpair_fail(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;
	struct nvme_request	*req;

	if (!mtx_initialized(&qpair->lock))
		return;

	mtx_lock(&qpair->lock);

	while (!STAILQ_EMPTY(&qpair->queued_req)) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_printf(qpair->ctrlr, "failing queued i/o\n");
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST);
		mtx_lock(&qpair->lock);
	}

	/* Manually abort each outstanding I/O. */
	while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
		/*
		 * Do not remove the tracker.  The abort_tracker path will
		 * do that for us.
		 */
		nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}

	mtx_unlock(&qpair->lock);
}