1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2024 Oxide Computer Company 14 */ 15 16 #include "ena_hw.h" 17 #include "ena.h" 18 19 bool ena_force_reset = false; 20 21 static void 22 ena_watchdog(void *arg) 23 { 24 ena_t *ena = arg; 25 uint32_t statusreg; 26 enum { 27 RESET_NONE = 0, 28 RESET_FORCED, 29 RESET_ERROR, 30 RESET_FATAL, 31 RESET_KEEPALIVE, 32 RESET_TX_STALL, 33 } reset = RESET_NONE; 34 35 if (ena_force_reset) { 36 ena_force_reset = false; 37 reset = RESET_FORCED; 38 goto out; 39 } 40 41 if (ena->ena_state & ENA_STATE_ERROR) { 42 atomic_and_32(&ena->ena_state, ~ENA_STATE_ERROR); 43 reset = RESET_ERROR; 44 goto out; 45 } 46 47 statusreg = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); 48 if ((statusreg & ENAHW_DEV_STS_FATAL_ERROR_MASK) >> 49 ENAHW_DEV_STS_FATAL_ERROR_SHIFT != 0) { 50 reset = RESET_FATAL; 51 goto out; 52 } 53 54 if (ena->ena_watchdog_last_keepalive > 0 && 55 gethrtime() - ena->ena_watchdog_last_keepalive > 56 ENA_DEVICE_KEEPALIVE_TIMEOUT_NS) { 57 reset = RESET_KEEPALIVE; 58 goto out; 59 } 60 61 bool stalled = false; 62 uint_t stalledq = 0; 63 for (uint_t i = 0; i < ena->ena_num_txqs; i++) { 64 ena_txq_t *txq = &ena->ena_txqs[i]; 65 uint32_t s; 66 67 mutex_enter(&txq->et_lock); 68 if (txq->et_blocked) 69 s = ++txq->et_stall_watchdog; 70 else 71 s = txq->et_stall_watchdog = 0; 72 mutex_exit(&txq->et_lock); 73 74 if (s > ENA_TX_STALL_TIMEOUT) { 75 stalled = true; 76 stalledq = i; 77 break; 78 } 79 } 80 if (stalled) { 81 reset = RESET_TX_STALL; 82 goto out; 83 } 84 85 out: 86 if (reset != RESET_NONE) { 87 enahw_reset_reason_t reason; 88 89 mutex_enter(&ena->ena_lock); 90 switch (reset) { 91 case RESET_FORCED: 92 ena->ena_device_stat.eds_reset_forced.value.ui64++; 93 ena_err(ena, "forced reset"); 94 reason = ENAHW_RESET_USER_TRIGGER; 95 break; 96 case RESET_ERROR: 97 /* 98 * Whoever set the error bit will have also set the 99 * reset reason for us. 100 */ 101 ena->ena_device_stat.eds_reset_error.value.ui64++; 102 ena_err(ena, "error state detected"); 103 reason = ena->ena_reset_reason; 104 break; 105 case RESET_FATAL: 106 ena->ena_device_stat.eds_reset_fatal.value.ui64++; 107 ena_err(ena, "device reports fatal error (status 0x%x)" 108 ", resetting", statusreg); 109 reason = ENAHW_RESET_GENERIC; 110 break; 111 case RESET_KEEPALIVE: 112 ena->ena_device_stat.eds_reset_keepalive.value.ui64++; 113 ena_err(ena, "device keepalive timeout"); 114 reason = ENAHW_RESET_KEEP_ALIVE_TO; 115 break; 116 case RESET_TX_STALL: 117 ena->ena_device_stat.eds_reset_txstall.value.ui64++; 118 ena_err(ena, "TX ring 0x%x appears stalled, resetting", 119 stalledq); 120 reason = ENAHW_RESET_MISS_TX_CMPL; 121 break; 122 default: 123 ena_panic(ena, "unhandled case in reset switch"); 124 } 125 ena->ena_reset_reason = reason; 126 mutex_exit(&ena->ena_lock); 127 128 if (!ena_reset(ena, reason)) 129 ena_panic(ena, "failed to reset device"); 130 } 131 } 132 133 void 134 ena_enable_watchdog(ena_t *ena) 135 { 136 mutex_enter(&ena->ena_watchdog_lock); 137 if (ena->ena_watchdog_periodic == NULL) { 138 ena->ena_watchdog_periodic = ddi_periodic_add(ena_watchdog, 139 (void *)ena, ENA_WATCHDOG_INTERVAL_NS, DDI_IPL_0); 140 } 141 mutex_exit(&ena->ena_watchdog_lock); 142 } 143 144 void 145 ena_disable_watchdog(ena_t *ena) 146 { 147 mutex_enter(&ena->ena_watchdog_lock); 148 if (ena->ena_watchdog_periodic != NULL) { 149 ddi_periodic_delete(ena->ena_watchdog_periodic); 150 ena->ena_watchdog_periodic = NULL; 151 } 152 mutex_exit(&ena->ena_watchdog_lock); 153 } 154