1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2024 Oxide Computer Company
14 */
15
16 #include "ena_hw.h"
17 #include "ena.h"
18
/*
 * Manual reset trigger. When the watchdog runs and finds this set, it clears
 * the flag and resets the device, reporting ENAHW_RESET_USER_TRIGGER as the
 * reason. Nothing in this file sets it.
 * NOTE(review): presumably intended to be set externally (e.g. from a
 * debugger) -- confirm.
 */
bool ena_force_reset = false;
20
21 static void
ena_watchdog(void * arg)22 ena_watchdog(void *arg)
23 {
24 ena_t *ena = arg;
25 uint32_t statusreg;
26 enum {
27 RESET_NONE = 0,
28 RESET_FORCED,
29 RESET_ERROR,
30 RESET_FATAL,
31 RESET_KEEPALIVE,
32 RESET_TX_STALL,
33 } reset = RESET_NONE;
34
35 if (ena_force_reset) {
36 ena_force_reset = false;
37 reset = RESET_FORCED;
38 goto out;
39 }
40
41 if (ena->ena_state & ENA_STATE_ERROR) {
42 atomic_and_32(&ena->ena_state, ~ENA_STATE_ERROR);
43 reset = RESET_ERROR;
44 goto out;
45 }
46
47 statusreg = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
48 if ((statusreg & ENAHW_DEV_STS_FATAL_ERROR_MASK) >>
49 ENAHW_DEV_STS_FATAL_ERROR_SHIFT != 0) {
50 reset = RESET_FATAL;
51 goto out;
52 }
53
54 if (ena->ena_watchdog_last_keepalive > 0 &&
55 gethrtime() - ena->ena_watchdog_last_keepalive >
56 ENA_DEVICE_KEEPALIVE_TIMEOUT_NS) {
57 reset = RESET_KEEPALIVE;
58 goto out;
59 }
60
61 bool stalled = false;
62 uint_t stalledq = 0;
63 for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
64 ena_txq_t *txq = &ena->ena_txqs[i];
65 uint32_t s;
66
67 mutex_enter(&txq->et_lock);
68 if (txq->et_blocked)
69 s = ++txq->et_stall_watchdog;
70 else
71 s = txq->et_stall_watchdog = 0;
72 mutex_exit(&txq->et_lock);
73
74 if (s > ENA_TX_STALL_TIMEOUT) {
75 stalled = true;
76 stalledq = i;
77 break;
78 }
79 }
80 if (stalled) {
81 reset = RESET_TX_STALL;
82 goto out;
83 }
84
85 out:
86 if (reset != RESET_NONE) {
87 enahw_reset_reason_t reason;
88
89 mutex_enter(&ena->ena_lock);
90 switch (reset) {
91 case RESET_FORCED:
92 ena->ena_device_stat.eds_reset_forced.value.ui64++;
93 ena_err(ena, "forced reset");
94 reason = ENAHW_RESET_USER_TRIGGER;
95 break;
96 case RESET_ERROR:
97 /*
98 * Whoever set the error bit will have also set the
99 * reset reason for us.
100 */
101 ena->ena_device_stat.eds_reset_error.value.ui64++;
102 ena_err(ena, "error state detected");
103 reason = ena->ena_reset_reason;
104 break;
105 case RESET_FATAL:
106 ena->ena_device_stat.eds_reset_fatal.value.ui64++;
107 ena_err(ena, "device reports fatal error (status 0x%x)"
108 ", resetting", statusreg);
109 reason = ENAHW_RESET_GENERIC;
110 break;
111 case RESET_KEEPALIVE:
112 ena->ena_device_stat.eds_reset_keepalive.value.ui64++;
113 ena_err(ena, "device keepalive timeout");
114 reason = ENAHW_RESET_KEEP_ALIVE_TO;
115 break;
116 case RESET_TX_STALL:
117 ena->ena_device_stat.eds_reset_txstall.value.ui64++;
118 ena_err(ena, "TX ring 0x%x appears stalled, resetting",
119 stalledq);
120 reason = ENAHW_RESET_MISS_TX_CMPL;
121 break;
122 default:
123 ena_panic(ena, "unhandled case in reset switch");
124 }
125 ena->ena_reset_reason = reason;
126 mutex_exit(&ena->ena_lock);
127
128 if (!ena_reset(ena, reason))
129 ena_panic(ena, "failed to reset device");
130 }
131 }
132
133 void
ena_enable_watchdog(ena_t * ena)134 ena_enable_watchdog(ena_t *ena)
135 {
136 mutex_enter(&ena->ena_watchdog_lock);
137 if (ena->ena_watchdog_periodic == NULL) {
138 ena->ena_watchdog_periodic = ddi_periodic_add(ena_watchdog,
139 (void *)ena, ENA_WATCHDOG_INTERVAL_NS, DDI_IPL_0);
140 }
141 mutex_exit(&ena->ena_watchdog_lock);
142 }
143
144 void
ena_disable_watchdog(ena_t * ena)145 ena_disable_watchdog(ena_t *ena)
146 {
147 mutex_enter(&ena->ena_watchdog_lock);
148 if (ena->ena_watchdog_periodic != NULL) {
149 ddi_periodic_delete(ena->ena_watchdog_periodic);
150 ena->ena_watchdog_periodic = NULL;
151 }
152 mutex_exit(&ena->ena_watchdog_lock);
153 }
154