xref: /illumos-gate/usr/src/uts/common/io/ena/ena_watchdog.c (revision c46e4de36cfe07276cbfb1e05a749e7765e50533)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2024 Oxide Computer Company
14  */
15 
16 #include "ena_hw.h"
17 #include "ena.h"
18 
/*
 * When set to true, the next ena_watchdog() invocation that observes this
 * flag clears it and resets its device, recording the reason as
 * ENAHW_RESET_USER_TRIGGER. The flag is global rather than per-instance, so
 * only the first watchdog to see it acts on it.
 *
 * NOTE(review): presumably a tunable for exercising the reset path by hand
 * (e.g. from a debugger) -- confirm intended usage.
 */
bool ena_force_reset = false;
20 
21 static void
ena_watchdog(void * arg)22 ena_watchdog(void *arg)
23 {
24 	ena_t *ena = arg;
25 	uint32_t statusreg;
26 	enum {
27 		RESET_NONE = 0,
28 		RESET_FORCED,
29 		RESET_ERROR,
30 		RESET_FATAL,
31 		RESET_KEEPALIVE,
32 		RESET_TX_STALL,
33 	} reset = RESET_NONE;
34 
35 	if (ena_force_reset) {
36 		ena_force_reset = false;
37 		reset = RESET_FORCED;
38 		goto out;
39 	}
40 
41 	if (ena->ena_state & ENA_STATE_ERROR) {
42 		atomic_and_32(&ena->ena_state, ~ENA_STATE_ERROR);
43 		reset = RESET_ERROR;
44 		goto out;
45 	}
46 
47 	statusreg = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
48 	if ((statusreg & ENAHW_DEV_STS_FATAL_ERROR_MASK) >>
49 	    ENAHW_DEV_STS_FATAL_ERROR_SHIFT != 0) {
50 		reset = RESET_FATAL;
51 		goto out;
52 	}
53 
54 	if (ena->ena_watchdog_last_keepalive > 0 &&
55 	    gethrtime() - ena->ena_watchdog_last_keepalive >
56 	    ENA_DEVICE_KEEPALIVE_TIMEOUT_NS) {
57 		reset = RESET_KEEPALIVE;
58 		goto out;
59 	}
60 
61 	bool stalled = false;
62 	uint_t stalledq = 0;
63 	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
64 		ena_txq_t *txq = &ena->ena_txqs[i];
65 		uint32_t s;
66 
67 		mutex_enter(&txq->et_lock);
68 		if (txq->et_blocked)
69 			s = ++txq->et_stall_watchdog;
70 		else
71 			s = txq->et_stall_watchdog = 0;
72 		mutex_exit(&txq->et_lock);
73 
74 		if (s > ENA_TX_STALL_TIMEOUT) {
75 			stalled = true;
76 			stalledq = i;
77 			break;
78 		}
79 	}
80 	if (stalled) {
81 		reset = RESET_TX_STALL;
82 		goto out;
83 	}
84 
85 out:
86 	if (reset != RESET_NONE) {
87 		enahw_reset_reason_t reason;
88 
89 		mutex_enter(&ena->ena_lock);
90 		switch (reset) {
91 		case RESET_FORCED:
92 			ena->ena_device_stat.eds_reset_forced.value.ui64++;
93 			ena_err(ena, "forced reset");
94 			reason = ENAHW_RESET_USER_TRIGGER;
95 			break;
96 		case RESET_ERROR:
97 			/*
98 			 * Whoever set the error bit will have also set the
99 			 * reset reason for us.
100 			 */
101 			ena->ena_device_stat.eds_reset_error.value.ui64++;
102 			ena_err(ena, "error state detected");
103 			reason = ena->ena_reset_reason;
104 			break;
105 		case RESET_FATAL:
106 			ena->ena_device_stat.eds_reset_fatal.value.ui64++;
107 			ena_err(ena, "device reports fatal error (status 0x%x)"
108 			    ", resetting", statusreg);
109 			reason = ENAHW_RESET_GENERIC;
110 			break;
111 		case RESET_KEEPALIVE:
112 			ena->ena_device_stat.eds_reset_keepalive.value.ui64++;
113 			ena_err(ena, "device keepalive timeout");
114 			reason = ENAHW_RESET_KEEP_ALIVE_TO;
115 			break;
116 		case RESET_TX_STALL:
117 			ena->ena_device_stat.eds_reset_txstall.value.ui64++;
118 			ena_err(ena, "TX ring 0x%x appears stalled, resetting",
119 			    stalledq);
120 			reason = ENAHW_RESET_MISS_TX_CMPL;
121 			break;
122 		default:
123 			ena_panic(ena, "unhandled case in reset switch");
124 		}
125 		ena->ena_reset_reason = reason;
126 		mutex_exit(&ena->ena_lock);
127 
128 		if (!ena_reset(ena, reason))
129 			ena_panic(ena, "failed to reset device");
130 	}
131 }
132 
133 void
ena_enable_watchdog(ena_t * ena)134 ena_enable_watchdog(ena_t *ena)
135 {
136 	mutex_enter(&ena->ena_watchdog_lock);
137 	if (ena->ena_watchdog_periodic == NULL) {
138 		ena->ena_watchdog_periodic = ddi_periodic_add(ena_watchdog,
139 		    (void *)ena, ENA_WATCHDOG_INTERVAL_NS, DDI_IPL_0);
140 	}
141 	mutex_exit(&ena->ena_watchdog_lock);
142 }
143 
144 void
ena_disable_watchdog(ena_t * ena)145 ena_disable_watchdog(ena_t *ena)
146 {
147 	mutex_enter(&ena->ena_watchdog_lock);
148 	if (ena->ena_watchdog_periodic != NULL) {
149 		ddi_periodic_delete(ena->ena_watchdog_periodic);
150 		ena->ena_watchdog_periodic = NULL;
151 	}
152 	mutex_exit(&ena->ena_watchdog_lock);
153 }
154