xref: /linux/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2025 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 #include "ras.h"
25 #include "ras_core_status.h"
26 #include "ras_log_ring.h"
27 
/* Maximum number of log entries a single query may return -- TODO confirm
 * against the query-path user; here it only sizes the mempool.
 */
#define RAS_LOG_MAX_QUERY_SIZE   0xC000
/* Headroom: number of oldest entries reclaimed in one sweep when the pool
 * runs low (see ras_log_ring_add_log_event()).
 */
#define RAS_LOG_MEM_TEMP_SIZE    0x200
/* Total entries preallocated in the log mempool. */
#define RAS_LOG_MEMPOOL_SIZE \
	(RAS_LOG_MAX_QUERY_SIZE + RAS_LOG_MEM_TEMP_SIZE)

/* Radix-tree key layout: batch id in bits 63:8, batch-local sub-sequence
 * number in bits 7:0 (implies MAX_RECORD_PER_BATCH must be <= 256).
 */
#define BATCH_IDX_TO_TREE_IDX(batch_idx, sn) (((batch_idx) << 8) | (sn))
34 
/* Template ACA register block used to synthesize the log entry for a
 * RAS_LOG_EVENT_RMA event (see ras_log_ring_add_log_event(), which also
 * folds the socket id into the IPID value).  The constants are
 * hardware-defined encodings -- NOTE(review): confirm against the ACA
 * register specification.
 */
static const uint64_t ras_rma_aca_reg[ACA_REG_MAX_COUNT] = {
	[ACA_REG_IDX__CTL]    = 0x1,
	[ACA_REG_IDX__STATUS] = 0xB000000000000137,
	[ACA_REG_IDX__ADDR]   = 0x0,
	[ACA_REG_IDX__MISC0]  = 0x0,
	[ACA_REG_IDX__CONFG] = 0x1ff00000002,
	[ACA_REG_IDX__IPID]   = 0x9600000000,
	[ACA_REG_IDX__SYND]   = 0x0,
};
44 
ras_log_ring_get_logged_ecc_count(struct ras_core_context * ras_core)45 static uint64_t ras_log_ring_get_logged_ecc_count(struct ras_core_context *ras_core)
46 {
47 	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
48 	uint64_t count = 0;
49 
50 	if (log_ring->logged_ecc_count < 0) {
51 		RAS_DEV_WARN(ras_core->dev,
52 			"Error: the logged ras count should not less than 0!\n");
53 		count = 0;
54 	} else {
55 		count = log_ring->logged_ecc_count;
56 	}
57 
58 	if (count > RAS_LOG_MEMPOOL_SIZE)
59 		RAS_DEV_WARN(ras_core->dev,
60 			"Error: the logged ras count is out of range!\n");
61 
62 	return count;
63 }
64 
ras_log_ring_add_data(struct ras_core_context * ras_core,struct ras_log_info * log,struct ras_log_batch_tag * batch_tag)65 static int ras_log_ring_add_data(struct ras_core_context *ras_core,
66 			struct ras_log_info *log, struct ras_log_batch_tag *batch_tag)
67 {
68 	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
69 	unsigned long flags = 0;
70 	int ret = 0;
71 
72 	if (batch_tag && (batch_tag->sub_seqno >= MAX_RECORD_PER_BATCH)) {
73 		RAS_DEV_ERR(ras_core->dev,
74 			"Invalid batch sub seqno:%d, batch:0x%llx\n",
75 			batch_tag->sub_seqno, batch_tag->batch_id);
76 		return -EINVAL;
77 	}
78 
79 	spin_lock_irqsave(&log_ring->spin_lock, flags);
80 	if (batch_tag) {
81 		log->seqno =
82 			BATCH_IDX_TO_TREE_IDX(batch_tag->batch_id, batch_tag->sub_seqno);
83 		batch_tag->sub_seqno++;
84 	} else {
85 		log->seqno = BATCH_IDX_TO_TREE_IDX(log_ring->mono_upward_batch_id, 0);
86 		log_ring->mono_upward_batch_id++;
87 	}
88 	ret = radix_tree_insert(&log_ring->ras_log_root, log->seqno, log);
89 	if (!ret)
90 		log_ring->logged_ecc_count++;
91 	spin_unlock_irqrestore(&log_ring->spin_lock, flags);
92 
93 	if (ret) {
94 		RAS_DEV_ERR(ras_core->dev,
95 			"Failed to add ras log! seqno:0x%llx, ret:%d\n",
96 			log->seqno, ret);
97 		mempool_free(log, log_ring->ras_log_mempool);
98 	}
99 
100 	return ret;
101 }
102 
/*
 * Remove at least @count logged entries, oldest batches first.
 *
 * Batches are reclaimed whole: every sub-sequence slot of a batch is
 * swept before the freed-entry total is compared against @count, so
 * slightly more than @count entries may be deleted.
 *
 * Returns 0 once at least @count entries were freed, -EINVAL when @count
 * exceeds the logged-entry total, or -ENODATA when all batches were swept
 * without reaching @count.
 */
static int ras_log_ring_delete_data(struct ras_core_context *ras_core, uint32_t count)
{
	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
	unsigned long flags = 0;
	uint32_t i = 0, j = 0;	/* i: entries freed so far; j: slot within a batch */
	uint64_t batch_id, idx;
	void *data;
	int ret = -ENODATA;

	/* NOTE(review): this bound check runs before taking the spinlock,
	 * so it is only best-effort against concurrent inserts/deletes.
	 */
	if (count > ras_log_ring_get_logged_ecc_count(ras_core))
		return -EINVAL;

	spin_lock_irqsave(&log_ring->spin_lock, flags);
	/* Resume from the oldest batch not yet reclaimed. */
	batch_id = log_ring->last_del_batch_id;
	while (batch_id < log_ring->mono_upward_batch_id) {
		for (j = 0; j < MAX_RECORD_PER_BATCH; j++) {
			idx = BATCH_IDX_TO_TREE_IDX(batch_id, j);
			data = radix_tree_delete(&log_ring->ras_log_root, idx);
			if (data) {
				mempool_free(data, log_ring->ras_log_mempool);
				log_ring->logged_ecc_count--;
				i++;
			}
		}
		/* The batch is fully swept; advance the deletion watermark. */
		batch_id = ++log_ring->last_del_batch_id;
		if (i >= count) {
			ret = 0;
			break;
		}
	}
	spin_unlock_irqrestore(&log_ring->spin_lock, flags);

	return ret;
}
137 
ras_log_ring_clear_log_tree(struct ras_core_context * ras_core)138 static void ras_log_ring_clear_log_tree(struct ras_core_context *ras_core)
139 {
140 	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
141 	uint64_t batch_id, idx;
142 	unsigned long flags = 0;
143 	void *data;
144 	int j;
145 
146 	if ((log_ring->mono_upward_batch_id <= log_ring->last_del_batch_id) &&
147 		!log_ring->logged_ecc_count)
148 		return;
149 
150 	spin_lock_irqsave(&log_ring->spin_lock, flags);
151 	batch_id = log_ring->last_del_batch_id;
152 	while (batch_id < log_ring->mono_upward_batch_id) {
153 		for (j = 0; j < MAX_RECORD_PER_BATCH; j++) {
154 			idx = BATCH_IDX_TO_TREE_IDX(batch_id, j);
155 			data = radix_tree_delete(&log_ring->ras_log_root, idx);
156 			if (data) {
157 				mempool_free(data, log_ring->ras_log_mempool);
158 				log_ring->logged_ecc_count--;
159 			}
160 		}
161 		batch_id++;
162 	}
163 	spin_unlock_irqrestore(&log_ring->spin_lock, flags);
164 
165 }
166 
ras_log_ring_sw_init(struct ras_core_context * ras_core)167 int ras_log_ring_sw_init(struct ras_core_context *ras_core)
168 {
169 	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
170 
171 	memset(log_ring, 0, sizeof(*log_ring));
172 
173 	log_ring->ras_log_mempool = mempool_create_kmalloc_pool(
174 			RAS_LOG_MEMPOOL_SIZE, sizeof(struct ras_log_info));
175 	if (!log_ring->ras_log_mempool)
176 		return -ENOMEM;
177 
178 	INIT_RADIX_TREE(&log_ring->ras_log_root, GFP_KERNEL);
179 
180 	spin_lock_init(&log_ring->spin_lock);
181 
182 	return 0;
183 }
184 
ras_log_ring_sw_fini(struct ras_core_context * ras_core)185 int ras_log_ring_sw_fini(struct ras_core_context *ras_core)
186 {
187 	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
188 
189 	ras_log_ring_clear_log_tree(ras_core);
190 	log_ring->logged_ecc_count = 0;
191 	log_ring->last_del_batch_id = 0;
192 	log_ring->mono_upward_batch_id = 0;
193 
194 	mempool_destroy(log_ring->ras_log_mempool);
195 
196 	return 0;
197 }
198 
ras_log_ring_create_batch_tag(struct ras_core_context * ras_core)199 struct ras_log_batch_tag *ras_log_ring_create_batch_tag(struct ras_core_context *ras_core)
200 {
201 	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
202 	struct ras_log_batch_tag *batch_tag;
203 	unsigned long flags = 0;
204 
205 	batch_tag = kzalloc_obj(*batch_tag);
206 	if (!batch_tag)
207 		return NULL;
208 
209 	spin_lock_irqsave(&log_ring->spin_lock, flags);
210 	batch_tag->batch_id = log_ring->mono_upward_batch_id;
211 	log_ring->mono_upward_batch_id++;
212 	spin_unlock_irqrestore(&log_ring->spin_lock, flags);
213 
214 	batch_tag->sub_seqno = 0;
215 	batch_tag->timestamp = ras_core_get_utc_second_timestamp(ras_core);
216 	return batch_tag;
217 }
218 
/*
 * Release a tag allocated by ras_log_ring_create_batch_tag().
 * @ras_core is unused; a NULL @batch_tag is safe (kfree(NULL) is a no-op).
 */
void ras_log_ring_destroy_batch_tag(struct ras_core_context *ras_core,
		struct ras_log_batch_tag *batch_tag)
{
	kfree(batch_tag);
}
224 
/*
 * Build and store one RAS log entry for @event.
 *
 * The entry buffer comes from the preallocated mempool.  If the pool is
 * exhausted, or the logged-entry count has reached the pool capacity, the
 * oldest RAS_LOG_MEM_TEMP_SIZE entries are reclaimed and (on pool
 * exhaustion) the allocation is retried once.  The entry is timestamped
 * with the batch timestamp when @batch_tag is given, otherwise with the
 * current UTC second.  ras_log_ring_add_data() frees the buffer itself on
 * insert failure, so no cleanup is needed here.
 *
 * @data, when non-NULL, is copied into the entry's ACA register block and
 * must therefore provide at least sizeof(log->aca_reg) bytes.  For
 * RAS_LOG_EVENT_RMA the registers are instead overwritten with the
 * ras_rma_aca_reg template, and the socket id is folded into IPID:
 * bit 0 gets socket_id / 4 and bits 45:44 get socket_id % 4
 * (hardware-defined encoding -- NOTE(review): confirm against the ACA
 * IPID layout).
 */
void ras_log_ring_add_log_event(struct ras_core_context *ras_core,
		enum ras_log_event event, void *data, struct ras_log_batch_tag *batch_tag)
{
	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
	struct device_system_info dev_info = {0};
	struct ras_log_info *log;
	uint64_t socket_id;
	void *obj;

	obj = mempool_alloc_preallocated(log_ring->ras_log_mempool);
	if (!obj ||
	   (ras_log_ring_get_logged_ecc_count(ras_core) >= RAS_LOG_MEMPOOL_SIZE)) {
		/* Make room by dropping the oldest entries, then retry. */
		ras_log_ring_delete_data(ras_core, RAS_LOG_MEM_TEMP_SIZE);
		if (!obj)
			obj = mempool_alloc_preallocated(log_ring->ras_log_mempool);
	}

	if (!obj) {
		RAS_DEV_ERR(ras_core->dev, "ERROR: Failed to alloc ras log buffer!\n");
		return;
	}

	log = (struct ras_log_info *)obj;

	memset(log, 0, sizeof(*log));
	log->timestamp =
		batch_tag ? batch_tag->timestamp : ras_core_get_utc_second_timestamp(ras_core);
	log->event = event;

	if (data)
		memcpy(&log->aca_reg, data, sizeof(log->aca_reg));

	if (event == RAS_LOG_EVENT_RMA) {
		/* RMA entries use the fixed register template, not @data. */
		memcpy(&log->aca_reg, ras_rma_aca_reg, sizeof(log->aca_reg));
		ras_core_get_device_system_info(ras_core, &dev_info);
		socket_id = dev_info.socket_id;
		log->aca_reg.regs[ACA_REG_IDX__IPID] |= ((socket_id / 4) & 0x01);
		log->aca_reg.regs[ACA_REG_IDX__IPID] |= (((socket_id % 4) & 0x3) << 44);
	}

	ras_log_ring_add_data(ras_core, log, batch_tag);
}
267 
/*
 * Look up the log entry stored under tree key @idx, taking the ring lock
 * for the duration of the tree walk.  Returns NULL when no entry exists.
 */
static struct ras_log_info *ras_log_ring_lookup_data(struct ras_core_context *ras_core,
					uint64_t idx)
{
	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
	struct ras_log_info *entry;
	unsigned long irq_flags = 0;

	spin_lock_irqsave(&log_ring->spin_lock, irq_flags);
	entry = radix_tree_lookup(&log_ring->ras_log_root, idx);
	spin_unlock_irqrestore(&log_ring->spin_lock, irq_flags);

	return entry;
}
281 
/*
 * Collect the entries logged under @batch_id into @log_arr (at most
 * @arr_num pointers; the entries remain owned by the log tree).
 *
 * Returns the number of entries stored, or -EINVAL when @batch_id lies
 * outside the live range [last_del_batch_id, mono_upward_batch_id).
 */
int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
		struct ras_log_info **log_arr, uint32_t arr_num)
{
	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
	uint32_t i, count = 0;
	/* The tree key is (batch_id << 8) | slot and batch_id is 64-bit; a
	 * 32-bit local (as before) truncates the key once batch ids reach
	 * 2^24.  Use uint64_t like every other tree walker in this file.
	 */
	uint64_t idx;
	void *data;

	if ((batch_id >= log_ring->mono_upward_batch_id) ||
		(batch_id < log_ring->last_del_batch_id))
		return -EINVAL;

	for (i = 0; i < MAX_RECORD_PER_BATCH; i++) {
		idx = BATCH_IDX_TO_TREE_IDX(batch_id, i);
		data = ras_log_ring_lookup_data(ras_core, idx);
		if (data) {
			log_arr[count++] = data;
			if (count >= arr_num)
				break;
		}
	}

	return count;
}
305 
ras_log_ring_get_batch_overview(struct ras_core_context * ras_core,struct ras_log_batch_overview * overview)306 int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core,
307 		struct ras_log_batch_overview *overview)
308 {
309 	struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
310 
311 	overview->logged_batch_count =
312 		log_ring->mono_upward_batch_id - log_ring->last_del_batch_id;
313 	overview->last_batch_id = log_ring->mono_upward_batch_id;
314 	overview->first_batch_id = log_ring->last_del_batch_id;
315 
316 	return 0;
317 }
318