xref: /linux/drivers/gpu/drm/xe/xe_ras.c (revision a3e50e7279996cd987001fd8a3db36e72665f8f7)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2026 Intel Corporation
4  */
5 
6 #include "xe_device.h"
7 #include "xe_printk.h"
8 #include "xe_ras.h"
9 #include "xe_ras_types.h"
10 #include "xe_sysctrl.h"
11 #include "xe_sysctrl_event_types.h"
12 
/*
 * Severity of a detected error.
 *
 * XE_RAS_SEV_NOT_SUPPORTED is also what sev_to_str() falls back to for values
 * outside the enum, and XE_RAS_SEV_MAX is the number of severities (it sizes
 * the xe_ras_severities name table).
 */
enum xe_ras_severity {
	XE_RAS_SEV_NOT_SUPPORTED	= 0,
	XE_RAS_SEV_CORRECTABLE		= 1,
	XE_RAS_SEV_UNCORRECTABLE	= 2,
	XE_RAS_SEV_INFORMATIONAL	= 3,
	XE_RAS_SEV_MAX			/* count of entries above, keep last */
};
21 
/*
 * Major IP blocks/components where errors can originate.
 *
 * XE_RAS_COMP_NOT_SUPPORTED is also what comp_to_str() falls back to for
 * values outside the enum, and XE_RAS_COMP_MAX is the number of components
 * (it sizes the xe_ras_components name table).
 */
enum xe_ras_component {
	XE_RAS_COMP_NOT_SUPPORTED	= 0,
	XE_RAS_COMP_DEVICE_MEMORY	= 1,
	XE_RAS_COMP_CORE_COMPUTE	= 2,
	XE_RAS_COMP_RESERVED		= 3,
	XE_RAS_COMP_PCIE		= 4,
	XE_RAS_COMP_FABRIC		= 5,
	XE_RAS_COMP_SOC_INTERNAL	= 6,
	XE_RAS_COMP_MAX			/* count of entries above, keep last */
};
33 
34 static const char *const xe_ras_severities[] = {
35 	[XE_RAS_SEV_NOT_SUPPORTED]		= "Not Supported",
36 	[XE_RAS_SEV_CORRECTABLE]		= "Correctable Error",
37 	[XE_RAS_SEV_UNCORRECTABLE]		= "Uncorrectable Error",
38 	[XE_RAS_SEV_INFORMATIONAL]		= "Informational Error",
39 };
40 static_assert(ARRAY_SIZE(xe_ras_severities) == XE_RAS_SEV_MAX);
41 
42 static const char *const xe_ras_components[] = {
43 	[XE_RAS_COMP_NOT_SUPPORTED]		= "Not Supported",
44 	[XE_RAS_COMP_DEVICE_MEMORY]		= "Device Memory",
45 	[XE_RAS_COMP_CORE_COMPUTE]		= "Core Compute",
46 	[XE_RAS_COMP_RESERVED]			= "Reserved",
47 	[XE_RAS_COMP_PCIE]			= "PCIe",
48 	[XE_RAS_COMP_FABRIC]			= "Fabric",
49 	[XE_RAS_COMP_SOC_INTERNAL]		= "SoC Internal",
50 };
51 static_assert(ARRAY_SIZE(xe_ras_components) == XE_RAS_COMP_MAX);
52 
53 static inline const char *sev_to_str(u8 severity)
54 {
55 	if (severity >= XE_RAS_SEV_MAX)
56 		severity = XE_RAS_SEV_NOT_SUPPORTED;
57 
58 	return xe_ras_severities[severity];
59 }
60 
61 static inline const char *comp_to_str(u8 component)
62 {
63 	if (component >= XE_RAS_COMP_MAX)
64 		component = XE_RAS_COMP_NOT_SUPPORTED;
65 
66 	return xe_ras_components[component];
67 }
68 
69 void xe_ras_counter_threshold_crossed(struct xe_device *xe,
70 				      struct xe_sysctrl_event_response *response)
71 {
72 	struct xe_ras_threshold_crossed *pending = (void *)&response->data;
73 	struct xe_ras_error_class *errors = pending->counters;
74 	u32 id, ncounters = pending->ncounters;
75 
76 	BUILD_BUG_ON(sizeof(response->data) < sizeof(*pending));
77 	xe_device_assert_mem_access(xe);
78 
79 	if (!ncounters || ncounters > XE_RAS_NUM_COUNTERS)
80 		xe_err(xe, "sysctrl: unexpected counter threshold crossed %u\n", ncounters);
81 	else
82 		xe_warn(xe, "[RAS]: counter threshold crossed, %u new errors\n", ncounters);
83 
84 	for (id = 0; id < ncounters && id < XE_RAS_NUM_COUNTERS; id++) {
85 		u8 severity, component;
86 
87 		severity = errors[id].common.severity;
88 		component = errors[id].common.component;
89 
90 		xe_warn(xe, "[RAS]: %s %s detected\n",
91 			comp_to_str(component), sev_to_str(severity));
92 	}
93 }
94