xref: /linux/drivers/gpu/drm/xe/xe_drm_ras.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2026 Intel Corporation
4  */
5 
6 #include <linux/bitmap.h>
7 
8 #include <drm/drm_managed.h>
9 #include <drm/drm_print.h>
10 #include <drm/drm_ras.h>
11 
12 #include "xe_device_types.h"
13 #include "xe_drm_ras.h"
14 
/*
 * Name tables expanded from the DRM_XE_RAS_ERROR_*_NAMES macros.
 * error_components[] is indexed by error component id when the counters are
 * set up, error_severity[] by severity when a node gets its node_name.
 */
static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
17 
18 static int hw_query_error_counter(struct xe_drm_ras_counter *info,
19 				  u32 error_id, const char **name, u32 *val)
20 {
21 	if (!info || !info[error_id].name)
22 		return -ENOENT;
23 
24 	*name = info[error_id].name;
25 	*val = atomic_read(&info[error_id].counter);
26 
27 	return 0;
28 }
29 
30 static int hw_clear_error_counter(struct xe_drm_ras_counter *info, u32 error_id)
31 {
32 	if (!info || !info[error_id].name)
33 		return -ENOENT;
34 
35 	atomic_set(&info[error_id].counter, 0);
36 
37 	return 0;
38 }
39 
40 static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id,
41 					     const char **name, u32 *val)
42 {
43 	struct xe_device *xe = ep->priv;
44 	struct xe_drm_ras *ras = &xe->ras;
45 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
46 
47 	return hw_query_error_counter(info, error_id, name, val);
48 }
49 
50 static int clear_uncorrectable_error_counter(struct drm_ras_node *node, u32 error_id)
51 {
52 	struct xe_device *xe = node->priv;
53 	struct xe_drm_ras *ras = &xe->ras;
54 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
55 
56 	return hw_clear_error_counter(info, error_id);
57 }
58 
59 static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id,
60 					   const char **name, u32 *val)
61 {
62 	struct xe_device *xe = ep->priv;
63 	struct xe_drm_ras *ras = &xe->ras;
64 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
65 
66 	return hw_query_error_counter(info, error_id, name, val);
67 }
68 
69 static int clear_correctable_error_counter(struct drm_ras_node *node, u32 error_id)
70 {
71 	struct xe_device *xe = node->priv;
72 	struct xe_drm_ras *ras = &xe->ras;
73 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
74 
75 	return hw_clear_error_counter(info, error_id);
76 }
77 
78 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe)
79 {
80 	struct xe_drm_ras_counter *counter;
81 	int i;
82 
83 	counter = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
84 	if (!counter)
85 		return ERR_PTR(-ENOMEM);
86 
87 	for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) {
88 		if (!error_components[i])
89 			continue;
90 
91 		counter[i].name = error_components[i];
92 		atomic_set(&counter[i].counter, 0);
93 	}
94 
95 	return counter;
96 }
97 
98 static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
99 			      const enum drm_xe_ras_error_severity severity)
100 {
101 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
102 	struct xe_drm_ras *ras = &xe->ras;
103 	const char *device_name;
104 
105 	device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
106 				pci_domain_nr(pdev->bus), pdev->bus->number,
107 				PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
108 
109 	if (!device_name)
110 		return -ENOMEM;
111 
112 	node->device_name = device_name;
113 	node->node_name = error_severity[severity];
114 	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
115 	node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE;
116 	node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1;
117 	node->priv = xe;
118 
119 	ras->info[severity] = allocate_and_copy_counters(xe);
120 	if (IS_ERR(ras->info[severity]))
121 		return PTR_ERR(ras->info[severity]);
122 
123 	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) {
124 		node->query_error_counter = query_correctable_error_counter;
125 		node->clear_error_counter = clear_correctable_error_counter;
126 	} else {
127 		node->query_error_counter = query_uncorrectable_error_counter;
128 		node->clear_error_counter = clear_uncorrectable_error_counter;
129 	}
130 
131 	return 0;
132 }
133 
134 static void cleanup_node_param(struct drm_ras_node *node)
135 {
136 	kfree(node->device_name);
137 	node->device_name = NULL;
138 }
139 
/*
 * Release action registered through drmm_add_action_or_reset() in
 * register_nodes(): unregister the RAS node, then free its device name.
 * @node is the struct drm_ras_node passed at registration; @drm is unused.
 */
static void cleanup_node(struct drm_device *drm, void *node)
{
	struct drm_ras_node *ras_node = node;

	drm_ras_node_unregister(ras_node);
	cleanup_node_param(ras_node);
}
145 
146 static int register_nodes(struct xe_device *xe)
147 {
148 	struct xe_drm_ras *ras = &xe->ras;
149 	struct drm_ras_node *node;
150 	int i, ret;
151 
152 	for_each_error_severity(i) {
153 		node = &ras->node[i];
154 
155 		ret = assign_node_params(xe, node, i);
156 		if (ret)
157 			goto free_param;
158 
159 		ret = drm_ras_node_register(node);
160 		if (ret)
161 			goto free_param;
162 
163 		ret = drmm_add_action_or_reset(&xe->drm, cleanup_node, node);
164 		if (ret)
165 			goto null_info;
166 	}
167 
168 	return 0;
169 
170 free_param:
171 	cleanup_node_param(node);
172 null_info:
173 	ras->info[i] = NULL;
174 	return ret;
175 }
176 
177 /**
178  * xe_drm_ras_init() - Initialize DRM RAS
179  * @xe: xe device instance
180  *
181  * Allocate and register DRM RAS nodes per device
182  *
183  * Return: 0 on success, negative error code otherwise.
184  */
185 int xe_drm_ras_init(struct xe_device *xe)
186 {
187 	struct xe_drm_ras *ras = &xe->ras;
188 	struct drm_ras_node *node;
189 	int err;
190 
191 	node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL);
192 	if (!node)
193 		return -ENOMEM;
194 
195 	ras->node = node;
196 
197 	err = register_nodes(xe);
198 	if (err) {
199 		drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err));
200 		return err;
201 	}
202 
203 	return 0;
204 }
205