xref: /linux/drivers/gpu/drm/xe/xe_drm_ras.c (revision aec2f682d47c54ef434b2d440992626d80b1ebdc)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2026 Intel Corporation
4  */
5 
6 #include <linux/bitmap.h>
7 
8 #include <drm/drm_managed.h>
9 #include <drm/drm_print.h>
10 #include <drm/drm_ras.h>
11 
12 #include "xe_device_types.h"
13 #include "xe_drm_ras.h"
14 
/* Component id -> counter name, indexed by DRM_XE_RAS_ERR_COMP_*; NULL entries are components with no counter. */
static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
/* Severity -> RAS node name, indexed by DRM_XE_RAS_ERR_SEV_*. */
static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
17 
18 static int hw_query_error_counter(struct xe_drm_ras_counter *info,
19 				  u32 error_id, const char **name, u32 *val)
20 {
21 	if (!info || !info[error_id].name)
22 		return -ENOENT;
23 
24 	*name = info[error_id].name;
25 	*val = atomic_read(&info[error_id].counter);
26 
27 	return 0;
28 }
29 
30 static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id,
31 					     const char **name, u32 *val)
32 {
33 	struct xe_device *xe = ep->priv;
34 	struct xe_drm_ras *ras = &xe->ras;
35 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
36 
37 	return hw_query_error_counter(info, error_id, name, val);
38 }
39 
40 static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id,
41 					   const char **name, u32 *val)
42 {
43 	struct xe_device *xe = ep->priv;
44 	struct xe_drm_ras *ras = &xe->ras;
45 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
46 
47 	return hw_query_error_counter(info, error_id, name, val);
48 }
49 
50 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe)
51 {
52 	struct xe_drm_ras_counter *counter;
53 	int i;
54 
55 	counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
56 	if (!counter)
57 		return ERR_PTR(-ENOMEM);
58 
59 	for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) {
60 		if (!error_components[i])
61 			continue;
62 
63 		counter[i].name = error_components[i];
64 		atomic_set(&counter[i].counter, 0);
65 	}
66 
67 	return counter;
68 }
69 
70 static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
71 			      const enum drm_xe_ras_error_severity severity)
72 {
73 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
74 	struct xe_drm_ras *ras = &xe->ras;
75 	const char *device_name;
76 
77 	device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
78 				pci_domain_nr(pdev->bus), pdev->bus->number,
79 				PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
80 
81 	if (!device_name)
82 		return -ENOMEM;
83 
84 	node->device_name = device_name;
85 	node->node_name = error_severity[severity];
86 	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
87 	node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE;
88 	node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1;
89 	node->priv = xe;
90 
91 	ras->info[severity] = allocate_and_copy_counters(xe);
92 	if (IS_ERR(ras->info[severity]))
93 		return PTR_ERR(ras->info[severity]);
94 
95 	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
96 		node->query_error_counter = query_correctable_error_counter;
97 	else
98 		node->query_error_counter = query_uncorrectable_error_counter;
99 
100 	return 0;
101 }
102 
103 static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity)
104 {
105 	struct drm_ras_node *node = &ras->node[severity];
106 
107 	kfree(ras->info[severity]);
108 	ras->info[severity] = NULL;
109 
110 	kfree(node->device_name);
111 	node->device_name = NULL;
112 }
113 
114 static int register_nodes(struct xe_device *xe)
115 {
116 	struct xe_drm_ras *ras = &xe->ras;
117 	int i;
118 
119 	for_each_error_severity(i) {
120 		struct drm_ras_node *node = &ras->node[i];
121 		int ret;
122 
123 		ret = assign_node_params(xe, node, i);
124 		if (ret) {
125 			cleanup_node_param(ras, i);
126 			return ret;
127 		}
128 
129 		ret = drm_ras_node_register(node);
130 		if (ret) {
131 			cleanup_node_param(ras, i);
132 			return ret;
133 		}
134 	}
135 
136 	return 0;
137 }
138 
139 static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
140 {
141 	struct xe_device *xe = arg;
142 	struct xe_drm_ras *ras = &xe->ras;
143 	int i;
144 
145 	for_each_error_severity(i) {
146 		struct drm_ras_node *node = &ras->node[i];
147 
148 		drm_ras_node_unregister(node);
149 		cleanup_node_param(ras, i);
150 	}
151 }
152 
153 /**
154  * xe_drm_ras_init() - Initialize DRM RAS
155  * @xe: xe device instance
156  *
157  * Allocate and register DRM RAS nodes per device
158  *
159  * Return: 0 on success, negative error code otherwise.
160  */
161 int xe_drm_ras_init(struct xe_device *xe)
162 {
163 	struct xe_drm_ras *ras = &xe->ras;
164 	struct drm_ras_node *node;
165 	int err;
166 
167 	node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL);
168 	if (!node)
169 		return -ENOMEM;
170 
171 	ras->node = node;
172 
173 	err = register_nodes(xe);
174 	if (err) {
175 		drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err));
176 		return err;
177 	}
178 
179 	err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
180 	if (err) {
181 		drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));
182 		return err;
183 	}
184 
185 	return 0;
186 }
187