xref: /linux/drivers/gpu/drm/drm_ras.c (revision c06b6cde2a1c3bcbb561bd57bb6f34eae9030921)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2026 Intel Corporation
4  */
5 
6 #include <linux/module.h>
7 #include <linux/kernel.h>
8 #include <linux/netdevice.h>
9 #include <linux/xarray.h>
10 #include <net/genetlink.h>
11 
12 #include <drm/drm_ras.h>
13 
14 #include "drm_ras_nl.h"
15 
16 /**
17  * DOC: DRM RAS Node Management
18  *
19  * This module provides the infrastructure to manage RAS (Reliability,
20  * Availability, and Serviceability) nodes for DRM drivers. Each
21  * DRM driver may register one or more RAS nodes, which represent
22  * logical components capable of reporting error counters and other
23  * reliability metrics.
24  *
25  * The nodes are stored in a global xarray `drm_ras_xa` to allow
26  * efficient lookup by ID. Nodes can be registered or unregistered
27  * dynamically at runtime.
28  *
29  * A Generic Netlink family `drm_ras` exposes the below operations to
30  * userspace:
31  *
32  * 1. LIST_NODES: Dump all currently registered RAS nodes.
33  *    The user receives an array of node IDs, names, and types.
34  *
35  * 2. GET_ERROR_COUNTER: Get error counters of a given node.
36  *    Userspace must provide the Node ID; the Error ID is optional.
37  *    Returns all counters of the node if only the Node ID is provided, or
38  *    the single requested counter if an Error ID is provided as well.
39  *
40  * 3. CLEAR_ERROR_COUNTER: Clear error counter of a given node.
41  *    Userspace must provide Node ID, Error ID.
42  *    Clears specific error counter of a node if supported.
43  *
44  * Node registration:
45  *
46  * - drm_ras_node_register(): Registers a new node and assigns
47  *   it a unique ID in the xarray.
48  * - drm_ras_node_unregister(): Removes a previously registered
49  *   node from the xarray.
50  *
51  * Node type:
52  *
53  * - ERROR_COUNTER:
54  *     + Currently, only error counters are supported.
55  *     + The driver must implement the query_error_counter() callback to provide
56  *       the name and the value of the error counter.
57  *     + The driver must provide an error_counter_range.last value indicating
58  *       the last valid error ID.
59  *     + The driver can provide an error_counter_range.first value indicating
60  *       the first valid error ID.
61  *     + The error counters in the driver don't need to be contiguous, but the
62  *       driver must return -ENOENT from query_error_counter() as an indication
63  *       that the ID should be skipped and not listed in the netlink API.
64  *
65  * Netlink handlers:
66  *
67  * - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES
68  *   operation, iterating over the xarray.
69  * - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER dumpit
70  *   operation, fetching all counters from a specific node.
71  * - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER doit
72  *   operation, fetching a counter value from a specific node.
73  * - drm_ras_nl_clear_error_counter_doit(): Implements the CLEAR_ERROR_COUNTER doit
74  *   operation, clearing a counter value from a specific node.
75  */
76 
/*
 * Global registry of RAS nodes. drm_ras_node_register() stores each node
 * under an ID allocated by xa_alloc(), and the netlink handlers look nodes
 * up by that ID.
 */
static DEFINE_XARRAY_ALLOC(drm_ras_xa);
78 
/*
 * The netlink callback context carries dump state across multiple dumpit
 * calls. The dumpit handlers access it by casting cb->ctx to this type.
 */
struct drm_ras_ctx {
	/*
	 * Where the next dumpit call resumes: an xarray id when listing
	 * nodes, or an error ID when dumping the counters of a single node.
	 */
	unsigned long restart;
};
86 
87 /**
88  * drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes
89  * @skb: Netlink message buffer
90  * @cb: Callback context for multi-part dumps
91  *
92  * Iterates over all registered RAS nodes in the global xarray and appends
93  * their attributes (ID, name, type) to the given netlink message buffer.
94  * Uses @cb->ctx to track progress in case the message buffer fills up, allowing
95  * multi-part dump support. On buffer overflow, updates the context to resume
96  * from the last node on the next invocation.
97  *
98  * Return: 0 if all nodes fit in @skb, number of bytes added to @skb if
99  *          the buffer filled up (requires multi-part continuation), or
100  *          a negative error code on failure.
101  */
102 int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb,
103 				 struct netlink_callback *cb)
104 {
105 	const struct genl_info *info = genl_info_dump(cb);
106 	struct drm_ras_ctx *ctx = (void *)cb->ctx;
107 	struct drm_ras_node *node;
108 	struct nlattr *hdr;
109 	unsigned long id;
110 	int ret;
111 
112 	xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) {
113 		hdr = genlmsg_iput(skb, info);
114 		if (!hdr) {
115 			ret = -EMSGSIZE;
116 			break;
117 		}
118 
119 		ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id);
120 		if (ret) {
121 			genlmsg_cancel(skb, hdr);
122 			break;
123 		}
124 
125 		ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_DEVICE_NAME,
126 				     node->device_name);
127 		if (ret) {
128 			genlmsg_cancel(skb, hdr);
129 			break;
130 		}
131 
132 		ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_NODE_NAME,
133 				     node->node_name);
134 		if (ret) {
135 			genlmsg_cancel(skb, hdr);
136 			break;
137 		}
138 
139 		ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE,
140 				  node->type);
141 		if (ret) {
142 			genlmsg_cancel(skb, hdr);
143 			break;
144 		}
145 
146 		genlmsg_end(skb, hdr);
147 	}
148 
149 	if (ret == -EMSGSIZE)
150 		ctx->restart = id;
151 
152 	return ret;
153 }
154 
155 static int get_node_error_counter(u32 node_id, u32 error_id,
156 				  const char **name, u32 *value)
157 {
158 	struct drm_ras_node *node;
159 
160 	node = xa_load(&drm_ras_xa, node_id);
161 	if (!node || !node->query_error_counter)
162 		return -ENOENT;
163 
164 	if (error_id < node->error_counter_range.first ||
165 	    error_id > node->error_counter_range.last)
166 		return -EINVAL;
167 
168 	return node->query_error_counter(node, error_id, name, value);
169 }
170 
171 static int msg_reply_value(struct sk_buff *msg, u32 error_id,
172 			   const char *error_name, u32 value)
173 {
174 	int ret;
175 
176 	ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id);
177 	if (ret)
178 		return ret;
179 
180 	ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME,
181 			     error_name);
182 	if (ret)
183 		return ret;
184 
185 	return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE,
186 			   value);
187 }
188 
189 static int doit_reply_value(struct genl_info *info, u32 node_id,
190 			    u32 error_id)
191 {
192 	struct sk_buff *msg;
193 	struct nlattr *hdr;
194 	const char *error_name;
195 	u32 value;
196 	int ret;
197 
198 	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
199 	if (!msg)
200 		return -ENOMEM;
201 
202 	hdr = genlmsg_iput(msg, info);
203 	if (!hdr) {
204 		nlmsg_free(msg);
205 		return -EMSGSIZE;
206 	}
207 
208 	ret = get_node_error_counter(node_id, error_id,
209 				     &error_name, &value);
210 	if (ret)
211 		return ret;
212 
213 	ret = msg_reply_value(msg, error_id, error_name, value);
214 	if (ret) {
215 		genlmsg_cancel(msg, hdr);
216 		nlmsg_free(msg);
217 		return ret;
218 	}
219 
220 	genlmsg_end(msg, hdr);
221 
222 	return genlmsg_reply(msg, info);
223 }
224 
225 /**
226  * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters
227  * @skb: Netlink message buffer
228  * @cb: Callback context for multi-part dumps
229  *
230  * Iterates over all error counters in a given Node and appends
231  * their attributes (ID, name, value) to the given netlink message buffer.
232  * Uses @cb->ctx to track progress in case the message buffer fills up, allowing
233  * multi-part dump support. On buffer overflow, updates the context to resume
234  * from the last node on the next invocation.
235  *
236  * Return: 0 if all errors fit in @skb, number of bytes added to @skb if
237  *          the buffer filled up (requires multi-part continuation), or
238  *          a negative error code on failure.
239  */
240 int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb,
241 					struct netlink_callback *cb)
242 {
243 	const struct genl_info *info = genl_info_dump(cb);
244 	struct drm_ras_ctx *ctx = (void *)cb->ctx;
245 	struct drm_ras_node *node;
246 	struct nlattr *hdr;
247 	const char *error_name;
248 	u32 node_id, error_id, value;
249 	int ret;
250 
251 	if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID))
252 		return -EINVAL;
253 
254 	node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
255 
256 	node = xa_load(&drm_ras_xa, node_id);
257 	if (!node)
258 		return -ENOENT;
259 
260 	for (error_id = max(node->error_counter_range.first, ctx->restart);
261 	     error_id <= node->error_counter_range.last;
262 	     error_id++) {
263 		ret = get_node_error_counter(node_id, error_id,
264 					     &error_name, &value);
265 		/*
266 		 * For non-contiguous range, driver return -ENOENT as indication
267 		 * to skip this ID when listing all errors.
268 		 */
269 		if (ret == -ENOENT)
270 			continue;
271 		if (ret)
272 			return ret;
273 
274 		hdr = genlmsg_iput(skb, info);
275 
276 		if (!hdr) {
277 			ret = -EMSGSIZE;
278 			break;
279 		}
280 
281 		ret = msg_reply_value(skb, error_id, error_name, value);
282 		if (ret) {
283 			genlmsg_cancel(skb, hdr);
284 			break;
285 		}
286 
287 		genlmsg_end(skb, hdr);
288 	}
289 
290 	if (ret == -EMSGSIZE)
291 		ctx->restart = error_id;
292 
293 	return ret;
294 }
295 
296 /**
297  * drm_ras_nl_get_error_counter_doit() - Query an error counter of an node
298  * @skb: Netlink message buffer
299  * @info: Generic Netlink info containing attributes of the request
300  *
301  * Extracts the node ID and error ID from the netlink attributes and
302  * retrieves the current value of the corresponding error counter. Sends the
303  * result back to the requesting user via the standard Genl reply.
304  *
305  * Return: 0 on success, or negative errno on failure.
306  */
307 int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb,
308 				      struct genl_info *info)
309 {
310 	u32 node_id, error_id;
311 
312 	if (!info->attrs ||
313 	    GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) ||
314 	    GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID))
315 		return -EINVAL;
316 
317 	node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
318 	error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]);
319 
320 	return doit_reply_value(info, node_id, error_id);
321 }
322 
323 /**
324  * drm_ras_nl_clear_error_counter_doit() - Clear an error counter of a node
325  * @skb: Netlink message buffer
326  * @info: Generic Netlink info containing attributes of the request
327  *
328  * Extracts the node ID and error ID from the netlink attributes and
329  * clears the current value.
330  *
331  * Return: 0 on success, or negative errno on failure.
332  */
333 int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb,
334 					struct genl_info *info)
335 {
336 	struct drm_ras_node *node;
337 	u32 node_id, error_id;
338 
339 	if (!info->attrs ||
340 	    GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) ||
341 	    GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID))
342 		return -EINVAL;
343 
344 	node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
345 	error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]);
346 
347 	node = xa_load(&drm_ras_xa, node_id);
348 	if (!node || !node->clear_error_counter)
349 		return -ENOENT;
350 
351 	if (error_id < node->error_counter_range.first ||
352 	    error_id > node->error_counter_range.last)
353 		return -EINVAL;
354 
355 	return node->clear_error_counter(node, error_id);
356 }
357 
358 /**
359  * drm_ras_node_register() - Register a new RAS node
360  * @node: Node structure to register
361  *
362  * Adds the given RAS node to the global node xarray and assigns it
363  * a unique ID. Both @node->name and @node->type must be valid.
364  *
365  * Return: 0 on success, or negative errno on failure:
366  */
367 int drm_ras_node_register(struct drm_ras_node *node)
368 {
369 	if (!node->device_name || !node->node_name)
370 		return -EINVAL;
371 
372 	/* Currently, only Error Counter Endpoints are supported */
373 	if (node->type != DRM_RAS_NODE_TYPE_ERROR_COUNTER)
374 		return -EINVAL;
375 
376 	/* Mandatory entries for Error Counter Node */
377 	if (node->type == DRM_RAS_NODE_TYPE_ERROR_COUNTER &&
378 	    (!node->error_counter_range.last || !node->query_error_counter))
379 		return -EINVAL;
380 
381 	return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL);
382 }
383 EXPORT_SYMBOL(drm_ras_node_register);
384 
385 /**
386  * drm_ras_node_unregister() - Unregister a previously registered node
387  * @node: Node structure to unregister
388  *
389  * Removes the given node from the global node xarray using its ID.
390  */
391 void drm_ras_node_unregister(struct drm_ras_node *node)
392 {
393 	xa_erase(&drm_ras_xa, node->id);
394 }
395 EXPORT_SYMBOL(drm_ras_node_unregister);
396