xref: /linux/drivers/gpu/drm/drm_ras.c (revision 13c072b8e91a5ccb5855ca1ba6fe3ea467dbf94d)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2026 Intel Corporation
4  */
5 
6 #include <linux/module.h>
7 #include <linux/kernel.h>
8 #include <linux/netdevice.h>
9 #include <linux/xarray.h>
10 #include <net/genetlink.h>
11 
12 #include <drm/drm_ras.h>
13 
14 #include "drm_ras_nl.h"
15 
16 /**
17  * DOC: DRM RAS Node Management
18  *
19  * This module provides the infrastructure to manage RAS (Reliability,
20  * Availability, and Serviceability) nodes for DRM drivers. Each
21  * DRM driver may register one or more RAS nodes, which represent
22  * logical components capable of reporting error counters and other
23  * reliability metrics.
24  *
25  * The nodes are stored in a global xarray `drm_ras_xa` to allow
26  * efficient lookup by ID. Nodes can be registered or unregistered
27  * dynamically at runtime.
28  *
29  * A Generic Netlink family `drm_ras` exposes two main operations to
30  * userspace:
31  *
32  * 1. LIST_NODES: Dump all currently registered RAS nodes.
33  *    The user receives an array of node IDs, names, and types.
34  *
35  * 2. GET_ERROR_COUNTER: Get error counters of a given node.
 *    Userspace must provide the Node ID and may additionally provide an
 *    Error ID. Returns all counters of the node if only the Node ID is
 *    provided (dump), or the single requested counter if an Error ID is given.
39  *
40  * Node registration:
41  *
42  * - drm_ras_node_register(): Registers a new node and assigns
43  *   it a unique ID in the xarray.
44  * - drm_ras_node_unregister(): Removes a previously registered
45  *   node from the xarray.
46  *
47  * Node type:
48  *
49  * - ERROR_COUNTER:
50  *     + Currently, only error counters are supported.
51  *     + The driver must implement the query_error_counter() callback to provide
52  *       the name and the value of the error counter.
 *     + The driver must provide an error_counter_range.last value informing the
 *       last valid error ID.
 *     + The driver can provide an error_counter_range.first value informing the
 *       first valid error ID.
 *     + The error counter IDs of a driver don't need to be contiguous, but the
 *       driver must return -ENOENT from query_error_counter() as an indication
 *       that the ID should be skipped and not listed in the netlink API.
60  *
61  * Netlink handlers:
62  *
63  * - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES
64  *   operation, iterating over the xarray.
65  * - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER dumpit
66  *   operation, fetching all counters from a specific node.
67  * - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER doit
68  *   operation, fetching a counter value from a specific node.
69  */
70 
/*
 * Global registry of RAS nodes. Entries are added by drm_ras_node_register(),
 * which stores the allocated index in node->id, and looked up by node ID from
 * the netlink handlers.
 */
static DEFINE_XARRAY_ALLOC(drm_ras_xa);
72 
/*
 * The netlink callback context carries dump state across multiple dumpit calls.
 * It is overlaid on cb->ctx of the netlink callback.
 */
struct drm_ras_ctx {
	/*
	 * Where to resume the dump from: an xarray index when listing nodes,
	 * an error ID when dumping the error counters of a node.
	 */
	unsigned long restart;
};
80 
81 /**
82  * drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes
83  * @skb: Netlink message buffer
84  * @cb: Callback context for multi-part dumps
85  *
86  * Iterates over all registered RAS nodes in the global xarray and appends
87  * their attributes (ID, name, type) to the given netlink message buffer.
88  * Uses @cb->ctx to track progress in case the message buffer fills up, allowing
89  * multi-part dump support. On buffer overflow, updates the context to resume
90  * from the last node on the next invocation.
91  *
92  * Return: 0 if all nodes fit in @skb, number of bytes added to @skb if
93  *          the buffer filled up (requires multi-part continuation), or
94  *          a negative error code on failure.
95  */
96 int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb,
97 				 struct netlink_callback *cb)
98 {
99 	const struct genl_info *info = genl_info_dump(cb);
100 	struct drm_ras_ctx *ctx = (void *)cb->ctx;
101 	struct drm_ras_node *node;
102 	struct nlattr *hdr;
103 	unsigned long id;
104 	int ret;
105 
106 	xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) {
107 		hdr = genlmsg_iput(skb, info);
108 		if (!hdr) {
109 			ret = -EMSGSIZE;
110 			break;
111 		}
112 
113 		ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id);
114 		if (ret) {
115 			genlmsg_cancel(skb, hdr);
116 			break;
117 		}
118 
119 		ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_DEVICE_NAME,
120 				     node->device_name);
121 		if (ret) {
122 			genlmsg_cancel(skb, hdr);
123 			break;
124 		}
125 
126 		ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_NODE_NAME,
127 				     node->node_name);
128 		if (ret) {
129 			genlmsg_cancel(skb, hdr);
130 			break;
131 		}
132 
133 		ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE,
134 				  node->type);
135 		if (ret) {
136 			genlmsg_cancel(skb, hdr);
137 			break;
138 		}
139 
140 		genlmsg_end(skb, hdr);
141 	}
142 
143 	if (ret == -EMSGSIZE)
144 		ctx->restart = id;
145 
146 	return ret;
147 }
148 
149 static int get_node_error_counter(u32 node_id, u32 error_id,
150 				  const char **name, u32 *value)
151 {
152 	struct drm_ras_node *node;
153 
154 	node = xa_load(&drm_ras_xa, node_id);
155 	if (!node || !node->query_error_counter)
156 		return -ENOENT;
157 
158 	if (error_id < node->error_counter_range.first ||
159 	    error_id > node->error_counter_range.last)
160 		return -EINVAL;
161 
162 	return node->query_error_counter(node, error_id, name, value);
163 }
164 
165 static int msg_reply_value(struct sk_buff *msg, u32 error_id,
166 			   const char *error_name, u32 value)
167 {
168 	int ret;
169 
170 	ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id);
171 	if (ret)
172 		return ret;
173 
174 	ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME,
175 			     error_name);
176 	if (ret)
177 		return ret;
178 
179 	return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE,
180 			   value);
181 }
182 
183 static int doit_reply_value(struct genl_info *info, u32 node_id,
184 			    u32 error_id)
185 {
186 	struct sk_buff *msg;
187 	struct nlattr *hdr;
188 	const char *error_name;
189 	u32 value;
190 	int ret;
191 
192 	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
193 	if (!msg)
194 		return -ENOMEM;
195 
196 	hdr = genlmsg_iput(msg, info);
197 	if (!hdr) {
198 		nlmsg_free(msg);
199 		return -EMSGSIZE;
200 	}
201 
202 	ret = get_node_error_counter(node_id, error_id,
203 				     &error_name, &value);
204 	if (ret)
205 		return ret;
206 
207 	ret = msg_reply_value(msg, error_id, error_name, value);
208 	if (ret) {
209 		genlmsg_cancel(msg, hdr);
210 		nlmsg_free(msg);
211 		return ret;
212 	}
213 
214 	genlmsg_end(msg, hdr);
215 
216 	return genlmsg_reply(msg, info);
217 }
218 
219 /**
220  * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters
221  * @skb: Netlink message buffer
222  * @cb: Callback context for multi-part dumps
223  *
224  * Iterates over all error counters in a given Node and appends
225  * their attributes (ID, name, value) to the given netlink message buffer.
226  * Uses @cb->ctx to track progress in case the message buffer fills up, allowing
227  * multi-part dump support. On buffer overflow, updates the context to resume
228  * from the last node on the next invocation.
229  *
230  * Return: 0 if all errors fit in @skb, number of bytes added to @skb if
231  *          the buffer filled up (requires multi-part continuation), or
232  *          a negative error code on failure.
233  */
234 int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb,
235 					struct netlink_callback *cb)
236 {
237 	const struct genl_info *info = genl_info_dump(cb);
238 	struct drm_ras_ctx *ctx = (void *)cb->ctx;
239 	struct drm_ras_node *node;
240 	struct nlattr *hdr;
241 	const char *error_name;
242 	u32 node_id, error_id, value;
243 	int ret;
244 
245 	if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID))
246 		return -EINVAL;
247 
248 	node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
249 
250 	node = xa_load(&drm_ras_xa, node_id);
251 	if (!node)
252 		return -ENOENT;
253 
254 	for (error_id = max(node->error_counter_range.first, ctx->restart);
255 	     error_id <= node->error_counter_range.last;
256 	     error_id++) {
257 		ret = get_node_error_counter(node_id, error_id,
258 					     &error_name, &value);
259 		/*
260 		 * For non-contiguous range, driver return -ENOENT as indication
261 		 * to skip this ID when listing all errors.
262 		 */
263 		if (ret == -ENOENT)
264 			continue;
265 		if (ret)
266 			return ret;
267 
268 		hdr = genlmsg_iput(skb, info);
269 
270 		if (!hdr) {
271 			ret = -EMSGSIZE;
272 			break;
273 		}
274 
275 		ret = msg_reply_value(skb, error_id, error_name, value);
276 		if (ret) {
277 			genlmsg_cancel(skb, hdr);
278 			break;
279 		}
280 
281 		genlmsg_end(skb, hdr);
282 	}
283 
284 	if (ret == -EMSGSIZE)
285 		ctx->restart = error_id;
286 
287 	return ret;
288 }
289 
290 /**
291  * drm_ras_nl_get_error_counter_doit() - Query an error counter of an node
292  * @skb: Netlink message buffer
293  * @info: Generic Netlink info containing attributes of the request
294  *
295  * Extracts the node ID and error ID from the netlink attributes and
296  * retrieves the current value of the corresponding error counter. Sends the
297  * result back to the requesting user via the standard Genl reply.
298  *
299  * Return: 0 on success, or negative errno on failure.
300  */
301 int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb,
302 				      struct genl_info *info)
303 {
304 	u32 node_id, error_id;
305 
306 	if (!info->attrs ||
307 	    GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) ||
308 	    GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID))
309 		return -EINVAL;
310 
311 	node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
312 	error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]);
313 
314 	return doit_reply_value(info, node_id, error_id);
315 }
316 
317 /**
318  * drm_ras_node_register() - Register a new RAS node
319  * @node: Node structure to register
320  *
321  * Adds the given RAS node to the global node xarray and assigns it
322  * a unique ID. Both @node->name and @node->type must be valid.
323  *
324  * Return: 0 on success, or negative errno on failure:
325  */
326 int drm_ras_node_register(struct drm_ras_node *node)
327 {
328 	if (!node->device_name || !node->node_name)
329 		return -EINVAL;
330 
331 	/* Currently, only Error Counter Endpoints are supported */
332 	if (node->type != DRM_RAS_NODE_TYPE_ERROR_COUNTER)
333 		return -EINVAL;
334 
335 	/* Mandatory entries for Error Counter Node */
336 	if (node->type == DRM_RAS_NODE_TYPE_ERROR_COUNTER &&
337 	    (!node->error_counter_range.last || !node->query_error_counter))
338 		return -EINVAL;
339 
340 	return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL);
341 }
342 EXPORT_SYMBOL(drm_ras_node_register);
343 
344 /**
345  * drm_ras_node_unregister() - Unregister a previously registered node
346  * @node: Node structure to unregister
347  *
348  * Removes the given node from the global node xarray using its ID.
349  */
350 void drm_ras_node_unregister(struct drm_ras_node *node)
351 {
352 	xa_erase(&drm_ras_xa, node->id);
353 }
354 EXPORT_SYMBOL(drm_ras_node_unregister);
355