1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2026 Intel Corporation 4 */ 5 6 #include <linux/module.h> 7 #include <linux/kernel.h> 8 #include <linux/netdevice.h> 9 #include <linux/xarray.h> 10 #include <net/genetlink.h> 11 12 #include <drm/drm_ras.h> 13 14 #include "drm_ras_nl.h" 15 16 /** 17 * DOC: DRM RAS Node Management 18 * 19 * This module provides the infrastructure to manage RAS (Reliability, 20 * Availability, and Serviceability) nodes for DRM drivers. Each 21 * DRM driver may register one or more RAS nodes, which represent 22 * logical components capable of reporting error counters and other 23 * reliability metrics. 24 * 25 * The nodes are stored in a global xarray `drm_ras_xa` to allow 26 * efficient lookup by ID. Nodes can be registered or unregistered 27 * dynamically at runtime. 28 * 29 * A Generic Netlink family `drm_ras` exposes the below operations to 30 * userspace: 31 * 32 * 1. LIST_NODES: Dump all currently registered RAS nodes. 33 * The user receives an array of node IDs, names, and types. 34 * 35 * 2. GET_ERROR_COUNTER: Get error counters of a given node. 36 * Userspace must provide Node ID, Error ID (Optional for specific counter). 37 * Returns all counters of a node if only Node ID is provided or specific 38 * error counters. 39 * 40 * 3. CLEAR_ERROR_COUNTER: Clear error counter of a given node. 41 * Userspace must provide Node ID, Error ID. 42 * Clears specific error counter of a node if supported. 43 * 44 * Node registration: 45 * 46 * - drm_ras_node_register(): Registers a new node and assigns 47 * it a unique ID in the xarray. 48 * - drm_ras_node_unregister(): Removes a previously registered 49 * node from the xarray. 50 * 51 * Node type: 52 * 53 * - ERROR_COUNTER: 54 * + Currently, only error counters are supported. 55 * + The driver must implement the query_error_counter() callback to provide 56 * the name and the value of the error counter. 57 * + The driver must provide a error_counter_range.last value informing the 58 * last valid error ID. 59 * + The driver can provide a error_counter_range.first value informing the 60 * first valid error ID. 61 * + The error counters in the driver doesn't need to be contiguous, but the 62 * driver must return -ENOENT to the query_error_counter as an indication 63 * that the ID should be skipped and not listed in the netlink API. 64 * 65 * Netlink handlers: 66 * 67 * - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES 68 * operation, iterating over the xarray. 69 * - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER dumpit 70 * operation, fetching all counters from a specific node. 71 * - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER doit 72 * operation, fetching a counter value from a specific node. 73 * - drm_ras_nl_clear_error_counter_doit(): Implements the CLEAR_ERROR_COUNTER doit 74 * operation, clearing a counter value from a specific node. 75 */ 76 77 static DEFINE_XARRAY_ALLOC(drm_ras_xa); 78 79 /* 80 * The netlink callback context carries dump state across multiple dumpit calls 81 */ 82 struct drm_ras_ctx { 83 /* Which xarray id to restart the dump from */ 84 unsigned long restart; 85 }; 86 87 /** 88 * drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes 89 * @skb: Netlink message buffer 90 * @cb: Callback context for multi-part dumps 91 * 92 * Iterates over all registered RAS nodes in the global xarray and appends 93 * their attributes (ID, name, type) to the given netlink message buffer. 94 * Uses @cb->ctx to track progress in case the message buffer fills up, allowing 95 * multi-part dump support. On buffer overflow, updates the context to resume 96 * from the last node on the next invocation. 97 * 98 * Return: 0 if all nodes fit in @skb, number of bytes added to @skb if 99 * the buffer filled up (requires multi-part continuation), or 100 * a negative error code on failure. 101 */ 102 int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb, 103 struct netlink_callback *cb) 104 { 105 const struct genl_info *info = genl_info_dump(cb); 106 struct drm_ras_ctx *ctx = (void *)cb->ctx; 107 struct drm_ras_node *node; 108 struct nlattr *hdr; 109 unsigned long id; 110 int ret; 111 112 xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) { 113 hdr = genlmsg_iput(skb, info); 114 if (!hdr) { 115 ret = -EMSGSIZE; 116 break; 117 } 118 119 ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id); 120 if (ret) { 121 genlmsg_cancel(skb, hdr); 122 break; 123 } 124 125 ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_DEVICE_NAME, 126 node->device_name); 127 if (ret) { 128 genlmsg_cancel(skb, hdr); 129 break; 130 } 131 132 ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_NODE_NAME, 133 node->node_name); 134 if (ret) { 135 genlmsg_cancel(skb, hdr); 136 break; 137 } 138 139 ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE, 140 node->type); 141 if (ret) { 142 genlmsg_cancel(skb, hdr); 143 break; 144 } 145 146 genlmsg_end(skb, hdr); 147 } 148 149 if (ret == -EMSGSIZE) 150 ctx->restart = id; 151 152 return ret; 153 } 154 155 static int get_node_error_counter(u32 node_id, u32 error_id, 156 const char **name, u32 *value) 157 { 158 struct drm_ras_node *node; 159 160 node = xa_load(&drm_ras_xa, node_id); 161 if (!node || !node->query_error_counter) 162 return -ENOENT; 163 164 if (error_id < node->error_counter_range.first || 165 error_id > node->error_counter_range.last) 166 return -EINVAL; 167 168 return node->query_error_counter(node, error_id, name, value); 169 } 170 171 static int msg_reply_value(struct sk_buff *msg, u32 error_id, 172 const char *error_name, u32 value) 173 { 174 int ret; 175 176 ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id); 177 if (ret) 178 return ret; 179 180 ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, 181 error_name); 182 if (ret) 183 return ret; 184 185 return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE, 186 value); 187 } 188 189 static int doit_reply_value(struct genl_info *info, u32 node_id, 190 u32 error_id) 191 { 192 struct sk_buff *msg; 193 struct nlattr *hdr; 194 const char *error_name; 195 u32 value; 196 int ret; 197 198 msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 199 if (!msg) 200 return -ENOMEM; 201 202 hdr = genlmsg_iput(msg, info); 203 if (!hdr) { 204 nlmsg_free(msg); 205 return -EMSGSIZE; 206 } 207 208 ret = get_node_error_counter(node_id, error_id, 209 &error_name, &value); 210 if (ret) 211 return ret; 212 213 ret = msg_reply_value(msg, error_id, error_name, value); 214 if (ret) { 215 genlmsg_cancel(msg, hdr); 216 nlmsg_free(msg); 217 return ret; 218 } 219 220 genlmsg_end(msg, hdr); 221 222 return genlmsg_reply(msg, info); 223 } 224 225 /** 226 * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters 227 * @skb: Netlink message buffer 228 * @cb: Callback context for multi-part dumps 229 * 230 * Iterates over all error counters in a given Node and appends 231 * their attributes (ID, name, value) to the given netlink message buffer. 232 * Uses @cb->ctx to track progress in case the message buffer fills up, allowing 233 * multi-part dump support. On buffer overflow, updates the context to resume 234 * from the last node on the next invocation. 235 * 236 * Return: 0 if all errors fit in @skb, number of bytes added to @skb if 237 * the buffer filled up (requires multi-part continuation), or 238 * a negative error code on failure. 239 */ 240 int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, 241 struct netlink_callback *cb) 242 { 243 const struct genl_info *info = genl_info_dump(cb); 244 struct drm_ras_ctx *ctx = (void *)cb->ctx; 245 struct drm_ras_node *node; 246 struct nlattr *hdr; 247 const char *error_name; 248 u32 node_id, error_id, value; 249 int ret; 250 251 if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID)) 252 return -EINVAL; 253 254 node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); 255 256 node = xa_load(&drm_ras_xa, node_id); 257 if (!node) 258 return -ENOENT; 259 260 for (error_id = max(node->error_counter_range.first, ctx->restart); 261 error_id <= node->error_counter_range.last; 262 error_id++) { 263 ret = get_node_error_counter(node_id, error_id, 264 &error_name, &value); 265 /* 266 * For non-contiguous range, driver return -ENOENT as indication 267 * to skip this ID when listing all errors. 268 */ 269 if (ret == -ENOENT) 270 continue; 271 if (ret) 272 return ret; 273 274 hdr = genlmsg_iput(skb, info); 275 276 if (!hdr) { 277 ret = -EMSGSIZE; 278 break; 279 } 280 281 ret = msg_reply_value(skb, error_id, error_name, value); 282 if (ret) { 283 genlmsg_cancel(skb, hdr); 284 break; 285 } 286 287 genlmsg_end(skb, hdr); 288 } 289 290 if (ret == -EMSGSIZE) 291 ctx->restart = error_id; 292 293 return ret; 294 } 295 296 /** 297 * drm_ras_nl_get_error_counter_doit() - Query an error counter of an node 298 * @skb: Netlink message buffer 299 * @info: Generic Netlink info containing attributes of the request 300 * 301 * Extracts the node ID and error ID from the netlink attributes and 302 * retrieves the current value of the corresponding error counter. Sends the 303 * result back to the requesting user via the standard Genl reply. 304 * 305 * Return: 0 on success, or negative errno on failure. 306 */ 307 int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb, 308 struct genl_info *info) 309 { 310 u32 node_id, error_id; 311 312 if (!info->attrs || 313 GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || 314 GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID)) 315 return -EINVAL; 316 317 node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); 318 error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); 319 320 return doit_reply_value(info, node_id, error_id); 321 } 322 323 /** 324 * drm_ras_nl_clear_error_counter_doit() - Clear an error counter of a node 325 * @skb: Netlink message buffer 326 * @info: Generic Netlink info containing attributes of the request 327 * 328 * Extracts the node ID and error ID from the netlink attributes and 329 * clears the current value. 330 * 331 * Return: 0 on success, or negative errno on failure. 332 */ 333 int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb, 334 struct genl_info *info) 335 { 336 struct drm_ras_node *node; 337 u32 node_id, error_id; 338 339 if (!info->attrs || 340 GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || 341 GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID)) 342 return -EINVAL; 343 344 node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); 345 error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); 346 347 node = xa_load(&drm_ras_xa, node_id); 348 if (!node || !node->clear_error_counter) 349 return -ENOENT; 350 351 if (error_id < node->error_counter_range.first || 352 error_id > node->error_counter_range.last) 353 return -EINVAL; 354 355 return node->clear_error_counter(node, error_id); 356 } 357 358 /** 359 * drm_ras_node_register() - Register a new RAS node 360 * @node: Node structure to register 361 * 362 * Adds the given RAS node to the global node xarray and assigns it 363 * a unique ID. Both @node->name and @node->type must be valid. 364 * 365 * Return: 0 on success, or negative errno on failure: 366 */ 367 int drm_ras_node_register(struct drm_ras_node *node) 368 { 369 if (!node->device_name || !node->node_name) 370 return -EINVAL; 371 372 /* Currently, only Error Counter Endpoints are supported */ 373 if (node->type != DRM_RAS_NODE_TYPE_ERROR_COUNTER) 374 return -EINVAL; 375 376 /* Mandatory entries for Error Counter Node */ 377 if (node->type == DRM_RAS_NODE_TYPE_ERROR_COUNTER && 378 (!node->error_counter_range.last || !node->query_error_counter)) 379 return -EINVAL; 380 381 return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL); 382 } 383 EXPORT_SYMBOL(drm_ras_node_register); 384 385 /** 386 * drm_ras_node_unregister() - Unregister a previously registered node 387 * @node: Node structure to unregister 388 * 389 * Removes the given node from the global node xarray using its ID. 390 */ 391 void drm_ras_node_unregister(struct drm_ras_node *node) 392 { 393 xa_erase(&drm_ras_xa, node->id); 394 } 395 EXPORT_SYMBOL(drm_ras_node_unregister); 396