// SPDX-License-Identifier: MIT
/*
 * Copyright © 2026 Intel Corporation
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/xarray.h>
#include <net/genetlink.h>

#include <drm/drm_ras.h>

#include "drm_ras_nl.h"

/**
 * DOC: DRM RAS Node Management
 *
 * This module provides the infrastructure to manage RAS (Reliability,
 * Availability, and Serviceability) nodes for DRM drivers. Each
 * DRM driver may register one or more RAS nodes, which represent
 * logical components capable of reporting error counters and other
 * reliability metrics.
 *
 * The nodes are stored in a global xarray `drm_ras_xa` to allow
 * efficient lookup by ID. Nodes can be registered or unregistered
 * dynamically at runtime.
 *
 * A Generic Netlink family `drm_ras` exposes two main operations to
 * userspace:
 *
 * 1. LIST_NODES: Dump all currently registered RAS nodes.
 *    The user receives an array of node IDs, names, and types.
 *
 * 2. GET_ERROR_COUNTER: Get error counters of a given node.
 *    Userspace must provide a Node ID and, optionally, an Error ID.
 *    Returns all counters of the node if only the Node ID is provided,
 *    or the specific error counter when an Error ID is also given.
 *
 * Node registration:
 *
 * - drm_ras_node_register(): Registers a new node and assigns
 *   it a unique ID in the xarray.
 * - drm_ras_node_unregister(): Removes a previously registered
 *   node from the xarray.
 *
 * Node type:
 *
 * - ERROR_COUNTER:
 *   + Currently, only error counters are supported.
 *   + The driver must implement the query_error_counter() callback to provide
 *     the name and the value of the error counter.
 *   + The driver must provide an error_counter_range.last value informing the
 *     last valid error ID.
 *   + The driver can provide an error_counter_range.first value informing the
 *     first valid error ID.
 *   + The error counters in the driver don't need to be contiguous, but the
 *     driver must return -ENOENT from the query_error_counter() callback as
 *     an indication that the ID should be skipped and not listed in the
 *     netlink API.
 *
 * Netlink handlers:
 *
 * - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES
 *   operation, iterating over the xarray.
 * - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER
 *   dumpit operation, fetching all counters from a specific node.
 * - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER
 *   doit operation, fetching a counter value from a specific node.
 */

/* Global registry of RAS nodes; xa_alloc() hands out the unique node IDs */
static DEFINE_XARRAY_ALLOC(drm_ras_xa);

/*
 * The netlink callback context carries dump state across multiple dumpit
 * calls. It overlays cb->ctx, which the netlink core zero-initializes when
 * a dump starts, so a fresh dump always restarts from ID 0.
 */
struct drm_ras_ctx {
	/* Which xarray id to restart the dump from */
	unsigned long restart;
};

/**
 * drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes
 * @skb: Netlink message buffer
 * @cb: Callback context for multi-part dumps
 *
 * Iterates over all registered RAS nodes in the global xarray and appends
 * their attributes (ID, name, type) to the given netlink message buffer.
 * Uses @cb->ctx to track progress in case the message buffer fills up,
 * allowing multi-part dump support. On buffer overflow, updates the context
 * to resume from the last node on the next invocation.
 *
 * Return: 0 if all nodes fit in @skb, number of bytes added to @skb if
 *         the buffer filled up (requires multi-part continuation), or
 *         a negative error code on failure.
95 */ 96 int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb, 97 struct netlink_callback *cb) 98 { 99 const struct genl_info *info = genl_info_dump(cb); 100 struct drm_ras_ctx *ctx = (void *)cb->ctx; 101 struct drm_ras_node *node; 102 struct nlattr *hdr; 103 unsigned long id; 104 int ret; 105 106 xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) { 107 hdr = genlmsg_iput(skb, info); 108 if (!hdr) { 109 ret = -EMSGSIZE; 110 break; 111 } 112 113 ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id); 114 if (ret) { 115 genlmsg_cancel(skb, hdr); 116 break; 117 } 118 119 ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_DEVICE_NAME, 120 node->device_name); 121 if (ret) { 122 genlmsg_cancel(skb, hdr); 123 break; 124 } 125 126 ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_NODE_NAME, 127 node->node_name); 128 if (ret) { 129 genlmsg_cancel(skb, hdr); 130 break; 131 } 132 133 ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE, 134 node->type); 135 if (ret) { 136 genlmsg_cancel(skb, hdr); 137 break; 138 } 139 140 genlmsg_end(skb, hdr); 141 } 142 143 if (ret == -EMSGSIZE) 144 ctx->restart = id; 145 146 return ret; 147 } 148 149 static int get_node_error_counter(u32 node_id, u32 error_id, 150 const char **name, u32 *value) 151 { 152 struct drm_ras_node *node; 153 154 node = xa_load(&drm_ras_xa, node_id); 155 if (!node || !node->query_error_counter) 156 return -ENOENT; 157 158 if (error_id < node->error_counter_range.first || 159 error_id > node->error_counter_range.last) 160 return -EINVAL; 161 162 return node->query_error_counter(node, error_id, name, value); 163 } 164 165 static int msg_reply_value(struct sk_buff *msg, u32 error_id, 166 const char *error_name, u32 value) 167 { 168 int ret; 169 170 ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id); 171 if (ret) 172 return ret; 173 174 ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, 175 error_name); 176 if (ret) 177 return ret; 178 179 return nla_put_u32(msg, 
DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE, 180 value); 181 } 182 183 static int doit_reply_value(struct genl_info *info, u32 node_id, 184 u32 error_id) 185 { 186 struct sk_buff *msg; 187 struct nlattr *hdr; 188 const char *error_name; 189 u32 value; 190 int ret; 191 192 msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 193 if (!msg) 194 return -ENOMEM; 195 196 hdr = genlmsg_iput(msg, info); 197 if (!hdr) { 198 nlmsg_free(msg); 199 return -EMSGSIZE; 200 } 201 202 ret = get_node_error_counter(node_id, error_id, 203 &error_name, &value); 204 if (ret) 205 return ret; 206 207 ret = msg_reply_value(msg, error_id, error_name, value); 208 if (ret) { 209 genlmsg_cancel(msg, hdr); 210 nlmsg_free(msg); 211 return ret; 212 } 213 214 genlmsg_end(msg, hdr); 215 216 return genlmsg_reply(msg, info); 217 } 218 219 /** 220 * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters 221 * @skb: Netlink message buffer 222 * @cb: Callback context for multi-part dumps 223 * 224 * Iterates over all error counters in a given Node and appends 225 * their attributes (ID, name, value) to the given netlink message buffer. 226 * Uses @cb->ctx to track progress in case the message buffer fills up, allowing 227 * multi-part dump support. On buffer overflow, updates the context to resume 228 * from the last node on the next invocation. 229 * 230 * Return: 0 if all errors fit in @skb, number of bytes added to @skb if 231 * the buffer filled up (requires multi-part continuation), or 232 * a negative error code on failure. 
233 */ 234 int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, 235 struct netlink_callback *cb) 236 { 237 const struct genl_info *info = genl_info_dump(cb); 238 struct drm_ras_ctx *ctx = (void *)cb->ctx; 239 struct drm_ras_node *node; 240 struct nlattr *hdr; 241 const char *error_name; 242 u32 node_id, error_id, value; 243 int ret; 244 245 if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID)) 246 return -EINVAL; 247 248 node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); 249 250 node = xa_load(&drm_ras_xa, node_id); 251 if (!node) 252 return -ENOENT; 253 254 for (error_id = max(node->error_counter_range.first, ctx->restart); 255 error_id <= node->error_counter_range.last; 256 error_id++) { 257 ret = get_node_error_counter(node_id, error_id, 258 &error_name, &value); 259 /* 260 * For non-contiguous range, driver return -ENOENT as indication 261 * to skip this ID when listing all errors. 262 */ 263 if (ret == -ENOENT) 264 continue; 265 if (ret) 266 return ret; 267 268 hdr = genlmsg_iput(skb, info); 269 270 if (!hdr) { 271 ret = -EMSGSIZE; 272 break; 273 } 274 275 ret = msg_reply_value(skb, error_id, error_name, value); 276 if (ret) { 277 genlmsg_cancel(skb, hdr); 278 break; 279 } 280 281 genlmsg_end(skb, hdr); 282 } 283 284 if (ret == -EMSGSIZE) 285 ctx->restart = error_id; 286 287 return ret; 288 } 289 290 /** 291 * drm_ras_nl_get_error_counter_doit() - Query an error counter of an node 292 * @skb: Netlink message buffer 293 * @info: Generic Netlink info containing attributes of the request 294 * 295 * Extracts the node ID and error ID from the netlink attributes and 296 * retrieves the current value of the corresponding error counter. Sends the 297 * result back to the requesting user via the standard Genl reply. 298 * 299 * Return: 0 on success, or negative errno on failure. 
300 */ 301 int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb, 302 struct genl_info *info) 303 { 304 u32 node_id, error_id; 305 306 if (!info->attrs || 307 GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || 308 GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID)) 309 return -EINVAL; 310 311 node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); 312 error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); 313 314 return doit_reply_value(info, node_id, error_id); 315 } 316 317 /** 318 * drm_ras_node_register() - Register a new RAS node 319 * @node: Node structure to register 320 * 321 * Adds the given RAS node to the global node xarray and assigns it 322 * a unique ID. Both @node->name and @node->type must be valid. 323 * 324 * Return: 0 on success, or negative errno on failure: 325 */ 326 int drm_ras_node_register(struct drm_ras_node *node) 327 { 328 if (!node->device_name || !node->node_name) 329 return -EINVAL; 330 331 /* Currently, only Error Counter Endpoints are supported */ 332 if (node->type != DRM_RAS_NODE_TYPE_ERROR_COUNTER) 333 return -EINVAL; 334 335 /* Mandatory entries for Error Counter Node */ 336 if (node->type == DRM_RAS_NODE_TYPE_ERROR_COUNTER && 337 (!node->error_counter_range.last || !node->query_error_counter)) 338 return -EINVAL; 339 340 return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL); 341 } 342 EXPORT_SYMBOL(drm_ras_node_register); 343 344 /** 345 * drm_ras_node_unregister() - Unregister a previously registered node 346 * @node: Node structure to unregister 347 * 348 * Removes the given node from the global node xarray using its ID. 349 */ 350 void drm_ras_node_unregister(struct drm_ras_node *node) 351 { 352 xa_erase(&drm_ras_xa, node->id); 353 } 354 EXPORT_SYMBOL(drm_ras_node_unregister); 355