1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2026 Intel Corporation 4 */ 5 6 #include <linux/bitmap.h> 7 8 #include <drm/drm_managed.h> 9 #include <drm/drm_print.h> 10 #include <drm/drm_ras.h> 11 12 #include "xe_device_types.h" 13 #include "xe_drm_ras.h" 14 15 static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; 16 static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; 17 18 static int hw_query_error_counter(struct xe_drm_ras_counter *info, 19 u32 error_id, const char **name, u32 *val) 20 { 21 if (!info || !info[error_id].name) 22 return -ENOENT; 23 24 *name = info[error_id].name; 25 *val = atomic_read(&info[error_id].counter); 26 27 return 0; 28 } 29 30 static int hw_clear_error_counter(struct xe_drm_ras_counter *info, u32 error_id) 31 { 32 if (!info || !info[error_id].name) 33 return -ENOENT; 34 35 atomic_set(&info[error_id].counter, 0); 36 37 return 0; 38 } 39 40 static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id, 41 const char **name, u32 *val) 42 { 43 struct xe_device *xe = ep->priv; 44 struct xe_drm_ras *ras = &xe->ras; 45 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; 46 47 return hw_query_error_counter(info, error_id, name, val); 48 } 49 50 static int clear_uncorrectable_error_counter(struct drm_ras_node *node, u32 error_id) 51 { 52 struct xe_device *xe = node->priv; 53 struct xe_drm_ras *ras = &xe->ras; 54 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; 55 56 return hw_clear_error_counter(info, error_id); 57 } 58 59 static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id, 60 const char **name, u32 *val) 61 { 62 struct xe_device *xe = ep->priv; 63 struct xe_drm_ras *ras = &xe->ras; 64 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; 65 66 return hw_query_error_counter(info, error_id, name, val); 67 } 68 69 static int clear_correctable_error_counter(struct drm_ras_node *node, u32 error_id) 70 { 71 struct xe_device *xe = node->priv; 72 struct xe_drm_ras *ras = &xe->ras; 73 struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; 74 75 return hw_clear_error_counter(info, error_id); 76 } 77 78 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe) 79 { 80 struct xe_drm_ras_counter *counter; 81 int i; 82 83 counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL); 84 if (!counter) 85 return ERR_PTR(-ENOMEM); 86 87 for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) { 88 if (!error_components[i]) 89 continue; 90 91 counter[i].name = error_components[i]; 92 atomic_set(&counter[i].counter, 0); 93 } 94 95 return counter; 96 } 97 98 static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, 99 const enum drm_xe_ras_error_severity severity) 100 { 101 struct pci_dev *pdev = to_pci_dev(xe->drm.dev); 102 struct xe_drm_ras *ras = &xe->ras; 103 const char *device_name; 104 105 device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d", 106 pci_domain_nr(pdev->bus), pdev->bus->number, 107 PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); 108 109 if (!device_name) 110 return -ENOMEM; 111 112 node->device_name = device_name; 113 node->node_name = error_severity[severity]; 114 node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER; 115 node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; 116 node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1; 117 node->priv = xe; 118 119 ras->info[severity] = allocate_and_copy_counters(xe); 120 if (IS_ERR(ras->info[severity])) 121 return PTR_ERR(ras->info[severity]); 122 123 if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) { 124 node->query_error_counter = query_correctable_error_counter; 125 node->clear_error_counter = clear_correctable_error_counter; 126 } else { 127 node->query_error_counter = query_uncorrectable_error_counter; 128 node->clear_error_counter = clear_uncorrectable_error_counter; 129 } 130 131 return 0; 132 } 133 134 static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity) 135 { 136 struct drm_ras_node *node = &ras->node[severity]; 137 138 kfree(ras->info[severity]); 139 ras->info[severity] = NULL; 140 141 kfree(node->device_name); 142 node->device_name = NULL; 143 } 144 145 static int register_nodes(struct xe_device *xe) 146 { 147 struct xe_drm_ras *ras = &xe->ras; 148 int i; 149 150 for_each_error_severity(i) { 151 struct drm_ras_node *node = &ras->node[i]; 152 int ret; 153 154 ret = assign_node_params(xe, node, i); 155 if (ret) { 156 cleanup_node_param(ras, i); 157 return ret; 158 } 159 160 ret = drm_ras_node_register(node); 161 if (ret) { 162 cleanup_node_param(ras, i); 163 return ret; 164 } 165 } 166 167 return 0; 168 } 169 170 static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg) 171 { 172 struct xe_device *xe = arg; 173 struct xe_drm_ras *ras = &xe->ras; 174 int i; 175 176 for_each_error_severity(i) { 177 struct drm_ras_node *node = &ras->node[i]; 178 179 drm_ras_node_unregister(node); 180 cleanup_node_param(ras, i); 181 } 182 } 183 184 /** 185 * xe_drm_ras_init() - Initialize DRM RAS 186 * @xe: xe device instance 187 * 188 * Allocate and register DRM RAS nodes per device 189 * 190 * Return: 0 on success, negative error code otherwise. 191 */ 192 int xe_drm_ras_init(struct xe_device *xe) 193 { 194 struct xe_drm_ras *ras = &xe->ras; 195 struct drm_ras_node *node; 196 int err; 197 198 node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL); 199 if (!node) 200 return -ENOMEM; 201 202 ras->node = node; 203 204 err = register_nodes(xe); 205 if (err) { 206 drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err)); 207 return err; 208 } 209 210 err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe); 211 if (err) { 212 drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err)); 213 return err; 214 } 215 216 return 0; 217 } 218